[ { "title": "A Baseline for Any Order Gradient Estimation in Stochastic Computation Graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3768", "id": "3768", "author_site": "Jingkai Mao, Jakob Foerster, Tim Rockt\u00e4schel, Maruan Al-Shedivat, Gregory Farquhar, Shimon Whiteson", "author": "Jingkai Mao; Jakob Foerster; Tim Rockt\u00e4schel; Maruan Al-Shedivat; Gregory Farquhar; Shimon Whiteson", "abstract": "By enabling correct differentiation in Stochastic Computation Graphs (SCGs), the infinitely differentiable Monte-Carlo estimator (DiCE) can generate correct estimates for the higher order gradients that arise in, e.g., multi-agent reinforcement learning and meta-learning. However, the baseline term in DiCE that serves as a control variate for reducing variance applies only to first order gradient estimation, limiting the utility of higher-order gradient estimates. To improve the sample efficiency of DiCE, we propose a new baseline term for higher order gradient estimation. This term may be easily included in the objective, and produces unbiased variance-reduced estimators under (automatic) differentiation, without affecting the estimate of the objective itself or of the first order gradient estimate. It reuses the same baseline function (e.g., the state-value function in reinforcement learning) already used for the first order baseline. We provide theoretical analysis and numerical evaluations of this new baseline, which demonstrate that it can dramatically reduce the variance of DiCE\u2019s second order gradient estimators and also show empirically that it reduces the variance of third and fourth order gradients. This computational tool can be easily used to estimate higher order gradients with unprecedented efficiency and simplicity wherever automatic differentiation is utilised, and it has the potential to unlock applications of higher order gradients in reinforcement learning and meta-learning.", "bibtex": "@InProceedings{pmlr-v97-mao19a,\n title = \t {A Baseline for Any Order Gradient Estimation in Stochastic Computation Graphs},\n author = {Mao, Jingkai and Foerster, Jakob and Rockt{\\\"a}schel, Tim and Al-Shedivat, Maruan and Farquhar, Gregory and Whiteson, Shimon},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4343--4351},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mao19a/mao19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mao19a.html},\n abstract = \t {By enabling correct differentiation in Stochastic Computation Graphs (SCGs), the infinitely differentiable Monte-Carlo estimator (DiCE) can generate correct estimates for the higher order gradients that arise in, e.g., multi-agent reinforcement learning and meta-learning. However, the baseline term in DiCE that serves as a control variate for reducing variance applies only to first order gradient estimation, limiting the utility of higher-order gradient estimates. To improve the sample efficiency of DiCE, we propose a new baseline term for higher order gradient estimation. This term may be easily included in the objective, and produces unbiased variance-reduced estimators under (automatic) differentiation, without affecting the estimate of the objective itself or of the first order gradient estimate. 
It reuses the same baseline function (e.g., the state-value function in reinforcement learning) already used for the first order baseline. We provide theoretical analysis and numerical evaluations of this new baseline, which demonstrate that it can dramatically reduce the variance of DiCE\u2019s second order gradient estimators and also show empirically that it reduces the variance of third and fourth order gradients. This computational tool can be easily used to estimate higher order gradients with unprecedented efficiency and simplicity wherever automatic differentiation is utilised, and it has the potential to unlock applications of higher order gradients in reinforcement learning and meta-learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/mao19a/mao19a.pdf", "supp": "", "pdf_size": 671881, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15108761383450828268&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Man AHL, London, UK + University of Oxford; Department of Computer Science, University of Oxford, Oxford, UK; Department of Computer Science, University College London, London, UK; School of Computer Science, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA; Department of Computer Science, University of Oxford, Oxford, UK; Department of Computer Science, University of Oxford, Oxford, UK", "aff_domain": "man.com;fb.com; ; ; ; ", "email": "man.com;fb.com; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/mao19a.html", "aff_unique_index": "0+1;1;2;3;1;1", "aff_unique_norm": "Man AHL;University of Oxford;University College London;Carnegie Mellon University", "aff_unique_dep": ";;Department of Computer Science;School of Computer Science", "aff_unique_url": ";https://www.ox.ac.uk;https://www.ucl.ac.uk;https://www.cmu.edu", "aff_unique_abbr": ";Oxford;UCL;CMU", "aff_campus_unique_index": ";1;2;3;1;1", "aff_campus_unique": ";Oxford;London;Pittsburgh", "aff_country_unique_index": "0+0;0;0;1;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "A Better k-means++ Algorithm via Local Search", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4015", "id": "4015", "author_site": "Silvio Lattanzi, Christian Sohler", "author": "Silvio Lattanzi; Christian Sohler", "abstract": "In this paper, we develop a new variant of k-means++ seeding that in expectation achieves a constant approximation guarantee. We obtain this result by a simple combination of k-means++ sampling with a local search strategy. We evaluate our algorithm empirically and show that it also improves the quality of a solution in practice.", "bibtex": "@InProceedings{pmlr-v97-lattanzi19a,\n title = \t {A Better k-means++ Algorithm via Local Search},\n author = {Lattanzi, Silvio and Sohler, Christian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3662--3671},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lattanzi19a/lattanzi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lattanzi19a.html},\n abstract = \t {In this paper, we develop a new variant of k-means++ seeding that in expectation achieves a constant approximation guarantee. 
We obtain this result by a simple combination of k-means++ sampling with a local search strategy. We evaluate our algorithm empirically and show that it also improves the quality of a solution in practice.}\n}", "pdf": "http://proceedings.mlr.press/v97/lattanzi19a/lattanzi19a.pdf", "supp": "", "pdf_size": 597809, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12711402331922142979&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Google Research, Zurich, ZH, Switzerland; Google Research, Zurich, ZH, Switzerland", "aff_domain": "google.com;google.com", "email": "google.com;google.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/lattanzi19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Res.", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Zurich", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "A Block Coordinate Descent Proximal Method for Simultaneous Filtering and Parameter Estimation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4187", "id": "4187", "author_site": "Ramin Raziperchikolaei, Harish Bhat", "author": "Ramin Raziperchikolaei; Harish Bhat", "abstract": "We propose and analyze a block coordinate descent proximal algorithm (BCD-prox) for simultaneous filtering and parameter estimation of ODE models. As we show on ODE systems with up to d=40 dimensions, as compared to state-of-the-art methods, BCD-prox exhibits increased robustness (to noise, parameter initialization, and hyperparameters), decreased training times, and improved accuracy of both filtered states and estimated parameters. We show how BCD-prox can be used with multistep numerical discretizations, and we establish convergence of BCD-prox under hypotheses that include real systems of interest.", "bibtex": "@InProceedings{pmlr-v97-raziperchikolaei19a,\n title = \t {A Block Coordinate Descent Proximal Method for Simultaneous Filtering and Parameter Estimation},\n author = {Raziperchikolaei, Ramin and Bhat, Harish},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5380--5388},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/raziperchikolaei19a/raziperchikolaei19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/raziperchikolaei19a.html},\n abstract = \t {We propose and analyze a block coordinate descent proximal algorithm (BCD-prox) for simultaneous filtering and parameter estimation of ODE models. As we show on ODE systems with up to d=40 dimensions, as compared to state-of-the-art methods, BCD-prox exhibits increased robustness (to noise, parameter initialization, and hyperparameters), decreased training times, and improved accuracy of both filtered states and estimated parameters. 
We show how BCD-prox can be used with multistep numerical discretizations, and we establish convergence of BCD-prox under hypotheses that include real systems of interest.}\n}", "pdf": "http://proceedings.mlr.press/v97/raziperchikolaei19a/raziperchikolaei19a.pdf", "supp": "", "pdf_size": 582488, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5958729165149647284&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Rakuten Institute of Technology, San Mateo, CA, USA+Department of Computer Science, University of California, Merced, USA; Department of Mathematics, University of Utah, USA+Department of Applied Mathematics, University of California, Merced, USA", "aff_domain": "rakuten.com;ucmerced.edu", "email": "rakuten.com;ucmerced.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/raziperchikolaei19a.html", "aff_unique_index": "0+1;2+1", "aff_unique_norm": "Rakuten Institute of Technology;University of California, Merced;University of Utah", "aff_unique_dep": ";Department of Computer Science;Department of Mathematics", "aff_unique_url": "https://www.rakuten-research.com/en/technology/;https://www.ucmerced.edu;https://www.utah.edu", "aff_unique_abbr": "RIT;UC Merced;Utah", "aff_campus_unique_index": "0+1;1", "aff_campus_unique": "San Mateo;Merced;", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "United States" }, { "title": "A Composite Randomized Incremental Gradient Method", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3875", "id": "3875", "author_site": "Junyu Zhang, Lin Xiao", "author": "Junyu Zhang; Lin Xiao", "abstract": "We consider the problem of minimizing the composition of a smooth function (which can be nonconvex) and a smooth vector mapping, where both of them can be express as the average of a large number of components. We propose a composite randomized incremental gradient method by extending the SAGA framework. The gradient sample complexity of our method matches that of several recently developed methods based on SVRG in the general case. However, for structured problems where linear convergence rates can be obtained, our method can be much better for ill-conditioned problems. In addition, when the finite-sum structure only appear for the inner mapping, the sample complexity of our method is the same as that of SAGA for minimizing finite sum of smooth nonconvex functions, despite the additional outer composition and the stochastic composite gradients being biased in our case.", "bibtex": "@InProceedings{pmlr-v97-zhang19n,\n title = \t {A Composite Randomized Incremental Gradient Method},\n author = {Zhang, Junyu and Xiao, Lin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7454--7462},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19n/zhang19n.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19n.html},\n abstract = \t {We consider the problem of minimizing the composition of a smooth function (which can be nonconvex) and a smooth vector mapping, where both of them can be express as the average of a large number of components. We propose a composite randomized incremental gradient method by extending the SAGA framework. 
The gradient sample complexity of our method matches that of several recently developed methods based on SVRG in the general case. However, for structured problems where linear convergence rates can be obtained, our method can be much better for ill-conditioned problems. In addition, when the finite-sum structure only appear for the inner mapping, the sample complexity of our method is the same as that of SAGA for minimizing finite sum of smooth nonconvex functions, despite the additional outer composition and the stochastic composite gradients being biased in our case.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19n/zhang19n.pdf", "supp": "", "pdf_size": 309131, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13000648400562607687&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Industrial and Systems Engineering, University of Minnesota, Minneapolis, Minnesota, USA; Microsoft Research, Redmond, Washington, USA", "aff_domain": "umn.edu;microsoft.com", "email": "umn.edu;microsoft.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/zhang19n.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of Minnesota;Microsoft", "aff_unique_dep": "Department of Industrial and Systems Engineering;Microsoft Research", "aff_unique_url": "https://www.umn.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UMN;MSR", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Minneapolis;Redmond", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Conditional-Gradient-Based Augmented Lagrangian Framework", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4129", "id": "4129", "author_site": "Alp Yurtsever, Olivier Fercoq, Volkan Cevher", "author": "Alp Yurtsever; Olivier Fercoq; Volkan Cevher", "abstract": "This paper considers a generic convex minimization template with affine constraints over a compact domain, which covers key semidefinite programming applications. The existing conditional gradient methods either do not apply to our template or are too slow in practice. To this end, we propose a new conditional gradient method, based on a unified treatment of smoothing and augmented Lagrangian frameworks. The proposed method maintains favorable properties of the classical conditional gradient method, such as cheap linear minimization oracle calls and sparse representation of the decision variable. We prove $O(1/\\sqrt{k})$ convergence rate for our method in the objective residual and the feasibility gap. 
This rate is essentially the same as the state of the art CG-type methods for our problem template, but the proposed method is arguably superior in practice compared to existing methods in various applications.", "bibtex": "@InProceedings{pmlr-v97-yurtsever19a,\n title = \t {A Conditional-Gradient-Based Augmented Lagrangian Framework},\n author = {Yurtsever, Alp and Fercoq, Olivier and Cevher, Volkan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7272--7281},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yurtsever19a/yurtsever19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yurtsever19a.html},\n abstract = \t {This paper considers a generic convex minimization template with affine constraints over a compact domain, which covers key semidefinite programming applications. The existing conditional gradient methods either do not apply to our template or are too slow in practice. To this end, we propose a new conditional gradient method, based on a unified treatment of smoothing and augmented Lagrangian frameworks. The proposed method maintains favorable properties of the classical conditional gradient method, such as cheap linear minimization oracle calls and sparse representation of the decision variable. We prove $O(1/\\sqrt{k})$ convergence rate for our method in the objective residual and the feasibility gap. This rate is essentially the same as the state of the art CG-type methods for our problem template, but the proposed method is arguably superior in practice compared to existing methods in various applications.}\n}", "pdf": "http://proceedings.mlr.press/v97/yurtsever19a/yurtsever19a.pdf", "supp": "", "pdf_size": 1291101, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8799076852937453964&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "LIONS, Ecole Polytechnique F\u00e9d\u00e9rale de Lausanne, Switzerland; LTCI, T\u00e9l\u00e9com ParisTech, Universit\u00e9 Paris-Saclay, France; LIONS, Ecole Polytechnique F\u00e9d\u00e9rale de Lausanne, Switzerland", "aff_domain": "epfl.ch; ; ", "email": "epfl.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/yurtsever19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "EPFL;T\u00e9l\u00e9com ParisTech", "aff_unique_dep": "LIONS;LTCI", "aff_unique_url": "https://www.epfl.ch;https://www.telecom-paris.fr", "aff_unique_abbr": "EPFL;T\u00e9l\u00e9com ParisTech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;France" }, { "title": "A Contrastive Divergence for Combining Variational Inference and MCMC", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3854", "id": "3854", "author_site": "Francisco Ruiz, Michalis Titsias", "author": "Francisco Ruiz; Michalis Titsias", "abstract": "We develop a method to combine Markov chain Monte Carlo (MCMC) and variational inference (VI), leveraging the advantages of both inference approaches. Specifically, we improve the variational distribution by running a few MCMC steps. 
To make inference tractable, we introduce the variational contrastive divergence (VCD), a new divergence that replaces the standard Kullback-Leibler (KL) divergence used in VI. The VCD captures a notion of discrepancy between the initial variational distribution and its improved version (obtained after running the MCMC steps), and it converges asymptotically to the symmetrized KL divergence between the variational distribution and the posterior of interest. The VCD objective can be optimized efficiently with respect to the variational parameters via stochastic optimization. We show experimentally that optimizing the VCD leads to better predictive performance on two latent variable models: logistic matrix factorization and variational autoencoders (VAEs).", "bibtex": "@InProceedings{pmlr-v97-ruiz19a,\n title = \t {A Contrastive Divergence for Combining Variational Inference and {MCMC}},\n author = {Ruiz, Francisco and Titsias, Michalis},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5537--5545},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ruiz19a/ruiz19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ruiz19a.html},\n abstract = \t {We develop a method to combine Markov chain Monte Carlo (MCMC) and variational inference (VI), leveraging the advantages of both inference approaches. Specifically, we improve the variational distribution by running a few MCMC steps. To make inference tractable, we introduce the variational contrastive divergence (VCD), a new divergence that replaces the standard Kullback-Leibler (KL) divergence used in VI. The VCD captures a notion of discrepancy between the initial variational distribution and its improved version (obtained after running the MCMC steps), and it converges asymptotically to the symmetrized KL divergence between the variational distribution and the posterior of interest. The VCD objective can be optimized efficiently with respect to the variational parameters via stochastic optimization. 
We show experimentally that optimizing the VCD leads to better predictive performance on two latent variable models: logistic matrix factorization and variational autoencoders (VAEs).}\n}", "pdf": "http://proceedings.mlr.press/v97/ruiz19a/ruiz19a.pdf", "supp": "", "pdf_size": 468043, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10765853948406678619&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "University of Cambridge, Cambridge, UK + Columbia University, New York, USA; DeepMind, London, UK", "aff_domain": "columbia.edu; ", "email": "columbia.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/ruiz19a.html", "aff_unique_index": "0+1;2", "aff_unique_norm": "University of Cambridge;Columbia University;DeepMind", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cam.ac.uk;https://www.columbia.edu;https://deepmind.com", "aff_unique_abbr": "Cambridge;Columbia;DeepMind", "aff_campus_unique_index": "0+1;2", "aff_campus_unique": "Cambridge;New York;London", "aff_country_unique_index": "0+1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "A Convergence Theory for Deep Learning via Over-Parameterization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4040", "id": "4040", "author_site": "Zeyuan Allen-Zhu, Yuanzhi Li, Zhao Song", "author": "Zeyuan Allen-Zhu; Yuanzhi Li; Zhao Song", "abstract": "Deep neural networks (DNNs) have demonstrated dominating performance in many fields; since AlexNet, networks used in practice are going wider and deeper. On the theoretical side, a long line of works have been focusing on why we can train neural networks when there is only one hidden layer. The theory of multi-layer networks remains unsettled. In this work, we prove simple algorithms such as stochastic gradient descent (SGD) can find Global Minima on the training objective of DNNs in Polynomial Time. We only make two assumptions: the inputs do not degenerate and the network is over-parameterized. The latter means the number of hidden neurons is sufficiently large: polynomial in L, the number of DNN layers and in n, the number of training samples. As concrete examples, starting from randomly initialized weights, we show that SGD attains 100% training accuracy in classification tasks, or minimizes regression loss in linear convergence speed $\\epsilon \\propto e^{-T}$, with running time polynomial in n and L. Our theory applies to the widely-used but non-smooth ReLU activation, and to any smooth and possibly non-convex loss functions. 
In terms of network architectures, our theory at least applies to fully-connected neural networks, convolutional neural networks (CNN), and residual neural networks (ResNet).", "bibtex": "@InProceedings{pmlr-v97-allen-zhu19a,\n title = \t {A Convergence Theory for Deep Learning via Over-Parameterization},\n author = {Allen-Zhu, Zeyuan and Li, Yuanzhi and Song, Zhao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {242--252},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/allen-zhu19a/allen-zhu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/allen-zhu19a.html},\n abstract = \t {Deep neural networks (DNNs) have demonstrated dominating performance in many fields; since AlexNet, networks used in practice are going wider and deeper. On the theoretical side, a long line of works have been focusing on why we can train neural networks when there is only one hidden layer. The theory of multi-layer networks remains unsettled. In this work, we prove simple algorithms such as stochastic gradient descent (SGD) can find Global Minima on the training objective of DNNs in Polynomial Time. We only make two assumptions: the inputs do not degenerate and the network is over-parameterized. The latter means the number of hidden neurons is sufficiently large: polynomial in L, the number of DNN layers and in n, the number of training samples. As concrete examples, starting from randomly initialized weights, we show that SGD attains 100% training accuracy in classification tasks, or minimizes regression loss in linear convergence speed $\\epsilon \\propto e^{-T}$, with running time polynomial in n and L. Our theory applies to the widely-used but non-smooth ReLU activation, and to any smooth and possibly non-convex loss functions. 
In terms of network architectures, our theory at least applies to fully-connected neural networks, convolutional neural networks (CNN), and residual neural networks (ResNet).}\n}", "pdf": "http://proceedings.mlr.press/v97/allen-zhu19a/allen-zhu19a.pdf", "supp": "", "pdf_size": 1240689, "gs_citation": 1811, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8694714211834483408&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Microsoft Research AI+Stanford University+Princeton University+UT-Austin+University Washington+Harvard University; Stanford University; UT-Austin", "aff_domain": "csail.mit.edu;stanford.edu;utexas.edu", "email": "csail.mit.edu;stanford.edu;utexas.edu", "github": "", "project": "https://arxiv.org/abs/1811.03962", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/allen-zhu19a.html", "aff_unique_index": "0+1+2+3+4+5;1;3", "aff_unique_norm": "Microsoft;Stanford University;Princeton University;University of Texas at Austin;University of Washington;Harvard University", "aff_unique_dep": "AI;;;;;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.stanford.edu;https://www.princeton.edu;https://www.utexas.edu;https://www.washington.edu;https://www.harvard.edu", "aff_unique_abbr": "MSR;Stanford;Princeton;UT Austin;UW;Harvard", "aff_campus_unique_index": "1+2;1;2", "aff_campus_unique": ";Stanford;Austin", "aff_country_unique_index": "0+0+0+0+0+0;0;0", "aff_country_unique": "United States" }, { "title": "A Deep Reinforcement Learning Perspective on Internet Congestion Control", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4028", "id": "4028", "author_site": "Nathan Jay, Noga H. Rotman, Brighten Godfrey, Michael Schapira, Aviv Tamar", "author": "Nathan Jay; Noga Rotman; Brighten Godfrey; Michael Schapira; Aviv Tamar", "abstract": "We present and investigate a novel and timely application domain for deep reinforcement learning (RL): Internet congestion control. Congestion control is the core networking task of modulating traffic sources\u2019 data-transmission rates to efficiently utilize network capacity, and is the subject of extensive attention in light of the advent of Internet services such as live video, virtual reality, Internet-of-Things, and more. We show that casting congestion control as RL enables training deep network policies that capture intricate patterns in data traffic and network conditions, and leverage this to outperform the state-of-the-art. We also highlight significant challenges facing real-world adoption of RL-based congestion control, including fairness, safety, and generalization, which are not trivial to address within conventional RL formalism. 
To facilitate further research and reproducibility of our results, we present a test suite for RL-guided congestion control based on the OpenAI Gym interface.", "bibtex": "@InProceedings{pmlr-v97-jay19a,\n title = \t {A Deep Reinforcement Learning Perspective on Internet Congestion Control},\n author = {Jay, Nathan and Rotman, Noga and Godfrey, Brighten and Schapira, Michael and Tamar, Aviv},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3050--3059},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jay19a/jay19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jay19a.html},\n abstract = \t {We present and investigate a novel and timely application domain for deep reinforcement learning (RL): Internet congestion control. Congestion control is the core networking task of modulating traffic sources\u2019 data-transmission rates to efficiently utilize network capacity, and is the subject of extensive attention in light of the advent of Internet services such as live video, virtual reality, Internet-of-Things, and more. We show that casting congestion control as RL enables training deep network policies that capture intricate patterns in data traffic and network conditions, and leverage this to outperform the state-of-the-art. We also highlight significant challenges facing real-world adoption of RL-based congestion control, including fairness, safety, and generalization, which are not trivial to address within conventional RL formalism. To facilitate further research and reproducibility of our results, we present a test suite for RL-guided congestion control based on the OpenAI Gym interface.}\n}", "pdf": "http://proceedings.mlr.press/v97/jay19a/jay19a.pdf", "supp": "", "pdf_size": 892033, "gs_citation": 404, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16172700989093189948&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "University of Illinois at Urbana-Champaign; Hebrew University of Jerusalem; University of Illinois at Urbana-Champaign; Hebrew University of Jerusalem; Technion", "aff_domain": "illinois.edu;cs.huji.ac.il; ; ; ", "email": "illinois.edu;cs.huji.ac.il; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/jay19a.html", "aff_unique_index": "0;1;0;1;2", "aff_unique_norm": "University of Illinois Urbana-Champaign;Hebrew University of Jerusalem;Technion - Israel Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://www.huji.ac.il;https://www.technion.ac.il/en/", "aff_unique_abbr": "UIUC;HUJI;Technion", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Urbana-Champaign;Jerusalem;", "aff_country_unique_index": "0;1;0;1;1", "aff_country_unique": "United States;Israel" }, { "title": "A Dynamical Systems Perspective on Nesterov Acceleration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3920", "id": "3920", "author_site": "Michael Muehlebach, Michael Jordan", "author": "Michael Muehlebach; Michael Jordan", "abstract": "We present a dynamical system framework for understanding Nesterov\u2019s accelerated gradient method. In contrast to earlier work, our derivation does not rely on a vanishing step size argument. 
We show that Nesterov acceleration arises from discretizing an ordinary differential equation with a semi-implicit Euler integration scheme. We analyze both the underlying differential equation as well as the discretization to obtain insights into the phenomenon of acceleration. The analysis suggests that a curvature-dependent damping term lies at the heart of the phenomenon. We further establish connections between the discretized and the continuous-time dynamics.", "bibtex": "@InProceedings{pmlr-v97-muehlebach19a,\n title = \t {A Dynamical Systems Perspective on {N}esterov Acceleration},\n author = {Muehlebach, Michael and Jordan, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4656--4662},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/muehlebach19a/muehlebach19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/muehlebach19a.html},\n abstract = \t {We present a dynamical system framework for understanding Nesterov\u2019s accelerated gradient method. In contrast to earlier work, our derivation does not rely on a vanishing step size argument. We show that Nesterov acceleration arises from discretizing an ordinary differential equation with a semi-implicit Euler integration scheme. We analyze both the underlying differential equation as well as the discretization to obtain insights into the phenomenon of acceleration. The analysis suggests that a curvature-dependent damping term lies at the heart of the phenomenon. We further establish connections between the discretized and the continuous-time dynamics.}\n}", "pdf": "http://proceedings.mlr.press/v97/muehlebach19a/muehlebach19a.pdf", "supp": "", "pdf_size": 1034869, "gs_citation": 147, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10427249069932476609&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Electrical Engineering and Computer Science Department, UC Berkeley; Electrical Engineering and Computer Science Department, UC Berkeley", "aff_domain": "berkeley.edu; ", "email": "berkeley.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/muehlebach19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Electrical Engineering and Computer Science Department", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Framework for Bayesian Optimization in Embedded Subspaces", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4216", "id": "4216", "author_site": "Amin Nayebi, Alexander Munteanu, Matthias Poloczek", "author": "Amin Nayebi; Alexander Munteanu; Matthias Poloczek", "abstract": "We present a theoretically founded approach for high-dimensional Bayesian optimization based on low-dimensional subspace embeddings. We prove that the error in the Gaussian process model is bounded tightly when going from the original high-dimensional search domain to the low-dimensional embedding. 
This implies that the optimization process in the low-dimensional embedding proceeds essentially as if it were run directly on an unknown active subspace of low dimensionality. The argument applies to a large class of algorithms and GP models, including non-stationary kernels. Moreover, we provide an efficient implementation based on hashing and demonstrate empirically that this subspace embedding achieves considerably better results than the previously proposed methods for high-dimensional BO based on Gaussian matrix projections and structure-learning.", "bibtex": "@InProceedings{pmlr-v97-nayebi19a,\n title = \t {A Framework for {B}ayesian Optimization in Embedded Subspaces},\n author = {Nayebi, Amin and Munteanu, Alexander and Poloczek, Matthias},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4752--4761},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nayebi19a/nayebi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nayebi19a.html},\n abstract = \t {We present a theoretically founded approach for high-dimensional Bayesian optimization based on low-dimensional subspace embeddings. We prove that the error in the Gaussian process model is bounded tightly when going from the original high-dimensional search domain to the low-dimensional embedding. This implies that the optimization process in the low-dimensional embedding proceeds essentially as if it were run directly on an unknown active subspace of low dimensionality. The argument applies to a large class of algorithms and GP models, including non-stationary kernels. 
Moreover, we provide an efficient implementation based on hashing and demonstrate empirically that this subspace embedding achieves considerably better results than the previously proposed methods for high-dimensional BO based on Gaussian matrix projections and structure-learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/nayebi19a/nayebi19a.pdf", "supp": "", "pdf_size": 720133, "gs_citation": 200, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10400464070686203612&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Dortmund Data Science Center, Faculties of Statistics and Computer Science, TU Dortmund, Dortmund, Germany+Department of Systems and Industrial Engineering, University of Arizona, Tucson, AZ, USA; Department of Systems and Industrial Engineering, University of Arizona, Tucson, AZ, USA+Uber AI Labs, San Francisco, CA, USA; Uber AI Labs, San Francisco, CA, USA", "aff_domain": "uber.com; ;uber.com", "email": "uber.com; ;uber.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/nayebi19a.html", "aff_unique_index": "0+1;1+2;2", "aff_unique_norm": "TU Dortmund;University of Arizona;Uber AI Labs", "aff_unique_dep": "Faculties of Statistics and Computer Science;Department of Systems and Industrial Engineering;AI Labs", "aff_unique_url": "https://www.tu-dortmund.de;https://www.arizona.edu;https://www.uber.com", "aff_unique_abbr": "TUD;UArizona;Uber AI Labs", "aff_campus_unique_index": "0+1;1+2;2", "aff_campus_unique": "Dortmund;Tucson;San Francisco", "aff_country_unique_index": "0+1;1+1;1", "aff_country_unique": "Germany;United States" }, { "title": "A Gradual, Semi-Discrete Approach to Generative Network Training via Explicit Wasserstein Minimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4293", "id": "4293", "author_site": "Yucheng Chen, Matus Telgarsky, Chao Zhang, Bolton Bailey, Daniel Hsu, Jian Peng", "author": "Yucheng Chen; Matus Telgarsky; Chao Zhang; Bolton Bailey; Daniel Hsu; Jian Peng", "abstract": "This paper provides a simple procedure to fit generative networks to target distributions, with the goal of a small Wasserstein distance (or other optimal transport costs). The approach is based on two principles: (a) if the source randomness of the network is a continuous distribution (the \"semi-discrete\" setting), then the Wasserstein distance is realized by a deterministic optimal transport mapping; (b) given an optimal transport mapping between a generator network and a target distribution, the Wasserstein distance may be decreased via a regression between the generated data and the mapped target points. The procedure here therefore alternates these two steps, forming an optimal transport and regressing against it, gradually adjusting the generator network towards the target distribution. Mathematically, this approach is shown to minimize the Wasserstein distance to both the empirical target distribution, and also its underlying population counterpart. Empirically, good performance is demonstrated on the training and testing sets of the MNIST and Thin-8 data. 
The paper closes with a discussion of the unsuitability of the Wasserstein distance for certain tasks, as has been identified in prior work (Arora et al., 2017; Huang et al., 2017).", "bibtex": "@InProceedings{pmlr-v97-chen19h,\n title = \t {A Gradual, Semi-Discrete Approach to Generative Network Training via Explicit {W}asserstein Minimization},\n author = {Chen, Yucheng and Telgarsky, Matus and Zhang, Chao and Bailey, Bolton and Hsu, Daniel and Peng, Jian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1071--1080},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19h/chen19h.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19h.html},\n abstract = \t {This paper provides a simple procedure to fit generative networks to target distributions, with the goal of a small Wasserstein distance (or other optimal transport costs). The approach is based on two principles: (a) if the source randomness of the network is a continuous distribution (the \"semi-discrete\" setting), then the Wasserstein distance is realized by a deterministic optimal transport mapping; (b) given an optimal transport mapping between a generator network and a target distribution, the Wasserstein distance may be decreased via a regression between the generated data and the mapped target points. The procedure here therefore alternates these two steps, forming an optimal transport and regressing against it, gradually adjusting the generator network towards the target distribution. Mathematically, this approach is shown to minimize the Wasserstein distance to both the empirical target distribution, and also its underlying population counterpart. Empirically, good performance is demonstrated on the training and testing sets of the MNIST and Thin-8 data. 
The paper closes with a discussion of the unsuitability of the Wasserstein distance for certain tasks, as has been identified in prior work (Arora et al., 2017; Huang et al., 2017).}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19h/chen19h.pdf", "supp": "", "pdf_size": 1708954, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11878735819424652238&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Department of Computer Science, University of Illinois at Urbana-Champaign; Department of Computer Science, University of Illinois at Urbana-Champaign; Department of Computer Science, University of Illinois at Urbana-Champaign; Department of Computer Science, University of Illinois at Urbana-Champaign; Department of Computer Science, Columbia University; Department of Computer Science, University of Illinois at Urbana-Champaign", "aff_domain": "illinois.edu; ; ; ; ; ", "email": "illinois.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/chen19h.html", "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Columbia University", "aff_unique_dep": "Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://illinois.edu;https://www.columbia.edu", "aff_unique_abbr": "UIUC;Columbia", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Kernel Perspective for Regularizing Deep Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3832", "id": "3832", "author_site": "Alberto Bietti, Gregoire Mialon, Dexiong Chen, Julien Mairal", "author": "Alberto Bietti; Gr\u00e9goire Mialon; Dexiong Chen; Julien Mairal", "abstract": "We propose a new point of view for regularizing deep neural networks by using the norm of a reproducing kernel Hilbert space (RKHS). Even though this norm cannot be computed, it admits upper and lower approximations leading to various practical strategies. Specifically, this perspective (i) provides a common umbrella for many existing regularization principles, including spectral norm and gradient penalties, or adversarial training, (ii) leads to new effective regularization penalties, and (iii) suggests hybrid strategies combining lower and upper bounds to get better approximations of the RKHS norm. We experimentally show this approach to be effective when learning on small datasets, or to obtain adversarially robust models.", "bibtex": "@InProceedings{pmlr-v97-bietti19a,\n title = \t {A Kernel Perspective for Regularizing Deep Neural Networks},\n author = {Bietti, Alberto and Mialon, Gr{\\'e}goire and Chen, Dexiong and Mairal, Julien},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {664--674},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bietti19a/bietti19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bietti19a.html},\n abstract = \t {We propose a new point of view for regularizing deep neural networks by using the norm of a reproducing kernel Hilbert space (RKHS). 
Even though this norm cannot be computed, it admits upper and lower approximations leading to various practical strategies. Specifically, this perspective (i) provides a common umbrella for many existing regularization principles, including spectral norm and gradient penalties, or adversarial training, (ii) leads to new effective regularization penalties, and (iii) suggests hybrid strategies combining lower and upper bounds to get better approximations of the RKHS norm. We experimentally show this approach to be effective when learning on small datasets, or to obtain adversarially robust models.}\n}", "pdf": "http://proceedings.mlr.press/v97/bietti19a/bietti19a.pdf", "supp": "", "pdf_size": 356259, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17149885341490741277&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France+D\u00e9partement d\u2019informatique de l\u2019ENS, ENS, CNRS, Inria, PSL, 75005 Paris, France; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France+D\u00e9partement d\u2019informatique de l\u2019ENS, ENS, CNRS, Inria, PSL, 75005 Paris, France; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France", "aff_domain": "inria.fr;inria.fr;inria.fr;inria.fr", "email": "inria.fr;inria.fr;inria.fr;inria.fr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/bietti19a.html", "aff_unique_index": "0+1;0+1;0;0", "aff_unique_norm": "Universite Grenoble Alpes;\u00c9cole Normale Sup\u00e9rieure", "aff_unique_dep": ";D\u00e9partement d\u2019informatique", "aff_unique_url": "https://www.univ-grenoble-alpes.fr;https://www.ens.fr", "aff_unique_abbr": "UGA;ENS", "aff_campus_unique_index": "0+1;0+1;0;0", "aff_campus_unique": "Grenoble;Paris", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "France" }, { "title": "A Kernel Theory of Modern Data Augmentation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3611", "id": "3611", "author_site": "Tri Dao, Albert Gu, Alexander J Ratner, Virginia Smith, Christopher De Sa, Christopher Re", "author": "Tri Dao; Albert Gu; Alexander Ratner; Virginia Smith; Chris De Sa; Christopher Re", "abstract": "Data augmentation, a technique in which a training set is expanded with class-preserving transformations, is ubiquitous in modern machine learning pipelines. In this paper, we seek to establish a theoretical framework for understanding data augmentation. We approach this from two directions: First, we provide a general model of augmentation as a Markov process, and show that kernels appear naturally with respect to this model, even when we do not employ kernel classification. Next, we analyze more directly the effect of augmentation on kernel classifiers, showing that data augmentation can be approximated by first-order feature averaging and second-order variance regularization components. These frameworks both serve to illustrate the ways in which data augmentation affects the downstream learning model, and the resulting analyses provide novel connections between prior work in invariant kernels, tangent propagation, and robust optimization. 
Finally, we provide several proof-of-concept applications showing that our theory can be useful for accelerating machine learning workflows, such as reducing the amount of computation needed to train using augmented data, and predicting the utility of a transformation prior to training.", "bibtex": "@InProceedings{pmlr-v97-dao19b,\n title = \t {A Kernel Theory of Modern Data Augmentation},\n author = {Dao, Tri and Gu, Albert and Ratner, Alexander and Smith, Virginia and De Sa, Chris and Re, Christopher},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1528--1537},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/dao19b/dao19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/dao19b.html},\n abstract = \t {Data augmentation, a technique in which a training set is expanded with class-preserving transformations, is ubiquitous in modern machine learning pipelines. In this paper, we seek to establish a theoretical framework for understanding data augmentation. We approach this from two directions: First, we provide a general model of augmentation as a Markov process, and show that kernels appear naturally with respect to this model, even when we do not employ kernel classification. Next, we analyze more directly the effect of augmentation on kernel classifiers, showing that data augmentation can be approximated by first-order feature averaging and second-order variance regularization components. These frameworks both serve to illustrate the ways in which data augmentation affects the downstream learning model, and the resulting analyses provide novel connections between prior work in invariant kernels, tangent propagation, and robust optimization. 
Finally, we provide several proof-of-concept applications showing that our theory can be useful for accelerating machine learning workflows, such as reducing the amount of computation needed to train using augmented data, and predicting the utility of a transformation prior to training.}\n}", "pdf": "http://proceedings.mlr.press/v97/dao19b/dao19b.pdf", "supp": "", "pdf_size": 1438866, "gs_citation": 250, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15647761627191229165&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Computer Science, Stanford University, California, USA; Department of Computer Science, Stanford University, California, USA; Department of Computer Science, Stanford University, California, USA; Department of Electrical and Computer Engineering, Carnegie Mellon University, Pennsylvania, USA; Department of Computer Science, Cornell University, New York, USA; Department of Computer Science, Stanford University, California, USA", "aff_domain": "cs.stanford.edu; ; ; ; ; ", "email": "cs.stanford.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/dao19b.html", "aff_unique_index": "0;0;0;1;2;0", "aff_unique_norm": "Stanford University;Carnegie Mellon University;Cornell University", "aff_unique_dep": "Department of Computer Science;Department of Electrical and Computer Engineering;Department of Computer Science", "aff_unique_url": "https://www.stanford.edu;https://www.cmu.edu;https://www.cornell.edu", "aff_unique_abbr": "Stanford;CMU;Cornell", "aff_campus_unique_index": "0;0;0;1;2;0", "aff_campus_unique": "California;Pennsylvania;New York", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Large-Scale Study on Regularization and Normalization in GANs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4146", "id": "4146", "author_site": "Karol Kurach, Mario Lucic, Xiaohua Zhai, Marcin Michalski, Sylvain Gelly", "author": "Karol Kurach; Mario Lu\u010di\u0107; Xiaohua Zhai; Marcin Michalski; Sylvain Gelly", "abstract": "Generative adversarial networks (GANs) are a class of deep generative models which aim to learn a target distribution in an unsupervised fashion. While they were successfully applied to many problems, training a GAN is a notoriously challenging task and requires a significant number of hyperparameter tuning, neural architecture engineering, and a non-trivial amount of \u201ctricks\". The success in many practical applications coupled with the lack of a measure to quantify the failure modes of GANs resulted in a plethora of proposed losses, regularization and normalization schemes, as well as neural architectures. In this work we take a sober view of the current state of GANs from a practical perspective. 
We discuss and evaluate common pitfalls and reproducibility issues, open-source our code on Github, and provide pre-trained models on TensorFlow Hub.", "bibtex": "@InProceedings{pmlr-v97-kurach19a,\n title = \t {A Large-Scale Study on Regularization and Normalization in {GAN}s},\n author = {Kurach, Karol and Lu{\\v{c}}i{\\'c}, Mario and Zhai, Xiaohua and Michalski, Marcin and Gelly, Sylvain},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3581--3590},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kurach19a/kurach19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kurach19a.html},\n abstract = \t {Generative adversarial networks (GANs) are a class of deep generative models which aim to learn a target distribution in an unsupervised fashion. While they were successfully applied to many problems, training a GAN is a notoriously challenging task and requires a significant number of hyperparameter tuning, neural architecture engineering, and a non-trivial amount of \u201ctricks\". The success in many practical applications coupled with the lack of a measure to quantify the failure modes of GANs resulted in a plethora of proposed losses, regularization and normalization schemes, as well as neural architectures. In this work we take a sober view of the current state of GANs from a practical perspective. We discuss and evaluate common pitfalls and reproducibility issues, open-source our code on Github, and provide pre-trained models on TensorFlow Hub.}\n}", "pdf": "http://proceedings.mlr.press/v97/kurach19a/kurach19a.pdf", "supp": "", "pdf_size": 855190, "gs_citation": 225, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2102263768032678612&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team", "aff_domain": "google.com;google.com; ; ; ", "email": "google.com;google.com; ; ; ", "github": "www.github.com/google/compare_gan", "project": "www.tensorflow.org/hub", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/kurach19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Multitask Multiple Kernel Learning Algorithm for Survival Analysis with Application to Cancer Biology", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4219", "id": "4219", "author_site": "Onur Dereli, Ceyda O\u011fuz, Mehmet G\u00f6nen", "author": "Onur Dereli; Ceyda O\u011fuz; Mehmet G\u00f6nen", "abstract": "Predictive performance of machine learning algorithms on related problems can be improved using multitask learning approaches. Rather than performing survival analysis on each data set to predict survival times of cancer patients, we developed a novel multitask approach based on multiple kernel learning (MKL). Our multitask MKL algorithm both works on multiple cancer data sets and integrates cancer-related pathways/gene sets into survival analysis. 
We tested our algorithm, which is named as Path2MSurv, on the Cancer Genome Atlas data sets analyzing gene expression profiles of 7,655 patients from 20 cancer types together with cancer-specific pathway/gene set collections. Path2MSurv obtained better or comparable predictive performance when benchmarked against random survival forest, survival support vector machine, and single-task variant of our algorithm. Path2MSurv has the ability to identify key pathways/gene sets in predicting survival times of patients from different cancer types.", "bibtex": "@InProceedings{pmlr-v97-dereli19a,\n title = \t {A Multitask Multiple Kernel Learning Algorithm for Survival Analysis with Application to Cancer Biology},\n author = {Dereli, Onur and O{\\u{g}}uz, Ceyda and G{\\\"o}nen, Mehmet},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1576--1585},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/dereli19a/dereli19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/dereli19a.html},\n abstract = \t {Predictive performance of machine learning algorithms on related problems can be improved using multitask learning approaches. Rather than performing survival analysis on each data set to predict survival times of cancer patients, we developed a novel multitask approach based on multiple kernel learning (MKL). Our multitask MKL algorithm both works on multiple cancer data sets and integrates cancer-related pathways/gene sets into survival analysis. We tested our algorithm, which is named as Path2MSurv, on the Cancer Genome Atlas data sets analyzing gene expression profiles of 7,655 patients from 20 cancer types together with cancer-specific pathway/gene set collections. Path2MSurv obtained better or comparable predictive performance when benchmarked against random survival forest, survival support vector machine, and single-task variant of our algorithm. 
Path2MSurv has the ability to identify key pathways/gene sets in predicting survival times of patients from different cancer types.}\n}", "pdf": "http://proceedings.mlr.press/v97/dereli19a/dereli19a.pdf", "supp": "", "pdf_size": 875415, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18419795599745081925&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Graduate School of Sciences and Engineering, Koc University, Istanbul 34450, Turkey; Department of Industrial Engineering, College of Engineering, Koc University, Istanbul 34450, Turkey; School of Medicine, Koc University, Istanbul 34450, Turkey + Department of Biomedical Engineering, School of Medicine, Oregon Health & Science University, Portland, OR 97239, USA", "aff_domain": "ku.edu.tr; ; ", "email": "ku.edu.tr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/dereli19a.html", "aff_unique_index": "0;0;0+1", "aff_unique_norm": "Koc University;Oregon Health & Science University", "aff_unique_dep": "Graduate School of Sciences and Engineering;Department of Biomedical Engineering", "aff_unique_url": "https://www.ku.edu.tr;https://www.ohsu.edu", "aff_unique_abbr": "Koc U;OHSU", "aff_campus_unique_index": "0;0;0+1", "aff_campus_unique": "Istanbul;Portland", "aff_country_unique_index": "0;0;0+1", "aff_country_unique": "T\u00fcrkiye;United States" }, { "title": "A Persistent Weisfeiler-Lehman Procedure for Graph Classification", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3725", "id": "3725", "author_site": "Bastian Rieck, Christian Bock, Karsten Borgwardt", "author": "Bastian Rieck; Christian Bock; Karsten Borgwardt", "abstract": "The Weisfeiler\u2013Lehman graph kernel exhibits competitive performance in many graph classification tasks. However, its subtree features are not able to capture connected components and cycles, topological features known for characterising graphs. To extract such features, we leverage propagated node label information and transform unweighted graphs into metric ones. This permits us to augment the subtree features with topological information obtained using persistent homology, a concept from topological data analysis. Our method, which we formalise as a generalisation of Weisfeiler\u2013Lehman subtree features, exhibits favourable classification accuracy and its improvements in predictive performance are mainly driven by including cycle information.", "bibtex": "@InProceedings{pmlr-v97-rieck19a,\n title = \t {A Persistent Weisfeiler-Lehman Procedure for Graph Classification},\n author = {Rieck, Bastian and Bock, Christian and Borgwardt, Karsten},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5448--5458},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rieck19a/rieck19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rieck19a.html},\n abstract = \t {The Weisfeiler\u2013Lehman graph kernel exhibits competitive performance in many graph classification tasks. However, its subtree features are not able to capture connected components and cycles, topological features known for characterising graphs. To extract such features, we leverage propagated node label information and transform unweighted graphs into metric ones. 
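As context for the Rieck et al. abstract above: the Weisfeiler-Lehman subtree features it builds on come from iterated relabelling, where each node's label is replaced by a compression of its own label together with the multiset of its neighbours' labels. A minimal sketch of that base procedure follows (plain Python; the graph representation and the use of `hash` for compression are illustrative assumptions, not the authors' implementation):

```python
from collections import Counter

def wl_iteration(adjacency, labels):
    """One Weisfeiler-Lehman relabelling step.

    adjacency: dict mapping node -> list of neighbour nodes
    labels:    dict mapping node -> current (hashable) label
    Returns a dict with the compressed labels for the next iteration.
    """
    new_labels = {}
    for node, neighbours in adjacency.items():
        # Combine the node's label with the sorted multiset of neighbour labels.
        signature = (labels[node], tuple(sorted(labels[n] for n in neighbours)))
        new_labels[node] = hash(signature)  # any injective relabelling would do
    return new_labels

def wl_subtree_features(adjacency, labels, iterations=3):
    """Histogram of labels accumulated over several WL iterations."""
    feature = Counter(labels.values())
    for _ in range(iterations):
        labels = wl_iteration(adjacency, labels)
        feature.update(labels.values())
    return feature

# Tiny usage example: a 4-cycle with uniform initial labels.
adj = {0: [1, 3], 1: [0, 2], 2: [1, 3], 3: [2, 0]}
print(wl_subtree_features(adj, {v: 0 for v in adj}))
```

The persistent-homology augmentation described in the abstract is built on top of these propagated labels and is not sketched here.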
This permits us to augment the subtree features with topological information obtained using persistent homology, a concept from topological data analysis. Our method, which we formalise as a generalisation of Weisfeiler\u2013Lehman subtree features, exhibits favourable classification accuracy and its improvements in predictive performance are mainly driven by including cycle information.}\n}", "pdf": "http://proceedings.mlr.press/v97/rieck19a/rieck19a.pdf", "supp": "", "pdf_size": 137210, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7520360655916275736&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Biosystems Science and Engineering, ETH Zurich, 4058 Basel, Switzerland; Department of Biosystems Science and Engineering, ETH Zurich, 4058 Basel, Switzerland; Department of Biosystems Science and Engineering, ETH Zurich, 4058 Basel, Switzerland", "aff_domain": "bsse.ethz.ch; ;bsse.ethz.ch", "email": "bsse.ethz.ch; ;bsse.ethz.ch", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/rieck19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Biosystems Science and Engineering", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Basel", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "A Personalized Affective Memory Model for Improving Emotion Recognition", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3828", "id": "3828", "author_site": "Pablo Barros, German Parisi, Stefan Wermter", "author": "Pablo Barros; German Parisi; Stefan Wermter", "abstract": "Recent models of emotion recognition strongly rely on supervised deep learning solutions for the distinction of general emotion expressions. However, they are not reliable when recognizing online and personalized facial expressions, e.g., for person-specific affective understanding. In this paper, we present a neural model based on a conditional adversarial autoencoder to learn how to represent and edit general emotion expressions. We then propose Grow-When-Required networks as personalized affective memories to learn individualized aspects of emotional expressions. Our model achieves state-of-the-art performance on emotion recognition when evaluated on in-the-wild datasets. Furthermore, our experiments include ablation studies and neural visualizations in order to explain the behavior of our model.", "bibtex": "@InProceedings{pmlr-v97-barros19a,\n title = \t {A Personalized Affective Memory Model for Improving Emotion Recognition},\n author = {Barros, Pablo and Parisi, German and Wermter, Stefan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {485--494},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/barros19a/barros19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/barros19a.html},\n abstract = \t {Recent models of emotion recognition strongly rely on supervised deep learning solutions for the distinction of general emotion expressions. However, they are not reliable when recognizing online and personalized facial expressions, e.g., for person-specific affective understanding. 
In this paper, we present a neural model based on a conditional adversarial autoencoder to learn how to represent and edit general emotion expressions. We then propose Grow-When-Required networks as personalized affective memories to learn individualized aspects of emotional expressions. Our model achieves state-of-the-art performance on emotion recognition when evaluated on in-the-wild datasets. Furthermore, our experiments include ablation studies and neural visualizations in order to explain the behavior of our model.}\n}", "pdf": "http://proceedings.mlr.press/v97/barros19a/barros19a.pdf", "supp": "", "pdf_size": 1884797, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=719517707256328772&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "aff": "Knowledge Technology, Department of Informatics, University of Hamburg, Germany+Apprente Inc, Montain View, CA, USA; Knowledge Technology, Department of Informatics, University of Hamburg, Germany; Knowledge Technology, Department of Informatics, University of Hamburg, Germany", "aff_domain": "informatik.uni-hamburg.de; ; ", "email": "informatik.uni-hamburg.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/barros19a.html", "aff_unique_index": "0+1;0;0", "aff_unique_norm": "University of Hamburg;Apprente Inc", "aff_unique_dep": "Department of Informatics;", "aff_unique_url": "https://www.uni-hamburg.de;", "aff_unique_abbr": ";", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+1;0;0", "aff_country_unique": "Germany;United States" }, { "title": "A Polynomial Time MCMC Method for Sampling from Continuous Determinantal Point Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3982", "id": "3982", "author_site": "Alireza Rezaei, Shayan Oveis Gharan", "author": "Alireza Rezaei; Shayan Oveis Gharan", "abstract": "We study the Gibbs sampling algorithm for discrete and continuous $k$-determinantal point processes. We show that in both cases, the spectral gap of the chain is bounded by a polynomial of $k$ and it is independent of the size of the domain. As an immediate corollary, we obtain sublinear time algorithms for sampling from discrete $k$-DPPs given access to polynomially many processors. In the continuous setting, our result leads to the first class of rigorously analyzed efficient algorithms to generate random samples of continuous $k$-DPPs. We achieve this by showing that the Gibbs sampler for a large family of continuous $k$-DPPs can be simulated efficiently when the spectrum is not concentrated on the top $k$ eigenvalues.", "bibtex": "@InProceedings{pmlr-v97-rezaei19a,\n title = \t {A Polynomial Time {MCMC} Method for Sampling from Continuous Determinantal Point Processes},\n author = {Rezaei, Alireza and Gharan, Shayan Oveis},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5438--5447},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rezaei19a/rezaei19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rezaei19a.html},\n abstract = \t {We study the Gibbs sampling algorithm for discrete and continuous $k$-determinantal point processes. 
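As context for the Rezaei and Oveis Gharan abstract above: in the discrete case, the Gibbs chain for a \(k\)-DPP with kernel \(L\) keeps a size-\(k\) set \(S\) and repeatedly resamples one element by exchanging it with an element outside \(S\), in proportion to the principal-minor determinants. A minimal numpy sketch of that swap chain follows (not the authors' code, and it omits the continuous-domain machinery that is the paper's main contribution):

```python
import numpy as np

def kdpp_gibbs_swap(L, k, steps, rng=None):
    """Swap-based Gibbs chain targeting P(S) proportional to det(L[S, S]), |S| = k."""
    rng = np.random.default_rng(rng)
    n = L.shape[0]
    S = list(rng.choice(n, size=k, replace=False))  # arbitrary initial size-k set
    det_S = np.linalg.det(L[np.ix_(S, S)])
    for _ in range(steps):
        i = rng.integers(k)                            # position inside S to drop
        j = rng.choice(list(set(range(n)) - set(S)))   # element outside S to add
        T = S.copy()
        T[i] = j
        det_T = np.linalg.det(L[np.ix_(T, T)])
        # Conditioned on the rest of the set, resample which of the two candidates is kept.
        if rng.random() < det_T / (det_S + det_T):
            S, det_S = T, det_T
    return S

# Tiny usage example with a random PSD kernel.
A = np.random.default_rng(0).normal(size=(8, 8))
L = A @ A.T
print(kdpp_gibbs_swap(L, k=3, steps=200, rng=0))
```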
We show that in both cases, the spectral gap of the chain is bounded by a polynomial of $k$ and it is independent of the size of the domain. As an immediate corollary, we obtain sublinear time algorithms for sampling from discrete $k$-DPPs given access to polynomially many processors. In the continuous setting, our result leads to the first class of rigorously analyzed efficient algorithms to generate random samples of continuous $k$-DPPs. We achieve this by showing that the Gibbs sampler for a large family of continuous $k$-DPPs can be simulated efficiently when the spectrum is not concentrated on the top $k$ eigenvalues.}\n}", "pdf": "http://proceedings.mlr.press/v97/rezaei19a/rezaei19a.pdf", "supp": "", "pdf_size": 424029, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16700510190642348534&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle, WA, USA; Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle, WA, USA", "aff_domain": "cs.washington.edu;cs.washington.edu", "email": "cs.washington.edu;cs.washington.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/rezaei19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "Paul G. Allen School of Computer Science and Engineering", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seattle", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Quantitative Analysis of the Effect of Batch Normalization on Gradient Descent", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3821", "id": "3821", "author_site": "YongQiang Cai, Qianxiao Li, Zuowei Shen", "author": "Yongqiang Cai; Qianxiao Li; Zuowei Shen", "abstract": "Despite its empirical success and recent theoretical progress, there generally lacks a quantitative analysis of the effect of batch normalization (BN) on the convergence and stability of gradient descent. In this paper, we provide such an analysis on the simple problem of ordinary least squares (OLS), where the precise dynamical properties of gradient descent (GD) is completely known, thus allowing us to isolate and compare the additional effects of BN. More precisely, we show that unlike GD, gradient descent with BN (BNGD) converges for arbitrary learning rates for the weights, and the convergence remains linear under mild conditions. Moreover, we quantify two different sources of acceleration of BNGD over GD \u2013 one due to over-parameterization which improves the effective condition number and another due having a large range of learning rates giving rise to fast descent. These phenomena set BNGD apart from GD and could account for much of its robustness properties. 
These findings are confirmed quantitatively by numerical experiments, which further show that many of the uncovered properties of BNGD in OLS are also observed qualitatively in more complex supervised learning problems.", "bibtex": "@InProceedings{pmlr-v97-cai19a,\n title = \t {A Quantitative Analysis of the Effect of Batch Normalization on Gradient Descent},\n author = {Cai, Yongqiang and Li, Qianxiao and Shen, Zuowei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {882--890},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cai19a/cai19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cai19a.html},\n abstract = \t {Despite its empirical success and recent theoretical progress, there generally lacks a quantitative analysis of the effect of batch normalization (BN) on the convergence and stability of gradient descent. In this paper, we provide such an analysis on the simple problem of ordinary least squares (OLS), where the precise dynamical properties of gradient descent (GD) is completely known, thus allowing us to isolate and compare the additional effects of BN. More precisely, we show that unlike GD, gradient descent with BN (BNGD) converges for arbitrary learning rates for the weights, and the convergence remains linear under mild conditions. Moreover, we quantify two different sources of acceleration of BNGD over GD \u2013 one due to over-parameterization which improves the effective condition number and another due having a large range of learning rates giving rise to fast descent. These phenomena set BNGD apart from GD and could account for much of its robustness properties. 
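As context for the Cai et al. abstract above: a common way to instantiate batch normalization on ordinary least squares, and to the best of this reading the kind of setup analyzed there (the exact parameterization is an assumption), is to normalize the linear output by its standard deviation under the data distribution and introduce a trainable scale,
\[ \min_{a,\,w}\; \tfrac{1}{2}\,\mathbb{E}\Big[\Big(y - a\,\tfrac{x^{\top} w}{\sqrt{w^{\top} H w}}\Big)^{2}\Big], \qquad H = \mathbb{E}[x x^{\top}], \]
so that rescaling \(w\) leaves the objective unchanged, which is one way to see why the weight learning rate can be taken arbitrarily large, as claimed above.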
These findings are confirmed quantitatively by numerical experiments, which further show that many of the uncovered properties of BNGD in OLS are also observed qualitatively in more complex supervised learning problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/cai19a/cai19a.pdf", "supp": "", "pdf_size": 935846, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13059647606456259294&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Mathematics, National University of Singapore, Singapore; Department of Mathematics, National University of Singapore, Singapore + Institute of High Performance Computing, A*STAR, Singapore; Department of Mathematics, National University of Singapore, Singapore", "aff_domain": "nus.edu.sg;nus.edu.sg;nus.edu.sg", "email": "nus.edu.sg;nus.edu.sg;nus.edu.sg", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/cai19a.html", "aff_unique_index": "0;0+1;0", "aff_unique_norm": "National University of Singapore;A*STAR Institute of High Performance Computing", "aff_unique_dep": "Department of Mathematics;Institute of High Performance Computing", "aff_unique_url": "https://www.nus.edu.sg;https://www.ihpc.a-star.edu.sg", "aff_unique_abbr": "NUS;IHPC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0", "aff_country_unique": "Singapore" }, { "title": "A Recurrent Neural Cascade-based Model for Continuous-Time Diffusion", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3646", "id": "3646", "author": "Sylvain Lamprier", "abstract": "Many works have been proposed in the literature to capture the dynamics of diffusion in networks. While some of them define graphical Markovian models to extract temporal relationships between node infections in networks, others consider diffusion episodes as sequences of infections via recurrent neural models. In this paper we propose a model at the crossroads of these two extremes, which embeds the history of diffusion in infected nodes as hidden continuous states. Depending on the trajectory followed by the content before reaching a given node, the distribution of influence probabilities may vary. However, content trajectories are usually hidden in the data, which induces challenging learning problems. We propose a topological recurrent neural model which exhibits good experimental performances for diffusion modeling and prediction.", "bibtex": "@InProceedings{pmlr-v97-lamprier19a,\n title = \t {A Recurrent Neural Cascade-based Model for Continuous-Time Diffusion},\n author = {Lamprier, Sylvain},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3632--3641},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lamprier19a/lamprier19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lamprier19a.html},\n abstract = \t {Many works have been proposed in the literature to capture the dynamics of diffusion in networks. While some of them define graphical Markovian models to extract temporal relationships between node infections in networks, others consider diffusion episodes as sequences of infections via recurrent neural models. 
In this paper we propose a model at the crossroads of these two extremes, which embeds the history of diffusion in infected nodes as hidden continuous states. Depending on the trajectory followed by the content before reaching a given node, the distribution of influence probabilities may vary. However, content trajectories are usually hidden in the data, which induces challenging learning problems. We propose a topological recurrent neural model which exhibits good experimental performances for diffusion modeling and prediction.}\n}", "pdf": "http://proceedings.mlr.press/v97/lamprier19a/lamprier19a.pdf", "supp": "", "pdf_size": 363504, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9654474908920415942&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Sorbonne Universit\u00e9es, LIP6, F-75005, Paris, France", "aff_domain": "lip6.fr", "email": "lip6.fr", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/lamprier19a.html", "aff_unique_index": "0", "aff_unique_norm": "Sorbonne Universit\u00e9", "aff_unique_dep": "LIP6", "aff_unique_url": "https://www.sorbonne-universite.fr", "aff_unique_abbr": "Sorbonne U", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "title": "A Statistical Investigation of Long Memory in Language and Music", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3937", "id": "3937", "author_site": "Alexander Greaves-Tunnell, Zaid Harchaoui", "author": "Alexander Greaves-Tunnell; Zaid Harchaoui", "abstract": "Representation and learning of long-range dependencies is a central challenge confronted in modern applications of machine learning to sequence data. Yet despite the prominence of this issue, the basic problem of measuring long-range dependence, either in a given data source or as represented in a trained deep model, remains largely limited to heuristic tools. We contribute a statistical framework for investigating long-range dependence in current applications of deep sequence modeling, drawing on the well-developed theory of long memory stochastic processes. This framework yields testable implications concerning the relationship between long memory in real-world data and its learned representation in a deep learning architecture, which are explored through a semiparametric framework adapted to the high-dimensional setting.", "bibtex": "@InProceedings{pmlr-v97-greaves-tunnell19a,\n title = \t {A Statistical Investigation of Long Memory in Language and Music},\n author = {Greaves-Tunnell, Alexander and Harchaoui, Zaid},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2394--2403},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/greaves-tunnell19a/greaves-tunnell19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/greaves-tunnell19a.html},\n abstract = \t {Representation and learning of long-range dependencies is a central challenge confronted in modern applications of machine learning to sequence data. Yet despite the prominence of this issue, the basic problem of measuring long-range dependence, either in a given data source or as represented in a trained deep model, remains largely limited to heuristic tools. 
We contribute a statistical framework for investigating long-range dependence in current applications of deep sequence modeling, drawing on the well-developed theory of long memory stochastic processes. This framework yields testable implications concerning the relationship between long memory in real-world data and its learned representation in a deep learning architecture, which are explored through a semiparametric framework adapted to the high-dimensional setting.}\n}", "pdf": "http://proceedings.mlr.press/v97/greaves-tunnell19a/greaves-tunnell19a.pdf", "supp": "", "pdf_size": 316606, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3204260135600784159&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "aff": "Department of Statistics, University of Washington, Seattle, USA; Department of Statistics, University of Washington, Seattle, USA", "aff_domain": "uw.edu; ", "email": "uw.edu; ", "github": "https://github.com/alecgt/RNN_long_memory", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/greaves-tunnell19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seattle", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "A Tail-Index Analysis of Stochastic Gradient Noise in Deep Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3560", "id": "3560", "author_site": "Umut Simsekli, Levent Sagun, Mert Gurbuzbalaban", "author": "Umut Simsekli; Levent Sagun; Mert Gurbuzbalaban", "abstract": "The gradient noise (GN) in the stochastic gradient descent (SGD) algorithm is often considered to be Gaussian in the large data regime by assuming that the classical central limit theorem (CLT) kicks in. This assumption is often made for mathematical convenience, since it enables SGD to be analyzed as a stochastic differential equation (SDE) driven by a Brownian motion. We argue that the Gaussianity assumption might fail to hold in deep learning settings and hence render the Brownian motion-based analyses inappropriate. Inspired by non-Gaussian natural phenomena, we consider the GN in a more general context and invoke the generalized CLT (GCLT), which suggests that the GN converges to a heavy-tailed $\\alpha$-stable random variable. Accordingly, we propose to analyze SGD as an SDE driven by a L\u00e9vy motion. Such SDEs can incur \u2018jumps\u2019, which force the SDE transition from narrow minima to wider minima, as proven by existing metastability theory. To validate the $\\alpha$-stable assumption, we conduct experiments on common deep learning scenarios and show that in all settings, the GN is highly non-Gaussian and admits heavy-tails. We investigate the tail behavior in varying network architectures and sizes, loss functions, and datasets. 
Our results open up a different perspective and shed more light on the belief that SGD prefers wide minima.", "bibtex": "@InProceedings{pmlr-v97-simsekli19a,\n title = \t {A Tail-Index Analysis of Stochastic Gradient Noise in Deep Neural Networks},\n author = {Simsekli, Umut and Sagun, Levent and Gurbuzbalaban, Mert},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5827--5837},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/simsekli19a/simsekli19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/simsekli19a.html},\n abstract = \t {The gradient noise (GN) in the stochastic gradient descent (SGD) algorithm is often considered to be Gaussian in the large data regime by assuming that the classical central limit theorem (CLT) kicks in. This assumption is often made for mathematical convenience, since it enables SGD to be analyzed as a stochastic differential equation (SDE) driven by a Brownian motion. We argue that the Gaussianity assumption might fail to hold in deep learning settings and hence render the Brownian motion-based analyses inappropriate. Inspired by non-Gaussian natural phenomena, we consider the GN in a more general context and invoke the generalized CLT (GCLT), which suggests that the GN converges to a heavy-tailed $\\alpha$-stable random variable. Accordingly, we propose to analyze SGD as an SDE driven by a L\u00e9vy motion. Such SDEs can incur \u2018jumps\u2019, which force the SDE transition from narrow minima to wider minima, as proven by existing metastability theory. To validate the $\\alpha$-stable assumption, we conduct experiments on common deep learning scenarios and show that in all settings, the GN is highly non-Gaussian and admits heavy-tails. We investigate the tail behavior in varying network architectures and sizes, loss functions, and datasets. 
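As context for the Simsekli et al. abstract above: a symmetric \(\alpha\)-stable random variable can be defined through its characteristic function,
\[ \mathbb{E}\big[e^{\mathrm{i} t X}\big] \;=\; \exp\!\big(-\sigma^{\alpha}\,|t|^{\alpha}\big), \qquad 0 < \alpha \le 2, \]
where \(\alpha = 2\) recovers the Gaussian case and \(\alpha < 2\) yields heavy tails with infinite variance; the tail index \(\alpha\) estimated from the gradient noise is the quantity the experiments above measure.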
Our results open up a different perspective and shed more light on the belief that SGD prefers wide minima.}\n}", "pdf": "http://proceedings.mlr.press/v97/simsekli19a/simsekli19a.pdf", "supp": "", "pdf_size": 1772853, "gs_citation": 289, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9609343502114720744&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "LTCI, T\u00b4el\u00b4ecom ParisTech, Universit\u00b4e Paris-Saclay, 75013, Paris, France; Institute of Physics, \u00b4Ecole Polytechnique F\u00b4ed\u00b4erale de Lausanne, 1015 Lausanne, Switzerland; Department of Management Science and Information Systems, Rutgers Business School, NJ 08854, USA", "aff_domain": "telecom-paristech.fr; ; ", "email": "telecom-paristech.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/simsekli19a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "T\u00e9l\u00e9com ParisTech;EPFL;Rutgers Business School", "aff_unique_dep": "LTCI;Institute of Physics;Department of Management Science and Information Systems", "aff_unique_url": "https://www.telecom-paris.fr;https://www.epfl.ch;https://business.rutgers.edu", "aff_unique_abbr": "T\u00e9l\u00e9com ParisTech;EPFL;RBS", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Paris;Lausanne;", "aff_country_unique_index": "0;1;2", "aff_country_unique": "France;Switzerland;United States" }, { "title": "A Theoretical Analysis of Contrastive Unsupervised Representation Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4241", "id": "4241", "author_site": "Nikunj Umesh Saunshi, Orestis Plevrakis, Sanjeev Arora, Mikhail Khodak, Hrishikesh Khandeparkar", "author": "Nikunj Saunshi; Orestis Plevrakis; Sanjeev Arora; Mikhail Khodak; Hrishikesh Khandeparkar", "abstract": "Recent empirical works have successfully used unlabeled data to learn feature representations that are broadly useful in downstream classification tasks. Several of these methods are reminiscent of the well-known word2vec embedding algorithm: leveraging availability of pairs of semantically \u201csimilar\" data points and \u201cnegative samples,\" the learner forces the inner product of representations of similar pairs with each other to be higher on average than with negative samples. The current paper uses the term", "bibtex": "@InProceedings{pmlr-v97-saunshi19a,\n title = \t {A Theoretical Analysis of Contrastive Unsupervised Representation Learning},\n author = {Saunshi, Nikunj and Plevrakis, Orestis and Arora, Sanjeev and Khodak, Mikhail and Khandeparkar, Hrishikesh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5628--5637},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/saunshi19a/saunshi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/saunshi19a.html},\n abstract = \t {Recent empirical works have successfully used unlabeled data to learn feature representations that are broadly useful in downstream classification tasks. 
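As context for the Saunshi et al. abstract above: a representative way to write the contrastive objective sketched there, with one positive \(x^{+}\) and negatives \(x^{-}_{1},\dots,x^{-}_{N}\) per anchor \(x\) and a representation \(f\), is the logistic loss
\[ \mathbb{E}\Big[\log\Big(1 + \sum_{i=1}^{N} \exp\big(f(x)^{\top} f(x^{-}_{i}) - f(x)^{\top} f(x^{+})\big)\Big)\Big], \]
which is driven down when inner products with the positive exceed those with the negatives; this is a standard form rather than necessarily the exact loss analyzed in the paper.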
Several of these methods are reminiscent of the well-known word2vec embedding algorithm: leveraging availability of pairs of semantically \u201csimilar\" data points and \u201cnegative samples,\" the learner forces the inner product of representations of similar pairs with each other to be higher on average than with negative samples. The current paper uses the term", "pdf": "http://proceedings.mlr.press/v97/saunshi19a/saunshi19a.pdf", "supp": "", "pdf_size": 532907, "gs_citation": 933, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8109121500737747479&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Princeton University; Princeton University; Carnegie Mellon University; Princeton University; Princeton University", "aff_domain": "cs.princeton.edu;cs.princeton.edu;cs.cmu.edu;cs.princeton.edu;cs.princeton.edu", "email": "cs.princeton.edu;cs.princeton.edu;cs.cmu.edu;cs.princeton.edu;cs.princeton.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/saunshi19a.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Princeton University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.cmu.edu", "aff_unique_abbr": "Princeton;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "A Theory of Regularized Markov Decision Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4025", "id": "4025", "author_site": "Matthieu Geist, Bruno Scherrer, Olivier Pietquin", "author": "Matthieu Geist; Bruno Scherrer; Olivier Pietquin", "abstract": "Many recent successful (deep) reinforcement learning algorithms make use of regularization, generally based on entropy or Kullback-Leibler divergence. We propose a general theory of regularized Markov Decision Processes that generalizes these approaches in two directions: we consider a larger class of regularizers, and we consider the general modified policy iteration approach, encompassing both policy iteration and value iteration. The core building blocks of this theory are a notion of regularized Bellman operator and the Legendre-Fenchel transform, a classical tool of convex optimization. This approach allows for error propagation analyses of general algorithmic schemes of which (possibly variants of) classical algorithms such as Trust Region Policy Optimization, Soft Q-learning, Stochastic Actor Critic or Dynamic Policy Programming are special cases. This also draws connections to proximal convex optimization, especially to Mirror Descent.", "bibtex": "@InProceedings{pmlr-v97-geist19a,\n title = \t {A Theory of Regularized {M}arkov Decision Processes},\n author = {Geist, Matthieu and Scherrer, Bruno and Pietquin, Olivier},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2160--2169},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/geist19a/geist19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/geist19a.html},\n abstract = \t {Many recent successful (deep) reinforcement learning algorithms make use of regularization, generally based on entropy or Kullback-Leibler divergence. 
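As context for the Geist et al. abstract above: with the particular choice of a scaled negative-entropy regularizer \(\Omega(\pi(\cdot\mid s)) = \tau \sum_{a} \pi(a\mid s)\log \pi(a\mid s)\), the regularized Bellman operator mentioned there has the Legendre-Fenchel (log-sum-exp) closed form
\[ (T_{\Omega} V)(s) \;=\; \tau \log \sum_{a} \exp\!\Big(\tfrac{1}{\tau}\big(r(s,a) + \gamma\,\mathbb{E}_{s'\mid s,a}[V(s')]\big)\Big), \]
which recovers the usual maximum over actions as \(\tau \to 0\); this regularizer is only one instance of the larger class the paper treats.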
We propose a general theory of regularized Markov Decision Processes that generalizes these approaches in two directions: we consider a larger class of regularizers, and we consider the general modified policy iteration approach, encompassing both policy iteration and value iteration. The core building blocks of this theory are a notion of regularized Bellman operator and the Legendre-Fenchel transform, a classical tool of convex optimization. This approach allows for error propagation analyses of general algorithmic schemes of which (possibly variants of) classical algorithms such as Trust Region Policy Optimization, Soft Q-learning, Stochastic Actor Critic or Dynamic Policy Programming are special cases. This also draws connections to proximal convex optimization, especially to Mirror Descent.}\n}", "pdf": "http://proceedings.mlr.press/v97/geist19a/geist19a.pdf", "supp": "", "pdf_size": 274384, "gs_citation": 385, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10797716709757595687&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Google Research, Brain Team; Universit \u00b4e de Lorraine, CNRS, Inria, IECL, F-54000 Nancy, France; Google Research, Brain Team", "aff_domain": "google.com; ;google.com", "email": "google.com; ;google.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/geist19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Google;Universit\u00e9 de Lorraine", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://www.univ-lorraine.fr", "aff_unique_abbr": "Google;UL", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Mountain View;Nancy", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;France" }, { "title": "A Tree-Based Method for Fast Repeated Sampling of Determinantal Point Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3607", "id": "3607", "author_site": "Jennifer Gillenwater, Alex Kulesza, Zelda Mariet, Sergei Vassilvitskii", "author": "Jennifer Gillenwater; Alex Kulesza; Zelda Mariet; Sergei Vassilvtiskii", "abstract": "It is often desirable in recommender systems and other information retrieval applications to provide diverse results, and determinantal point processes (DPPs) have become a popular way to capture the trade-off between the quality of individual results and the diversity of the overall set. However, sampling from a DPP is inherently expensive: if the underlying collection contains N items, then generating each DPP sample requires time linear in N following a one-time preprocessing phase. Additionally, results often need to be personalized to a user, but standard approaches to personalization invalidate the preprocessing, making personalized samples especially expensive. In this work we address both of these shortcomings. First, we propose a new algorithm for generating DPP samples in time logarithmic in N, following a slightly more expensive preprocessing phase. 
We then extend the algorithm to support arbitrary query-time feature weights, allowing us to generate samples customized to individual users while still retaining logarithmic runtime; experiments show our approach runs over 300 times faster than traditional DPP sampling on collections of 100,000 items for samples of size 10.", "bibtex": "@InProceedings{pmlr-v97-gillenwater19a,\n title = \t {A Tree-Based Method for Fast Repeated Sampling of Determinantal Point Processes},\n author = {Gillenwater, Jennifer and Kulesza, Alex and Mariet, Zelda and Vassilvtiskii, Sergei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2260--2268},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gillenwater19a/gillenwater19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gillenwater19a.html},\n abstract = \t {It is often desirable in recommender systems and other information retrieval applications to provide diverse results, and determinantal point processes (DPPs) have become a popular way to capture the trade-off between the quality of individual results and the diversity of the overall set. However, sampling from a DPP is inherently expensive: if the underlying collection contains N items, then generating each DPP sample requires time linear in N following a one-time preprocessing phase. Additionally, results often need to be personalized to a user, but standard approaches to personalization invalidate the preprocessing, making personalized samples especially expensive. In this work we address both of these shortcomings. First, we propose a new algorithm for generating DPP samples in time logarithmic in N, following a slightly more expensive preprocessing phase. 
We then extend the algorithm to support arbitrary query-time feature weights, allowing us to generate samples customized to individual users while still retaining logarithmic runtime; experiments show our approach runs over 300 times faster than traditional DPP sampling on collections of 100,000 items for samples of size 10.}\n}", "pdf": "http://proceedings.mlr.press/v97/gillenwater19a/gillenwater19a.pdf", "supp": "", "pdf_size": 520379, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4299776343326742147&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Google Research NYC; Google Research NYC; Massachusetts Institute of Technology + Google; Google Research NYC", "aff_domain": "google.com;google.com;csail.mit.edu;google.com", "email": "google.com;google.com;csail.mit.edu;google.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/gillenwater19a.html", "aff_unique_index": "0;0;1+0;0", "aff_unique_norm": "Google;Massachusetts Institute of Technology", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://web.mit.edu", "aff_unique_abbr": "Google Research;MIT", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "New York City;;Mountain View", "aff_country_unique_index": "0;0;0+0;0", "aff_country_unique": "United States" }, { "title": "A Wrapped Normal Distribution on Hyperbolic Space for Gradient-Based Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3795", "id": "3795", "author_site": "Yoshihiro Nagano, Shoichiro Yamaguchi, Yasuhiro Fujita, Masanori Koyama", "author": "Yoshihiro Nagano; Shoichiro Yamaguchi; Yasuhiro Fujita; Masanori Koyama", "abstract": "Hyperbolic space is a geometry that is known to be well-suited for representation learning of data with an underlying hierarchical structure. In this paper, we present a novel hyperbolic distribution called hyperbolic wrapped distribution, a wrapped normal distribution on hyperbolic space whose density can be evaluated analytically and differentiated with respect to the parameters. Our distribution enables the gradient-based learning of the probabilistic models on hyperbolic space that could never have been considered before. Also, we can sample from this hyperbolic probability distribution without resorting to auxiliary means like rejection sampling. As applications of our distribution, we develop a hyperbolic-analog of variational autoencoder and a method of probabilistic word embedding on hyperbolic space. We demonstrate the efficacy of our distribution on various datasets including MNIST, Atari 2600 Breakout, and WordNet.", "bibtex": "@InProceedings{pmlr-v97-nagano19a,\n title = \t {A Wrapped Normal Distribution on Hyperbolic Space for Gradient-Based Learning},\n author = {Nagano, Yoshihiro and Yamaguchi, Shoichiro and Fujita, Yasuhiro and Koyama, Masanori},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4693--4702},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nagano19a/nagano19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nagano19a.html},\n abstract = \t {Hyperbolic space is a geometry that is known to be well-suited for representation learning of data with an underlying hierarchical structure. 
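As context for the Nagano et al. abstract above: a wrapped normal on the hyperboloid model is typically constructed by (i) sampling \(v \sim \mathcal{N}(0, \Sigma)\) in the tangent space at the origin, (ii) parallel-transporting \(v\) to the tangent space at the mean \(\mu\), and (iii) pushing the result onto the manifold with the exponential map
\[ \exp_{\mu}(u) \;=\; \cosh(\lVert u\rVert_{\mathcal{L}})\,\mu \;+\; \sinh(\lVert u\rVert_{\mathcal{L}})\,\frac{u}{\lVert u\rVert_{\mathcal{L}}}, \]
with \(\lVert u\rVert_{\mathcal{L}}\) the Lorentzian norm; each step is differentiable and invertible, which is what permits an analytic density via the change-of-variables formula. This is a paraphrase of the standard construction rather than a quotation from the paper.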
In this paper, we present a novel hyperbolic distribution called hyperbolic wrapped distribution, a wrapped normal distribution on hyperbolic space whose density can be evaluated analytically and differentiated with respect to the parameters. Our distribution enables the gradient-based learning of the probabilistic models on hyperbolic space that could never have been considered before. Also, we can sample from this hyperbolic probability distribution without resorting to auxiliary means like rejection sampling. As applications of our distribution, we develop a hyperbolic-analog of variational autoencoder and a method of probabilistic word embedding on hyperbolic space. We demonstrate the efficacy of our distribution on various datasets including MNIST, Atari 2600 Breakout, and WordNet.}\n}", "pdf": "http://proceedings.mlr.press/v97/nagano19a/nagano19a.pdf", "supp": "", "pdf_size": 1357146, "gs_citation": 147, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11277639546038701066&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Complexity Science and Engineering, The University of Tokyo, Japan; Preferred Networks, Inc., Japan; Preferred Networks, Inc., Japan; Preferred Networks, Inc., Japan", "aff_domain": "mns.k.u-tokyo.ac.jp; ; ; ", "email": "mns.k.u-tokyo.ac.jp; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/nagano19a.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "University of Tokyo;Preferred Networks, Inc.", "aff_unique_dep": "Department of Complexity Science and Engineering;", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.preferred-networks.com", "aff_unique_abbr": "UTokyo;PFN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Japan" }, { "title": "A fully differentiable beam search decoder", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4328", "id": "4328", "author_site": "Ronan Collobert, Awni Hannun, Gabriel Synnaeve", "author": "Ronan Collobert; Awni Hannun; Gabriel Synnaeve", "abstract": "We introduce a new beam search decoder that is fully differentiable, making it possible to optimize at training time through the inference procedure. Our decoder allows us to combine models which operate at different granularities (e.g. acoustic and language models). It can be used when target sequences are not aligned to input sequences by considering all possible alignments between the two. We demonstrate our approach scales by applying it to speech recognition, jointly training acoustic and word-level language models. The system is end-to-end, with gradients flowing through the whole architecture from the word-level transcriptions. Recent research efforts have shown that deep neural networks with attention-based mechanisms can successfully train an acoustic model from the final transcription, while implicitly learning a language model. 
Instead, we show that it is possible to discriminatively train an acoustic model jointly with an", "bibtex": "@InProceedings{pmlr-v97-collobert19a,\n title = \t {A fully differentiable beam search decoder},\n author = {Collobert, Ronan and Hannun, Awni and Synnaeve, Gabriel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1341--1350},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/collobert19a/collobert19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/collobert19a.html},\n abstract = \t {We introduce a new beam search decoder that is fully differentiable, making it possible to optimize at training time through the inference procedure. Our decoder allows us to combine models which operate at different granularities (e.g. acoustic and language models). It can be used when target sequences are not aligned to input sequences by considering all possible alignments between the two. We demonstrate our approach scales by applying it to speech recognition, jointly training acoustic and word-level language models. The system is end-to-end, with gradients flowing through the whole architecture from the word-level transcriptions. Recent research efforts have shown that deep neural networks with attention-based mechanisms can successfully train an acoustic model from the final transcription, while implicitly learning a language model. Instead, we show that it is possible to discriminatively train an acoustic model jointly with an", "pdf": "http://proceedings.mlr.press/v97/collobert19a/collobert19a.pdf", "supp": "", "pdf_size": 375480, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4981009460390388636&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Facebook AI Research; Facebook AI Research; Facebook AI Research", "aff_domain": "fb.com;fb.com;fb.com", "email": "fb.com;fb.com;fb.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/collobert19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "ARSM: Augment-REINFORCE-Swap-Merge Estimator for Gradient Backpropagation Through Categorical Variables", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4166", "id": "4166", "author_site": "Mingzhang Yin, Yuguang Yue, Mingyuan Zhou", "author": "Mingzhang Yin; Yuguang Yue; Mingyuan Zhou", "abstract": "To address the challenge of backpropagating the gradient through categorical variables, we propose the augment-REINFORCE-swap-merge (ARSM) gradient estimator that is unbiased and has low variance. ARSM first uses variable augmentation, REINFORCE, and Rao-Blackwellization to re-express the gradient as an expectation under the Dirichlet distribution, then uses variable swapping to construct differently expressed but equivalent expectations, and finally shares common random numbers between these expectations to achieve significant variance reduction. 
Experimental results show ARSM closely resembles the performance of the true gradient for optimization in univariate settings; outperforms existing estimators by a large margin when applied to categorical variational auto-encoders; and provides a \"try-and-see self-critic\" variance reduction method for discrete-action policy gradient, which removes the need of estimating baselines by generating a random number of pseudo actions and estimating their action-value functions.", "bibtex": "@InProceedings{pmlr-v97-yin19c,\n title = \t {{ARSM}: Augment-{REINFORCE}-Swap-Merge Estimator for Gradient Backpropagation Through Categorical Variables},\n author = {Yin, Mingzhang and Yue, Yuguang and Zhou, Mingyuan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7095--7104},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yin19c/yin19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/yin19c.html},\n abstract = \t {To address the challenge of backpropagating the gradient through categorical variables, we propose the augment-REINFORCE-swap-merge (ARSM) gradient estimator that is unbiased and has low variance. ARSM first uses variable augmentation, REINFORCE, and Rao-Blackwellization to re-express the gradient as an expectation under the Dirichlet distribution, then uses variable swapping to construct differently expressed but equivalent expectations, and finally shares common random numbers between these expectations to achieve significant variance reduction. Experimental results show ARSM closely resembles the performance of the true gradient for optimization in univariate settings; outperforms existing estimators by a large margin when applied to categorical variational auto-encoders; and provides a \"try-and-see self-critic\" variance reduction method for discrete-action policy gradient, which removes the need of estimating baselines by generating a random number of pseudo actions and estimating their action-value functions.}\n}", "pdf": "http://proceedings.mlr.press/v97/yin19c/yin19c.pdf", "supp": "", "pdf_size": 1939980, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18117321206953712314&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Statistics and Data Sciences, The University of Texas at Austin; Department of Statistics and Data Sciences, The University of Texas at Austin; Department of IROM, McCombs School of Business, The University of Texas at Austin", "aff_domain": "mccombs.utexas.edu;mccombs.utexas.edu;mccombs.utexas.edu", "email": "mccombs.utexas.edu;mccombs.utexas.edu;mccombs.utexas.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/yin19c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "Department of Statistics and Data Sciences", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "AReS and MaRS Adversarial and MMD-Minimizing Regression for SDEs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3857", "id": "3857", "author_site": "Gabriele Abbati, Philippe 
Wenk, Michael A Osborne, Andreas Krause, Bernhard Sch\u00f6lkopf, Stefan Bauer", "author": "Gabriele Abbati; Philippe Wenk; Michael A. Osborne; Andreas Krause; Bernhard Sch\u00f6lkopf; Stefan Bauer", "abstract": "Stochastic differential equations are an important modeling class in many disciplines. Consequently, there exist many methods relying on various discretization and numerical integration schemes. In this paper, we propose a novel, probabilistic model for estimating the drift and diffusion given noisy observations of the underlying stochastic system. Using state-of-the-art adversarial and moment matching inference techniques, we avoid the discretization schemes of classical approaches. This leads to significant improvements in parameter accuracy and robustness given random initial guesses. On four established benchmark systems, we compare the performance of our algorithms to state-of-the-art solutions based on extended Kalman filtering and Gaussian processes.", "bibtex": "@InProceedings{pmlr-v97-abbati19a,\n title = \t {{AR}e{S} and {M}a{RS} Adversarial and {MMD}-Minimizing Regression for {SDE}s},\n author = {Abbati, Gabriele and Wenk, Philippe and Osborne, Michael A. and Krause, Andreas and Sch{\\\"o}lkopf, Bernhard and Bauer, Stefan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1--10},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/abbati19a/abbati19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/abbati19a.html},\n abstract = \t {Stochastic differential equations are an important modeling class in many disciplines. Consequently, there exist many methods relying on various discretization and numerical integration schemes. In this paper, we propose a novel, probabilistic model for estimating the drift and diffusion given noisy observations of the underlying stochastic system. Using state-of-the-art adversarial and moment matching inference techniques, we avoid the discretization schemes of classical approaches. This leads to significant improvements in parameter accuracy and robustness given random initial guesses. 
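As context for the Abbati et al. abstract above: the MMD criterion referenced in the title measures the discrepancy between two distributions \(P\) and \(Q\) through a kernel \(k\),
\[ \mathrm{MMD}^{2}(P, Q) \;=\; \mathbb{E}_{x,x'\sim P}[k(x,x')] \;-\; 2\,\mathbb{E}_{x\sim P,\,y\sim Q}[k(x,y)] \;+\; \mathbb{E}_{y,y'\sim Q}[k(y,y')], \]
and minimizing an empirical estimate of this quantity between simulated and observed trajectories is the generic mechanism behind "MMD-minimizing regression"; how the paper couples it to the drift and diffusion estimates is not spelled out in the abstract.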
On four established benchmark systems, we compare the performance of our algorithms to state-of-the-art solutions based on extended Kalman filtering and Gaussian processes.}\n}", "pdf": "http://proceedings.mlr.press/v97/abbati19a/abbati19a.pdf", "supp": "", "pdf_size": 459845, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17009642068403205784&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Department of Engineering Science, University of Oxford; Learning and Adaptive Systems Group, ETH Z\u00fcrich+Max Planck ETH Center for Learning Systems; Department of Engineering Science, University of Oxford; Learning and Adaptive Systems Group, ETH Z\u00fcrich; Empirical Inference Group, Max Planck Institute for Intelligent Systems; Empirical Inference Group, Max Planck Institute for Intelligent Systems", "aff_domain": "robots.ox.ac.uk;ethz.ch; ; ; ; ", "email": "robots.ox.ac.uk;ethz.ch; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/abbati19a.html", "aff_unique_index": "0;1+2;0;1;3;3", "aff_unique_norm": "University of Oxford;ETH Zurich;Max Planck ETH Center for Learning Systems;Max Planck Institute for Intelligent Systems", "aff_unique_dep": "Department of Engineering Science;Learning and Adaptive Systems Group;Center for Learning Systems;Empirical Inference Group", "aff_unique_url": "https://www.ox.ac.uk;https://www.ethz.ch;https://learning-systems.org;https://www.mpituebingen.mpg.de", "aff_unique_abbr": "Oxford;ETHZ;;MPI-IS", "aff_campus_unique_index": "0;;0", "aff_campus_unique": "Oxford;", "aff_country_unique_index": "0;1+1;0;1;2;2", "aff_country_unique": "United Kingdom;Switzerland;Germany" }, { "title": "AUC\u03bc: A Performance Metric for Multi-Class Machine Learning Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4252", "id": "4252", "author_site": "Ross Kleiman, University of Wisconsin David Page", "author": "Ross Kleiman; David Page", "abstract": "The area under the receiver operating characteristic curve (AUC) is arguably the most common metric in machine learning for assessing the quality of a two-class classification model. As the number and complexity of machine learning applications grows, so too does the need for measures that can gracefully extend to classification models trained for more than two classes. Prior work in this area has proven computationally intractable and/or inconsistent with known properties of AUC, and thus there is still a need for an improved multi-class efficacy metric. We provide in this work a multi-class extension of AUC that we call AUC{\\textmu} that is derived from first principles of the binary class AUC. 
AUC{\\textmu} has similar computational complexity to AUC and maintains the properties of AUC critical to its interpretation and use.", "bibtex": "@InProceedings{pmlr-v97-kleiman19a,\n title = \t {{AUC}{\\textmu}: A Performance Metric for Multi-Class Machine Learning Models},\n author = {Kleiman, Ross and Page, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3439--3447},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kleiman19a/kleiman19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kleiman19a.html},\n abstract = \t {The area under the receiver operating characteristic curve (AUC) is arguably the most common metric in machine learning for assessing the quality of a two-class classification model. As the number and complexity of machine learning applications grows, so too does the need for measures that can gracefully extend to classification models trained for more than two classes. Prior work in this area has proven computationally intractable and/or inconsistent with known properties of AUC, and thus there is still a need for an improved multi-class efficacy metric. We provide in this work a multi-class extension of AUC that we call AUC{\\textmu} that is derived from first principles of the binary class AUC. AUC{\\textmu} has similar computational complexity to AUC and maintains the properties of AUC critical to its interpretation and use.}\n}", "pdf": "http://proceedings.mlr.press/v97/kleiman19a/kleiman19a.pdf", "supp": "", "pdf_size": 601634, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17908827473747234465&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Sciences, University of Wisconsin - Madison; Department of Computer Sciences, University of Wisconsin - Madison + Department of Biostatistics and Medical Informatics, University of Wisconsin - Madison", "aff_domain": "cs.wisc.edu; ", "email": "cs.wisc.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/kleiman19a.html", "aff_unique_index": "0;0+0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "Department of Computer Sciences", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0+0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0+0", "aff_country_unique": "United States" }, { "title": "Accelerated Flow for Probability Distributions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3689", "id": "3689", "author_site": "Amirhossein Taghvaei, Prashant Mehta", "author": "Amirhossein Taghvaei; Prashant Mehta", "abstract": "This paper presents a methodology and numerical algorithms for constructing accelerated gradient flows on the space of probability distributions. In particular, we extend the recent variational formulation of accelerated methods in (Wibisono et al., 2016) from vector valued variables to probability distributions. The variational problem is modeled as a mean-field optimal control problem. A quantitative estimate on the asymptotic convergence rate is provided based on a Lyapunov function construction, when the objective functional is displacement convex. 
An important special case is considered where the objective functional is the relative entropy. For this case, two numerical approximations are presented to implement the Hamilton\u2019s equations as a system of N interacting particles. The algorithm is numerically illustrated and compared with the MCMC and Hamiltonian MCMC algorithms.", "bibtex": "@InProceedings{pmlr-v97-taghvaei19a,\n title = \t {Accelerated Flow for Probability Distributions},\n author = {Taghvaei, Amirhossein and Mehta, Prashant},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6076--6085},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/taghvaei19a/taghvaei19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/taghvaei19a.html},\n abstract = \t {This paper presents a methodology and numerical algorithms for constructing accelerated gradient flows on the space of probability distributions. In particular, we extend the recent variational formulation of accelerated methods in (Wibisono et al., 2016) from vector valued variables to probability distributions. The variational problem is modeled as a mean-field optimal control problem. A quantitative estimate on the asymptotic convergence rate is provided based on a Lyapunov function construction, when the objective functional is displacement convex. An important special case is considered where the objective functional is the relative entropy. For this case, two numerical approximations are presented to implement the Hamilton\u2019s equations as a system of N interacting particles. 
The algorithm is numerically illustrated and compared with the MCMC and Hamiltonian MCMC algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/taghvaei19a/taghvaei19a.pdf", "supp": "", "pdf_size": 689160, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9398552720489762966&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Mechanical Science and Engineering, Coordinated Science Laboratory, University of Illinois at Urbana-Champaign, Urbana, IL, USA; Department of Mechanical Science and Engineering, Coordinated Science Laboratory, University of Illinois at Urbana-Champaign, Urbana, IL, USA", "aff_domain": "illinois.edu; ", "email": "illinois.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/taghvaei19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "Department of Mechanical Science and Engineering", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Accelerated Linear Convergence of Stochastic Momentum Methods in Wasserstein Distances", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4211", "id": "4211", "author_site": "Bugra Can, Mert Gurbuzbalaban, Lingjiong Zhu", "author": "Bugra Can; Mert Gurbuzbalaban; Lingjiong Zhu", "abstract": "Momentum methods such as Polyak\u2019s heavy ball (HB) method, Nesterov\u2019s accelerated gradient (AG) as well as accelerated projected gradient (APG) method have been commonly used in machine learning practice, but their performance is quite sensitive to noise in the gradients. We study these methods under a first-order stochastic oracle model where noisy estimates of the gradients are available. For strongly convex problems, we show that the distribution of the iterates of AG converges with the accelerated $O(\\sqrt{\\kappa}\\log(1/\\varepsilon))$ linear rate to a ball of radius $\\varepsilon$ centered at a unique invariant distribution in the 1-Wasserstein metric where $\\kappa$ is the condition number as long as the noise variance is smaller than an explicit upper bound we can provide. Our analysis also certifies linear convergence rates as a function of the stepsize, momentum parameter and the noise variance; recovering the accelerated rates in the noiseless case and quantifying the level of noise that can be tolerated to achieve a given performance. To the best of our knowledge, these are the first linear convergence results for stochastic momentum methods under the stochastic oracle model. 
We also develop finer results for the special case of quadratic objectives, extend our results to the APG method and weakly convex functions showing accelerated rates when the noise magnitude is sufficiently small.", "bibtex": "@InProceedings{pmlr-v97-can19a,\n title = \t {Accelerated Linear Convergence of Stochastic Momentum Methods in {W}asserstein Distances},\n author = {Can, Bugra and Gurbuzbalaban, Mert and Zhu, Lingjiong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {891--901},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/can19a/can19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/can19a.html},\n abstract = \t {Momentum methods such as Polyak\u2019s heavy ball (HB) method, Nesterov\u2019s accelerated gradient (AG) as well as accelerated projected gradient (APG) method have been commonly used in machine learning practice, but their performance is quite sensitive to noise in the gradients. We study these methods under a first-order stochastic oracle model where noisy estimates of the gradients are available. For strongly convex problems, we show that the distribution of the iterates of AG converges with the accelerated $O(\\sqrt{\\kappa}\\log(1/\\varepsilon))$ linear rate to a ball of radius $\\varepsilon$ centered at a unique invariant distribution in the 1-Wasserstein metric where $\\kappa$ is the condition number as long as the noise variance is smaller than an explicit upper bound we can provide. Our analysis also certifies linear convergence rates as a function of the stepsize, momentum parameter and the noise variance; recovering the accelerated rates in the noiseless case and quantifying the level of noise that can be tolerated to achieve a given performance. To the best of our knowledge, these are the first linear convergence results for stochastic momentum methods under the stochastic oracle model. 
We also develop finer results for the special case of quadratic objectives, extend our results to the APG method and weakly convex functions showing accelerated rates when the noise magnitude is sufficiently small.}\n}", "pdf": "http://proceedings.mlr.press/v97/can19a/can19a.pdf", "supp": "", "pdf_size": 320562, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11235226321495471103&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Management Science and Information Systems, Rutgers Business School, Piscataway, NJ-08854, United States of America; Department of Management Science and Information Systems, Rutgers Business School, Piscataway, NJ-08854, United States of America; Department of Mathematics, Florida State University, 1017 Academic Way, Tallahassee, FL-32306, United States of America", "aff_domain": "rutgers.edu;rutgers.edu; ", "email": "rutgers.edu;rutgers.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/can19a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Rutgers Business School;Florida State University", "aff_unique_dep": "Department of Management Science and Information Systems;Department of Mathematics", "aff_unique_url": "https://business.rutgers.edu;https://www.fsu.edu", "aff_unique_abbr": "RBS;FSU", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Piscataway;Tallahassee", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Acceleration of SVRG and Katyusha X by Inexact Preconditioning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4233", "id": "4233", "author_site": "Yanli Liu, Fei Feng, Wotao Yin", "author": "Yanli Liu; Fei Feng; Wotao Yin", "abstract": "Empirical risk minimization is an important class of optimization problems with many popular machine learning applications, and stochastic variance reduction methods are popular choices for solving them. Among these methods, SVRG and Katyusha X (a Nesterov accelerated SVRG) achieve fast convergence without substantial memory requirement. In this paper, we propose to accelerate these two algorithms by", "bibtex": "@InProceedings{pmlr-v97-liu19a,\n title = \t {Acceleration of {SVRG} and {K}atyusha X by Inexact Preconditioning},\n author = {Liu, Yanli and Feng, Fei and Yin, Wotao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4003--4012},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liu19a/liu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/liu19a.html},\n abstract = \t {Empirical risk minimization is an important class of optimization problems with many popular machine learning applications, and stochastic variance reduction methods are popular choices for solving them. Among these methods, SVRG and Katyusha X (a Nesterov accelerated SVRG) achieve fast convergence without substantial memory requirement. 
In this paper, we propose to accelerate these two algorithms by", "pdf": "http://proceedings.mlr.press/v97/liu19a/liu19a.pdf", "supp": "", "pdf_size": 1084625, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13059368819279986289&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Department of Mathematics, University of California, Los Angeles, Los Angeles, CA, USA; Department of Mathematics, University of California, Los Angeles, Los Angeles, CA, USA; Department of Mathematics, University of California, Los Angeles, Los Angeles, CA, USA", "aff_domain": "math.ucla.edu; ; ", "email": "math.ucla.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/liu19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "Department of Mathematics", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Action Robust Reinforcement Learning and Applications in Continuous Control", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3585", "id": "3585", "author_site": "Chen Tessler, Yonathan Efroni, Shie Mannor", "author": "Chen Tessler; Yonathan Efroni; Shie Mannor", "abstract": "A policy is said to be robust if it maximizes the reward while considering a bad, or even adversarial, model. In this work we formalize two new criteria of robustness to action uncertainty. Specifically, we consider two scenarios in which the agent attempts to perform an action $a$, and (i) with probability $\\alpha$, an alternative adversarial action $\\bar{a}$ is taken, or (ii) an adversary adds a perturbation to the selected action in the case of continuous action space. We show that our criteria are related to common forms of uncertainty in robotics domains, such as the occurrence of abrupt forces, and suggest algorithms in the tabular case. Building on the suggested algorithms, we generalize our approach to deep reinforcement learning (DRL) and provide extensive experiments in the various MuJoCo domains. Our experiments show that not only does our approach produce robust policies, but it also improves the performance in the absence of perturbations. This generalization indicates that action-robustness can be thought of as implicit regularization in RL problems.", "bibtex": "@InProceedings{pmlr-v97-tessler19a,\n title = \t {Action Robust Reinforcement Learning and Applications in Continuous Control},\n author = {Tessler, Chen and Efroni, Yonathan and Mannor, Shie},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6215--6224},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tessler19a/tessler19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tessler19a.html},\n abstract = \t {A policy is said to be robust if it maximizes the reward while considering a bad, or even adversarial, model. In this work we formalize two new criteria of robustness to action uncertainty. 
Specifically, we consider two scenarios in which the agent attempts to perform an action $a$, and (i) with probability $\\alpha$, an alternative adversarial action $\\bar{a}$ is taken, or (ii) an adversary adds a perturbation to the selected action in the case of continuous action space. We show that our criteria are related to common forms of uncertainty in robotics domains, such as the occurrence of abrupt forces, and suggest algorithms in the tabular case. Building on the suggested algorithms, we generalize our approach to deep reinforcement learning (DRL) and provide extensive experiments in the various MuJoCo domains. Our experiments show that not only does our approach produce robust policies, but it also improves the performance in the absence of perturbations. This generalization indicates that action-robustness can be thought of as implicit regularization in RL problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/tessler19a/tessler19a.pdf", "supp": "", "pdf_size": 434964, "gs_citation": 284, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11381455221391781930&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Electrical Engineering, Technion Institute of Technology, Haifa, Israel; Department of Electrical Engineering, Technion Institute of Technology, Haifa, Israel; Department of Electrical Engineering, Technion Institute of Technology, Haifa, Israel", "aff_domain": "campus.technion.ac.il;campus.technion.ac.il; ", "email": "campus.technion.ac.il;campus.technion.ac.il; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/tessler19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Technion Institute of Technology", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.technion.ac.il", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Haifa", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "title": "Active Embedding Search via Noisy Paired Comparisons", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4212", "id": "4212", "author_site": "Gregory Canal, Andy Massimino, Mark Davenport, Christopher Rozell", "author": "Gregory Canal; Andy Massimino; Mark Davenport; Christopher Rozell", "abstract": "Suppose that we wish to estimate a user\u2019s preference vector $w$ from paired comparisons of the form \u201cdoes user $w$ prefer item $p$ or item $q$?,\u201d where both the user and items are embedded in a low-dimensional Euclidean space with distances that reflect user and item similarities. Such observations arise in numerous settings, including psychometrics and psychology experiments, search tasks, advertising, and recommender systems. In such tasks, queries can be extremely costly and subject to varying levels of response noise; thus, we aim to actively choose pairs that are most informative given the results of previous comparisons. We provide new theoretical insights into the benefits and challenges of greedy information maximization in this setting, and develop two novel strategies that maximize lower bounds on information gain and are simpler to analyze and compute respectively. 
We use simulated responses from a real-world dataset to validate our strategies through their similar performance to greedy information maximization, and their superior preference estimation over state-of-the-art selection methods as well as random queries.", "bibtex": "@InProceedings{pmlr-v97-canal19a,\n title = \t {Active Embedding Search via Noisy Paired Comparisons},\n author = {Canal, Gregory and Massimino, Andy and Davenport, Mark and Rozell, Christopher},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {902--911},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/canal19a/canal19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/canal19a.html},\n abstract = \t {Suppose that we wish to estimate a user\u2019s preference vector $w$ from paired comparisons of the form \u201cdoes user $w$ prefer item $p$ or item $q$?,\u201d where both the user and items are embedded in a low-dimensional Euclidean space with distances that reflect user and item similarities. Such observations arise in numerous settings, including psychometrics and psychology experiments, search tasks, advertising, and recommender systems. In such tasks, queries can be extremely costly and subject to varying levels of response noise; thus, we aim to actively choose pairs that are most informative given the results of previous comparisons. We provide new theoretical insights into the benefits and challenges of greedy information maximization in this setting, and develop two novel strategies that maximize lower bounds on information gain and are simpler to analyze and compute respectively. 
We use simulated responses from a real-world dataset to validate our strategies through their similar performance to greedy information maximization, and their superior preference estimation over state-of-the-art selection methods as well as random queries.}\n}", "pdf": "http://proceedings.mlr.press/v97/canal19a/canal19a.pdf", "supp": "", "pdf_size": 4656986, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10123441327203003064&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta, Georgia, United States; School of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta, Georgia, United States; School of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta, Georgia, United States; School of Electrical and Computer Engineering, Georgia Institute of Technology, Atlanta, Georgia, United States", "aff_domain": "gatech.edu; ; ; ", "email": "gatech.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/canal19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "School of Electrical and Computer Engineering", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Atlanta", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Active Learning for Decision-Making from Imbalanced Observational Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4171", "id": "4171", "author_site": "Iiris Sundin, Peter Schulam, Eero Siivola, Aki Vehtari, Suchi Saria, Samuel Kaski", "author": "Iiris Sundin; Peter Schulam; Eero Siivola; Aki Vehtari; Suchi Saria; Samuel Kaski", "abstract": "Machine learning can help personalized decision support by learning models to predict individual treatment effects (ITE). This work studies the reliability of prediction-based decision-making in a task of deciding which action $a$ to take for a target unit after observing its covariates $\\tilde{x}$ and predicted outcomes $\\hat{p}(\\tilde{y} \\mid \\tilde{x}, a)$. An example case is personalized medicine and the decision of which treatment to give to a patient. A common problem when learning these models from observational data is imbalance, that is, difference in treated/control covariate distributions, which is known to increase the upper bound of the expected ITE estimation error. We propose to assess the decision-making reliability by estimating the ITE model\u2019s Type S error rate, which is the probability of the model inferring the sign of the treatment effect wrong. Furthermore, we use the estimated reliability as a criterion for active learning, in order to collect new (possibly expensive) observations, instead of making a forced choice based on unreliable predictions. 
We demonstrate the effectiveness of this decision-making aware active learning in two decision-making tasks: in simulated data with binary outcomes and in a medical dataset with synthetic and continuous treatment outcomes.", "bibtex": "@InProceedings{pmlr-v97-sundin19a,\n title = \t {Active Learning for Decision-Making from Imbalanced Observational Data},\n author = {Sundin, Iiris and Schulam, Peter and Siivola, Eero and Vehtari, Aki and Saria, Suchi and Kaski, Samuel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6046--6055},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/sundin19a/sundin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/sundin19a.html},\n abstract = \t {Machine learning can help personalized decision support by learning models to predict individual treatment effects (ITE). This work studies the reliability of prediction-based decision-making in a task of deciding which action $a$ to take for a target unit after observing its covariates $\\tilde{x}$ and predicted outcomes $\\hat{p}(\\tilde{y} \\mid \\tilde{x}, a)$. An example case is personalized medicine and the decision of which treatment to give to a patient. A common problem when learning these models from observational data is imbalance, that is, difference in treated/control covariate distributions, which is known to increase the upper bound of the expected ITE estimation error. We propose to assess the decision-making reliability by estimating the ITE model\u2019s Type S error rate, which is the probability of the model inferring the sign of the treatment effect wrong. Furthermore, we use the estimated reliability as a criterion for active learning, in order to collect new (possibly expensive) observations, instead of making a forced choice based on unreliable predictions. We demonstrate the effectiveness of this decision-making aware active learning in two decision-making tasks: in simulated data with binary outcomes and in a medical dataset with synthetic and continuous treatment outcomes.}\n}", "pdf": "http://proceedings.mlr.press/v97/sundin19a/sundin19a.pdf", "supp": "", "pdf_size": 617427, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11068950716129063345&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/sundin19a.html" }, { "title": "Active Learning for Probabilistic Structured Prediction of Cuts and Matchings", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3830", "id": "3830", "author_site": "Sima Behpour, Anqi Liu, Brian Ziebart", "author": "Sima Behpour; Anqi Liu; Brian Ziebart", "abstract": "Active learning methods, like uncertainty sampling, combined with probabilistic prediction techniques have achieved success in various problems like image classification and text classification. For more complex multivariate prediction tasks, the relationships between labels play an important role in designing structured classifiers with better performance. However, computational time complexity limits prevalent probabilistic methods from effectively supporting active learning. 
Specifically, while non-probabilistic methods based on structured support vector machines can be tractably applied to predicting cuts and bipartite matchings, conditional random fields are intractable for these structures. We propose an adversarial approach for active learning with structured prediction domains that is tractable for cuts and matching. We evaluate this approach algorithmically in two important structured prediction problems: multi-label classification and object tracking in videos. We demonstrate better accuracy and computational efficiency for our proposed method.", "bibtex": "@InProceedings{pmlr-v97-behpour19a,\n title = \t {Active Learning for Probabilistic Structured Prediction of Cuts and Matchings},\n author = {Behpour, Sima and Liu, Anqi and Ziebart, Brian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {563--572},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/behpour19a/behpour19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/behpour19a.html},\n abstract = \t {Active learning methods, like uncertainty sampling, combined with probabilistic prediction techniques have achieved success in various problems like image classification and text classification. For more complex multivariate prediction tasks, the relationships between labels play an important role in designing structured classifiers with better performance. However, computational time complexity limits prevalent probabilistic methods from effectively supporting active learning. Specifically, while non-probabilistic methods based on structured support vector machines can be tractably applied to predicting cuts and bipartite matchings, conditional random fields are intractable for these structures. We propose an adversarial approach for active learning with structured prediction domains that is tractable for cuts and matching. We evaluate this approach algorithmically in two important structured prediction problems: multi-label classification and object tracking in videos. 
We demonstrate better accuracy and computational efficiency for our proposed method.}\n}", "pdf": "http://proceedings.mlr.press/v97/behpour19a/behpour19a.pdf", "supp": "", "pdf_size": 2899312, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1060553941145361236&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of Pennsylvania + Department of Computer Science, University of Illinois at Chicago; California Institute of Technology; Department of Computer Science, University of Illinois at Chicago", "aff_domain": "seas.upenn.edu; ; ", "email": "seas.upenn.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/behpour19a.html", "aff_unique_index": "0+1;2;1", "aff_unique_norm": "University of Pennsylvania;University of Illinois at Chicago;California Institute of Technology", "aff_unique_dep": ";Department of Computer Science;", "aff_unique_url": "https://www.upenn.edu;https://www.uic.edu;https://www.caltech.edu", "aff_unique_abbr": "UPenn;UIC;Caltech", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Chicago;Pasadena", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "Active Learning with Disagreement Graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4190", "id": "4190", "author_site": "Corinna Cortes, Giulia DeSalvo, Mehryar Mohri, Ningshan Zhang, Claudio Gentile", "author": "Corinna Cortes; Giulia Desalvo; Mehryar Mohri; Ningshan Zhang; Claudio Gentile", "abstract": "We present two novel enhancements of an online importance-weighted active learning algorithm IWAL, using the properties of disagreements among hypotheses. The first enhancement, IWALD, prunes the hypothesis set with a more aggressive strategy based on the disagreement graph. We show that IWAL-D improves the generalization performance and the label complexity of the original IWAL, and quantify the improvement in terms of the disagreement graph coefficient. The second enhancement, IZOOM, further improves IWAL-D by adaptively zooming into the current version space and thus reducing the best-in-class error. We show that IZOOM admits favorable theoretical guarantees with the changing hypothesis set. We report experimental results on multiple datasets and demonstrate that the proposed algorithms achieve better test performances than IWAL given the same amount of labeling budget.", "bibtex": "@InProceedings{pmlr-v97-cortes19b,\n title = \t {Active Learning with Disagreement Graphs},\n author = {Cortes, Corinna and Desalvo, Giulia and Mohri, Mehryar and Zhang, Ningshan and Gentile, Claudio},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1379--1387},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cortes19b/cortes19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/cortes19b.html},\n abstract = \t {We present two novel enhancements of an online importance-weighted active learning algorithm IWAL, using the properties of disagreements among hypotheses. The first enhancement, IWALD, prunes the hypothesis set with a more aggressive strategy based on the disagreement graph. 
We show that IWAL-D improves the generalization performance and the label complexity of the original IWAL, and quantify the improvement in terms of the disagreement graph coefficient. The second enhancement, IZOOM, further improves IWAL-D by adaptively zooming into the current version space and thus reducing the best-in-class error. We show that IZOOM admits favorable theoretical guarantees with the changing hypothesis set. We report experimental results on multiple datasets and demonstrate that the proposed algorithms achieve better test performances than IWAL given the same amount of labeling budget.}\n}", "pdf": "http://proceedings.mlr.press/v97/cortes19b/cortes19b.pdf", "supp": "", "pdf_size": 694620, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4283441487995347039&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Google Research, New York, NY, USA; Google Research, New York, NY, USA; Google Research, New York, NY, USA; Google Research, New York, NY, USA + Courant Institute, New York, NY, USA; Leonard N. Stern School of Business, New York University, New York, NY, USA", "aff_domain": "google.com;google.com;google.com;cs.nyu.edu;stern.nyu.edu", "email": "google.com;google.com;google.com;cs.nyu.edu;stern.nyu.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/cortes19b.html", "aff_unique_index": "0;0;0;0+1;2", "aff_unique_norm": "Google;Courant Institute of Mathematical Sciences;New York University", "aff_unique_dep": "Google Research;;Leonard N. Stern School of Business", "aff_unique_url": "https://research.google;https://courant.nyu.edu;https://www.nyu.edu", "aff_unique_abbr": "Google Research;Courant;NYU", "aff_campus_unique_index": "0;0;0;0+0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0;0;0+0;0", "aff_country_unique": "United States" }, { "title": "Active Manifolds: A non-linear analogue to Active Subspaces", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4239", "id": "4239", "author_site": "Robert Bridges, Anthony Gruber, Christopher Felder, Miki Verma, Chelsey Hoff", "author": "Robert Bridges; Anthony Gruber; Christopher Felder; Miki Verma; Chelsey Hoff", "abstract": "We present an approach to analyze $C^1(\\mathbb{R}^m)$ functions that addresses limitations present in the Active Subspaces (AS) method of Constantine et al. (2014; 2015). Under appropriate hypotheses, our Active Manifolds (AM) method identifies a 1-D curve in the domain (the active manifold) on which nearly all values of the unknown function are attained, which can be exploited for approximation or analysis, especially when $m$ is large (high-dimensional input space). We provide theorems justifying our AM technique and an algorithm permitting functional approximation and sensitivity analysis. Using accessible, low-dimensional functions as initial examples, we show AM reduces approximation error by an order of magnitude compared to AS, at the expense of more computation. Following this, we revisit the sensitivity analysis by Glaws et al. (2017), who apply AS to analyze a magnetohydrodynamic power generator model, and compare the performance of AM on the same data. Our analysis provides detailed information not captured by AS, exhibiting the influence of each parameter individually along an active manifold. 
Overall, AM represents a novel technique for analyzing functional models with benefits including: reducing $m$-dimensional analysis to a 1-D analogue, permitting more accurate regression than AS (at more computational expense), enabling more informative sensitivity analysis, and granting accessible visualizations (2-D plots) of parameter sensitivity along the AM.", "bibtex": "@InProceedings{pmlr-v97-bridges19a,\n title = \t {Active Manifolds: A non-linear analogue to Active Subspaces},\n author = {Bridges, Robert and Gruber, Anthony and Felder, Christopher and Verma, Miki and Hoff, Chelsey},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {764--772},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bridges19a/bridges19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bridges19a.html},\n abstract = \t {We present an approach to analyze $C^1(\\mathbb{R}^m)$ functions that addresses limitations present in the Active Subspaces (AS) method of Constantine et al. (2014; 2015). Under appropriate hypotheses, our Active Manifolds (AM) method identifies a 1-D curve in the domain (the active manifold) on which nearly all values of the unknown function are attained, which can be exploited for approximation or analysis, especially when $m$ is large (high-dimensional input space). We provide theorems justifying our AM technique and an algorithm permitting functional approximation and sensitivity analysis. Using accessible, low-dimensional functions as initial examples, we show AM reduces approximation error by an order of magnitude compared to AS, at the expense of more computation. Following this, we revisit the sensitivity analysis by Glaws et al. (2017), who apply AS to analyze a magnetohydrodynamic power generator model, and compare the performance of AM on the same data. Our analysis provides detailed information not captured by AS, exhibiting the influence of each parameter individually along an active manifold. Overall, AM represents a novel technique for analyzing functional models with benefits including: reducing $m$-dimensional analysis to a 1-D analogue, permitting more accurate regression than AS (at more computational expense), enabling more informative sensitivity analysis, and granting accessible visualizations (2-D plots) of parameter sensitivity along the AM.}\n}", "pdf": "http://proceedings.mlr.press/v97/bridges19a/bridges19a.pdf", "supp": "", "pdf_size": 8677889, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12766453925065005709&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Cyber & Applied Data Analytics Division, Oak Ridge National Laboratory; Department of Mathematics, Texas Tech University; Department of Mathematics and Statistics, Washington University in St. Louis; Cyber & Applied Data Analytics Division, Oak Ridge National Laboratory; Cyber & Applied Data Analytics Division, Oak Ridge National Laboratory", "aff_domain": "ornl.gov; ; ; ; ", "email": "ornl.gov; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/bridges19a.html", "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "Oak Ridge National Laboratory;Texas Tech University;Washington University in St. 
Louis", "aff_unique_dep": "Cyber & Applied Data Analytics Division;Department of Mathematics;Department of Mathematics and Statistics", "aff_unique_url": "https://www.ornl.gov;https://www.ttu.edu;https://wustl.edu", "aff_unique_abbr": "ORNL;TTU;WUSTL", "aff_campus_unique_index": "1", "aff_campus_unique": ";St. Louis", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Actor-Attention-Critic for Multi-Agent Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4199", "id": "4199", "author_site": "Shariq Iqbal, Fei Sha", "author": "Shariq Iqbal; Fei Sha", "abstract": "Reinforcement learning in multi-agent scenarios is important for real-world applications but presents challenges beyond those seen in single-agent settings. We present an actor-critic algorithm that trains decentralized policies in multi-agent settings, using centrally computed critics that share an attention mechanism which selects relevant information for each agent at every timestep. This attention mechanism enables more effective and scalable learning in complex multi-agent environments, when compared to recent approaches. Our approach is applicable not only to cooperative settings with shared rewards, but also individualized reward settings, including adversarial settings, as well as settings that do not provide global states, and it makes no assumptions about the action spaces of the agents. As such, it is flexible enough to be applied to most multi-agent learning problems.", "bibtex": "@InProceedings{pmlr-v97-iqbal19a,\n title = \t {Actor-Attention-Critic for Multi-Agent Reinforcement Learning},\n author = {Iqbal, Shariq and Sha, Fei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2961--2970},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/iqbal19a/iqbal19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/iqbal19a.html},\n abstract = \t {Reinforcement learning in multi-agent scenarios is important for real-world applications but presents challenges beyond those seen in single-agent settings. We present an actor-critic algorithm that trains decentralized policies in multi-agent settings, using centrally computed critics that share an attention mechanism which selects relevant information for each agent at every timestep. This attention mechanism enables more effective and scalable learning in complex multi-agent environments, when compared to recent approaches. Our approach is applicable not only to cooperative settings with shared rewards, but also individualized reward settings, including adversarial settings, as well as settings that do not provide global states, and it makes no assumptions about the action spaces of the agents. 
As such, it is flexible enough to be applied to most multi-agent learning problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/iqbal19a/iqbal19a.pdf", "supp": "", "pdf_size": 777015, "gs_citation": 1031, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=241844530313281803&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, University of Southern California; Department of Computer Science, University of Southern California + On leave at Google AI", "aff_domain": "usc.edu;google.com", "email": "usc.edu;google.com", "github": "https://github.com/shariqiqbal2810/MAAC", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/iqbal19a.html", "aff_unique_index": "0;0+1", "aff_unique_norm": "University of Southern California;Google", "aff_unique_dep": "Department of Computer Science;Google AI", "aff_unique_url": "https://www.usc.edu;https://ai.google", "aff_unique_abbr": "USC;Google AI", "aff_campus_unique_index": "0;0+1", "aff_campus_unique": "Los Angeles;Mountain View", "aff_country_unique_index": "0;0+0", "aff_country_unique": "United States" }, { "title": "AdaGrad Stepsizes: Sharp Convergence Over Nonconvex Landscapes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4063", "id": "4063", "author_site": "Rachel Ward, Xiaoxia Wu, Leon Bottou", "author": "Rachel Ward; Xiaoxia Wu; Leon Bottou", "abstract": "Adaptive gradient methods such as AdaGrad and its variants update the stepsize in stochastic gradient descent on the fly according to the gradients received along the way; such methods have gained widespread use in large-scale optimization for their ability to converge robustly, without the need to fine-tune parameters such as the stepsize schedule. Yet, the theoretical guarantees to date for AdaGrad are for online and convex optimization. We bridge this gap by providing strong theoretical guarantees for the convergence of AdaGrad over smooth, nonconvex landscapes. We show that the norm version of AdaGrad (AdaGrad-Norm) converges to a stationary point at the $\\mathcal{O}(\\log(N)/\\sqrt{N})$ rate in the stochastic setting, and at the optimal $\\mathcal{O}(1/N)$ rate in the batch (non-stochastic) setting \u2013 in this sense, our convergence guarantees are \u201csharp\u201d. In particular, both our theoretical results and extensive numerical experiments imply that AdaGrad-Norm is robust to the", "bibtex": "@InProceedings{pmlr-v97-ward19a,\n title = \t {{A}da{G}rad Stepsizes: Sharp Convergence Over Nonconvex Landscapes},\n author = {Ward, Rachel and Wu, Xiaoxia and Bottou, Leon},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6677--6686},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ward19a/ward19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ward19a.html},\n abstract = \t {Adaptive gradient methods such as AdaGrad and its variants update the stepsize in stochastic gradient descent on the fly according to the gradients received along the way; such methods have gained widespread use in large-scale optimization for their ability to converge robustly, without the need to fine-tune parameters such as the stepsize schedule. Yet, the theoretical guarantees to date for AdaGrad are for online and convex optimization. 
We bridge this gap by providing strong theoretical guarantees for the convergence of AdaGrad over smooth, nonconvex landscapes. We show that the norm version of AdaGrad (AdaGrad-Norm) converges to a stationary point at the $\\mathcal{O}(\\log(N)/\\sqrt{N})$ rate in the stochastic setting, and at the optimal $\\mathcal{O}(1/N)$ rate in the batch (non-stochastic) setting \u2013 in this sense, our convergence guarantees are \u201csharp\u201d. In particular, both our theoretical results and extensive numerical experiments imply that AdaGrad-Norm is robust to the", "pdf": "http://proceedings.mlr.press/v97/ward19a/ward19a.pdf", "supp": "", "pdf_size": 2126303, "gs_citation": 420, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9803524884611246420&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Mathematics, The University of Texas at Austin, USA + Facebook AI Research, New York, USA; Department of Mathematics, The University of Texas at Austin, USA + Facebook AI Research, New York, USA; Facebook AI Research, New York, USA", "aff_domain": "math.utexas.edu;math.utexas.edu; ", "email": "math.utexas.edu;math.utexas.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ward19a.html", "aff_unique_index": "0+1;0+1;1", "aff_unique_norm": "University of Texas at Austin;Meta", "aff_unique_dep": "Department of Mathematics;Facebook AI Research", "aff_unique_url": "https://www.utexas.edu;https://research.facebook.com", "aff_unique_abbr": "UT Austin;FAIR", "aff_campus_unique_index": "0+1;0+1;1", "aff_campus_unique": "Austin;New York", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Adaptive Antithetic Sampling for Variance Reduction", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4125", "id": "4125", "author_site": "Hongyu Ren, Shengjia Zhao, Stefano Ermon", "author": "Hongyu Ren; Shengjia Zhao; Stefano Ermon", "abstract": "Variance reduction is crucial in stochastic estimation and optimization problems. Antithetic sampling reduces the variance of a Monte Carlo estimator by drawing correlated, rather than independent, samples. However, designing an effective correlation structure is challenging and application specific, thus limiting the practical applicability of these methods. In this paper, we propose a general-purpose adaptive antithetic sampling framework. We provide gradient-based and gradient-free methods to train the samplers such that they reduce variance while ensuring that the underlying Monte Carlo estimator is provably unbiased. We demonstrate the effectiveness of our approach on Bayesian inference and generative model training, where it reduces variance and improves task performance with little computational overhead.", "bibtex": "@InProceedings{pmlr-v97-ren19b,\n title = \t {Adaptive Antithetic Sampling for Variance Reduction},\n author = {Ren, Hongyu and Zhao, Shengjia and Ermon, Stefano},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5420--5428},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ren19b/ren19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/ren19b.html},\n abstract = \t {Variance reduction is crucial in stochastic estimation and optimization problems. 
Antithetic sampling reduces the variance of a Monte Carlo estimator by drawing correlated, rather than independent, samples. However, designing an effective correlation structure is challenging and application specific, thus limiting the practical applicability of these methods. In this paper, we propose a general-purpose adaptive antithetic sampling framework. We provide gradient-based and gradient-free methods to train the samplers such that they reduce variance while ensuring that the underlying Monte Carlo estimator is provably unbiased. We demonstrate the effectiveness of our approach on Bayesian inference and generative model training, where it reduces variance and improves task performance with little computational overhead.}\n}", "pdf": "http://proceedings.mlr.press/v97/ren19b/ren19b.pdf", "supp": "", "pdf_size": 0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5147955111658944738&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;", "aff_domain": ";;", "email": ";;", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ren19b.html" }, { "title": "Adaptive Monte Carlo Multiple Testing via Multi-Armed Bandits", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3590", "id": "3590", "author_site": "Martin Zhang, James Zou, David Tse", "author": "Martin Zhang; James Zou; David Tse", "abstract": "Monte Carlo (MC) permutation test is considered the gold standard for statistical hypothesis testing, especially when standard parametric assumptions are not clear or likely to fail. However, in modern data science settings where a large number of hypothesis tests need to be performed simultaneously, it is rarely used due to its prohibitive computational cost. In genome-wide association studies, for example, the number of hypothesis tests $m$ is around $10^6$ while the number of MC samples $n$ for each test could be greater than $10^8$, totaling more than $nm$=$10^{14}$ samples. In this paper, we propose \\texttt{A}daptive \\texttt{M}C multiple \\texttt{T}esting (\\texttt{AMT}) to estimate MC p-values and control false discovery rate in multiple testing. The algorithm outputs the same result as the standard full MC approach with high probability while requiring only $\\tilde{O}(\\sqrt{n}m)$ samples. This sample complexity is shown to be optimal. On a Parkinson GWAS dataset, the algorithm reduces the running time from 2 months for full MC to an hour. The \\texttt{AMT} algorithm is derived based on the theory of multi-armed bandits.", "bibtex": "@InProceedings{pmlr-v97-zhang19t,\n title = \t {Adaptive {M}onte {C}arlo Multiple Testing via Multi-Armed Bandits},\n author = {Zhang, Martin and Zou, James and Tse, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7512--7522},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19t/zhang19t.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19t.html},\n abstract = \t {Monte Carlo (MC) permutation test is considered the gold standard for statistical hypothesis testing, especially when standard parametric assumptions are not clear or likely to fail. 
However, in modern data science settings where a large number of hypothesis tests need to be performed simultaneously, it is rarely used due to its prohibitive computational cost. In genome-wide association studies, for example, the number of hypothesis tests $m$ is around $10^6$ while the number of MC samples $n$ for each test could be greater than $10^8$, totaling more than $nm$=$10^{14}$ samples. In this paper, we propose \\texttt{A}daptive \\texttt{M}C multiple \\texttt{T}esting (\\texttt{AMT}) to estimate MC p-values and control false discovery rate in multiple testing. The algorithm outputs the same result as the standard full MC approach with high probability while requiring only $\\tilde{O}(\\sqrt{n}m)$ samples. This sample complexity is shown to be optimal. On a Parkinson GWAS dataset, the algorithm reduces the running time from 2 months for full MC to an hour. The \\texttt{AMT} algorithm is derived based on the theory of multi-armed bandits.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19t/zhang19t.pdf", "supp": "", "pdf_size": 1918550, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17419761528871683302&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "aff": ";;", "aff_domain": ";;", "email": ";;", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zhang19t.html" }, { "title": "Adaptive Neural Trees", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3645", "id": "3645", "author_site": "Ryutaro Tanno, Kai Arulkumaran, Daniel Alexander, Antonio Criminisi, Aditya Nori", "author": "Ryutaro Tanno; Kai Arulkumaran; Daniel Alexander; Antonio Criminisi; Aditya Nori", "abstract": "Deep neural networks and decision trees operate on largely separate paradigms; typically, the former performs representation learning with pre-specified architectures, while the latter is characterised by learning hierarchies over pre-specified features with data-driven architectures. We unite the two via adaptive neural trees (ANTs), a model that incorporates representation learning into edges, routing functions and leaf nodes of a decision tree, along with a backpropagation-based training algorithm that adaptively grows the architecture from primitive modules (e.g., convolutional layers). We demonstrate that, whilst achieving competitive performance on classification and regression datasets, ANTs benefit from (i) lightweight inference via conditional computation, (ii) hierarchical separation of features useful to the predictive task e.g. learning meaningful class associations, such as separating natural vs. 
man-made objects, and (iii) a mechanism to adapt the architecture to the size and complexity of the training dataset.", "bibtex": "@InProceedings{pmlr-v97-tanno19a,\n title = \t {Adaptive Neural Trees},\n author = {Tanno, Ryutaro and Arulkumaran, Kai and Alexander, Daniel and Criminisi, Antonio and Nori, Aditya},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6166--6175},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tanno19a/tanno19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tanno19a.html},\n abstract = \t {Deep neural networks and decision trees operate on largely separate paradigms; typically, the former performs representation learning with pre-specified architectures, while the latter is characterised by learning hierarchies over pre-specified features with data-driven architectures. We unite the two via adaptive neural trees (ANTs), a model that incorporates representation learning into edges, routing functions and leaf nodes of a decision tree, along with a backpropagation-based training algorithm that adaptively grows the architecture from primitive modules (e.g., convolutional layers). We demonstrate that, whilst achieving competitive performance on classification and regression datasets, ANTs benefit from (i) lightweight inference via conditional computation, (ii) hierarchical separation of features useful to the predictive task e.g. learning meaningful class associations, such as separating natural vs. man-made objects, and (iii) a mechanism to adapt the architecture to the size and complexity of the training dataset.}\n}", "pdf": "http://proceedings.mlr.press/v97/tanno19a/tanno19a.pdf", "supp": "", "pdf_size": 981043, "gs_citation": 221, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10252139245277017232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "University College London, UK; Imperial College London, UK; University College London, UK + Microsoft Research, Cambridge, UK; Microsoft Research, Cambridge, UK; Microsoft Research, Cambridge, UK", "aff_domain": "ucl.ac.uk; ; ; ; ", "email": "ucl.ac.uk; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/tanno19a.html", "aff_unique_index": "0;1;0+2;2;2", "aff_unique_norm": "University College London;Imperial College London;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.ucl.ac.uk;https://www.imperial.ac.uk;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "UCL;ICL;MSR", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0+0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Adaptive Regret of Convex and Smooth Functions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3815", "id": "3815", "author_site": "Lijun Zhang, Tie-Yan Liu, Zhi-Hua Zhou", "author": "Lijun Zhang; Tie-Yan Liu; Zhi-Hua Zhou", "abstract": "We investigate online convex optimization in changing environments, and choose the adaptive regret as the performance measure. The goal is to achieve a small regret over every interval so that the comparator is allowed to change over time. 
Different from previous works that only utilize the convexity condition, this paper further exploits smoothness to improve the adaptive regret. To this end, we develop novel adaptive algorithms for convex and smooth functions, and establish problem-dependent regret bounds over any interval. Our regret bounds are comparable to existing results in the worst case, and become much tighter when the comparator has a small loss.", "bibtex": "@InProceedings{pmlr-v97-zhang19j,\n title = \t {Adaptive Regret of Convex and Smooth Functions},\n author = {Zhang, Lijun and Liu, Tie-Yan and Zhou, Zhi-Hua},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7414--7423},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19j/zhang19j.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19j.html},\n abstract = \t {We investigate online convex optimization in changing environments, and choose the adaptive regret as the performance measure. The goal is to achieve a small regret over every interval so that the comparator is allowed to change over time. Different from previous works that only utilize the convexity condition, this paper further exploits smoothness to improve the adaptive regret. To this end, we develop novel adaptive algorithms for convex and smooth functions, and establish problem-dependent regret bounds over any interval. Our regret bounds are comparable to existing results in the worst case, and become much tighter when the comparator has a small loss.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19j/zhang19j.pdf", "supp": "", "pdf_size": 294400, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12553983656799713193&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China; Microsoft Research Asia, Beijing, China; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China", "aff_domain": "lamda.nju.edu.cn; ; ", "email": "lamda.nju.edu.cn; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zhang19j.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Nanjing University;Microsoft", "aff_unique_dep": "National Key Laboratory for Novel Software Technology;Research", "aff_unique_url": "http://www.nju.edu.cn;https://www.microsoft.com/en-us/research/group/asia", "aff_unique_abbr": "Nanjing U;MSRA", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Nanjing;Beijing", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Adaptive Scale-Invariant Online Algorithms for Learning Linear Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4196", "id": "4196", "author_site": "Michal Kempka, Wojciech Kotlowski, Manfred K. Warmuth", "author": "Michal Kempka; Wojciech Kotlowski; Manfred K. Warmuth", "abstract": "We consider online learning with linear models, where the algorithm predicts on sequentially revealed instances (feature vectors), and is compared against the best linear function (comparator) in hindsight. 
Popular algorithms in this framework, such as Online Gradient Descent (OGD), have parameters (learning rates), which ideally should be tuned based on the scales of the features and the optimal comparator, but these quantities only become available at the end of the learning process. In this paper, we resolve the tuning problem by proposing online algorithms making predictions which are invariant under arbitrary rescaling of the features. The algorithms have no parameters to tune, do not require any prior knowledge on the scale of the instances or the comparator, and achieve regret bounds matching (up to a logarithmic factor) that of OGD with optimally tuned separate learning rates per dimension, while retaining comparable runtime performance.", "bibtex": "@InProceedings{pmlr-v97-kempka19a,\n title = \t {Adaptive Scale-Invariant Online Algorithms for Learning Linear Models},\n author = {Kempka, Michal and Kotlowski, Wojciech and Warmuth, Manfred K.},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3321--3330},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kempka19a/kempka19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kempka19a.html},\n abstract = \t {We consider online learning with linear models, where the algorithm predicts on sequentially revealed instances (feature vectors), and is compared against the best linear function (comparator) in hindsight. Popular algorithms in this framework, such as Online Gradient Descent (OGD), have parameters (learning rates), which ideally should be tuned based on the scales of the features and the optimal comparator, but these quantities only become available at the end of the learning process. In this paper, we resolve the tuning problem by proposing online algorithms making predictions which are invariant under arbitrary rescaling of the features. The algorithms have no parameters to tune, do not require any prior knowledge on the scale of the instances or the comparator, and achieve regret bounds matching (up to a logarithmic factor) that of OGD with optimally tuned separate learning rates per dimension, while retaining comparable runtime performance.}\n}", "pdf": "http://proceedings.mlr.press/v97/kempka19a/kempka19a.pdf", "supp": "", "pdf_size": 3668945, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3468120882331657780&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Poznan University of Technology, Poznan, Poland; Poznan University of Technology, Poznan, Poland; Google Inc. 
Z\u00fcrich & UC Santa Cruz", "aff_domain": "cs.put.poznan.pl;cs.put.poznan.pl;ucsc.edu", "email": "cs.put.poznan.pl;cs.put.poznan.pl;ucsc.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/kempka19a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Poznan University of Technology;Google", "aff_unique_dep": ";Google Inc.", "aff_unique_url": "https://www.put.poznan.pl/;https://www.google.ch", "aff_unique_abbr": "PUT;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Poznan;Z\u00fcrich", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Poland;Switzerland" }, { "title": "Adaptive Sensor Placement for Continuous Spaces", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4082", "id": "4082", "author_site": "James A. Grant, Alexis Boukouvalas, Ryan-Rhys Griffiths, David Leslie, Sattar Vakili, Enrique Munoz De Cote", "author": "James Grant; Alexis Boukouvalas; Ryan-Rhys Griffiths; David Leslie; Sattar Vakili; Enrique Munoz De Cote", "abstract": "We consider the problem of adaptively placing sensors along an interval to detect stochastically-generated events. We present a new formulation of the problem as a continuum-armed bandit problem with feedback in the form of partial observations of realisations of an inhomogeneous Poisson process. We design a solution method by combining Thompson sampling with nonparametric inference via increasingly granular Bayesian histograms and derive an $\\tilde{O}(T^{2/3})$ bound on the Bayesian regret in $T$ rounds. This is coupled with the design of an efficient optimisation approach to select actions in polynomial time. In simulations we demonstrate our approach to have substantially lower and less variable regret than competitor algorithms.", "bibtex": "@InProceedings{pmlr-v97-grant19a,\n title = \t {Adaptive Sensor Placement for Continuous Spaces},\n author = {Grant, James and Boukouvalas, Alexis and Griffiths, Ryan-Rhys and Leslie, David and Vakili, Sattar and De Cote, Enrique Munoz},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2385--2393},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/grant19a/grant19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/grant19a.html},\n abstract = \t {We consider the problem of adaptively placing sensors along an interval to detect stochastically-generated events. We present a new formulation of the problem as a continuum-armed bandit problem with feedback in the form of partial observations of realisations of an inhomogeneous Poisson process. We design a solution method by combining Thompson sampling with nonparametric inference via increasingly granular Bayesian histograms and derive an $\\tilde{O}(T^{2/3})$ bound on the Bayesian regret in $T$ rounds. This is coupled with the design of an efficient optimisation approach to select actions in polynomial time. 
In simulations we demonstrate our approach to have substantially lower and less variable regret than competitor algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/grant19a/grant19a.pdf", "supp": "", "pdf_size": 2529473, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10002606302235210115&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/grant19a.html" }, { "title": "Adaptive Stochastic Natural Gradient Method for One-Shot Neural Architecture Search", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3959", "id": "3959", "author_site": "Youhei Akimoto, Shinichi Shirakawa, Nozomu Yoshinari, Kento Uchida, Shota Saito, Kouhei Nishida", "author": "Youhei Akimoto; Shinichi Shirakawa; Nozomu Yoshinari; Kento Uchida; Shota Saito; Kouhei Nishida", "abstract": "High sensitivity of neural architecture search (NAS) methods against their input such as step-size (i.e., learning rate) and search space prevents practitioners from applying them out-of-the-box to their own problems, albeit its purpose is to automate a part of tuning process. Aiming at a fast, robust, and widely-applicable NAS, we develop a generic optimization framework for NAS. We turn a coupled optimization of connection weights and neural architecture into a differentiable optimization by means of stochastic relaxation. It accepts arbitrary search space (widely-applicable) and enables to employ a gradient-based simultaneous optimization of weights and architecture (fast). We propose a stochastic natural gradient method with an adaptive step-size mechanism built upon our theoretical investigation (robust). Despite its simplicity and no problem-dependent parameter tuning, our method exhibited near state-of-the-art performances with low computational budgets both on image classification and inpainting tasks.", "bibtex": "@InProceedings{pmlr-v97-akimoto19a,\n title = \t {Adaptive Stochastic Natural Gradient Method for One-Shot Neural Architecture Search},\n author = {Akimoto, Youhei and Shirakawa, Shinichi and Yoshinari, Nozomu and Uchida, Kento and Saito, Shota and Nishida, Kouhei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {171--180},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/akimoto19a/akimoto19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/akimoto19a.html},\n abstract = \t {High sensitivity of neural architecture search (NAS) methods against their input such as step-size (i.e., learning rate) and search space prevents practitioners from applying them out-of-the-box to their own problems, albeit its purpose is to automate a part of tuning process. Aiming at a fast, robust, and widely-applicable NAS, we develop a generic optimization framework for NAS. We turn a coupled optimization of connection weights and neural architecture into a differentiable optimization by means of stochastic relaxation. It accepts arbitrary search space (widely-applicable) and enables to employ a gradient-based simultaneous optimization of weights and architecture (fast). 
We propose a stochastic natural gradient method with an adaptive step-size mechanism built upon our theoretical investigation (robust). Despite its simplicity and no problem-dependent parameter tuning, our method exhibited near state-of-the-art performances with low computational budgets both on image classification and inpainting tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/akimoto19a/akimoto19a.pdf", "supp": "", "pdf_size": 589638, "gs_citation": 115, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8278729461791344602&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Tsukuba & RIKEN AIP; Yokohama National University; Yokohama National University; Yokohama National University; SkillUp AI Co., Ltd.; Shinshu University", "aff_domain": "cs.tsukuba.ac.jp;ynu.ac.jp; ; ; ; ", "email": "cs.tsukuba.ac.jp;ynu.ac.jp; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/akimoto19a.html", "aff_unique_index": "0;1;1;1;2;3", "aff_unique_norm": "University of Tsukuba;Yokohama National University;SkillUp AI;Shinshu University", "aff_unique_dep": ";;Co., Ltd.;", "aff_unique_url": "https://www.tsukuba.ac.jp;https://www.yokohama-nu.ac.jp;;https://www.shinshu-u.ac.jp", "aff_unique_abbr": "UT;YNU;;Shinshu U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "Japan;Unknown" }, { "title": "Adaptive and Safe Bayesian Optimization in High Dimensions via One-Dimensional Subspaces", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4048", "id": "4048", "author_site": "Johannes Kirschner, Mojmir Mutny, Nicole Hiller, Rasmus Ischebeck, Andreas Krause", "author": "Johannes Kirschner; Mojmir Mutny; Nicole Hiller; Rasmus Ischebeck; Andreas Krause", "abstract": "Bayesian optimization is known to be difficult to scale to high dimensions, because the acquisition step requires solving a non-convex optimization problem in the same search space. In order to scale the method and keep its benefits, we propose an algorithm (LineBO) that restricts the problem to a sequence of iteratively chosen one-dimensional sub-problems that can be solved efficiently. We show that our algorithm converges globally and obtains a fast local rate when the function is strongly convex. Further, if the objective has an invariant subspace, our method automatically adapts to the effective dimension without changing the algorithm. When combined with the SafeOpt algorithm to solve the sub-problems, we obtain the first safe Bayesian optimization algorithm with theoretical guarantees applicable in high-dimensional settings. We evaluate our method on multiple synthetic benchmarks, where we obtain competitive performance. 
Further, we deploy our algorithm to optimize the beam intensity of the Swiss Free Electron Laser with up to 40 parameters while satisfying safe operation constraints.", "bibtex": "@InProceedings{pmlr-v97-kirschner19a,\n title = \t {Adaptive and Safe {B}ayesian Optimization in High Dimensions via One-Dimensional Subspaces},\n author = {Kirschner, Johannes and Mutny, Mojmir and Hiller, Nicole and Ischebeck, Rasmus and Krause, Andreas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3429--3438},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kirschner19a/kirschner19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kirschner19a.html},\n abstract = \t {Bayesian optimization is known to be difficult to scale to high dimensions, because the acquisition step requires solving a non-convex optimization problem in the same search space. In order to scale the method and keep its benefits, we propose an algorithm (LineBO) that restricts the problem to a sequence of iteratively chosen one-dimensional sub-problems that can be solved efficiently. We show that our algorithm converges globally and obtains a fast local rate when the function is strongly convex. Further, if the objective has an invariant subspace, our method automatically adapts to the effective dimension without changing the algorithm. When combined with the SafeOpt algorithm to solve the sub-problems, we obtain the first safe Bayesian optimization algorithm with theoretical guarantees applicable in high-dimensional settings. We evaluate our method on multiple synthetic benchmarks, where we obtain competitive performance. 
Further, we deploy our algorithm to optimize the beam intensity of the Swiss Free Electron Laser with up to 40 parameters while satisfying safe operation constraints.}\n}", "pdf": "http://proceedings.mlr.press/v97/kirschner19a/kirschner19a.pdf", "supp": "", "pdf_size": 1077065, "gs_citation": 194, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10899983332260103973&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Computer Science, ETH Zurich, Switzerland; Department of Computer Science, ETH Zurich, Switzerland; Paul Scherrer Institut, Switzerland; Paul Scherrer Institut, Switzerland; Department of Computer Science, ETH Zurich, Switzerland", "aff_domain": "inf.ethz.ch; ; ; ; ", "email": "inf.ethz.ch; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/kirschner19a.html", "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "ETH Zurich;Paul Scherrer Institut", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.ethz.ch;https://www.psi.ch", "aff_unique_abbr": "ETHZ;PSI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Addressing the Loss-Metric Mismatch with Adaptive Loss Alignment", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3769", "id": "3769", "author_site": "Chen Huang, Shuangfei Zhai, Walter Talbott, Miguel Angel Bautista Martin, Shih-Yu Sun, Carlos Guestrin, Joshua M Susskind", "author": "Chen Huang; Shuangfei Zhai; Walter Talbott; Miguel Bautista Martin; Shih-Yu Sun; Carlos Guestrin; Josh Susskind", "abstract": "In most machine learning training paradigms a fixed, often handcrafted, loss function is assumed to be a good proxy for an underlying evaluation metric. In this work we assess this assumption by meta-learning an adaptive loss function to directly optimize the evaluation metric. We propose a sample efficient reinforcement learning approach for adapting the loss dynamically during training. We empirically show how this formulation improves performance by simultaneously optimizing the evaluation metric and smoothing the loss landscape. We verify our method in metric learning and classification scenarios, showing considerable improvements over the state-of-the-art on a diverse set of tasks. Importantly, our method is applicable to a wide range of loss functions and evaluation metrics. Furthermore, the learned policies are transferable across tasks and data, demonstrating the versatility of the method.", "bibtex": "@InProceedings{pmlr-v97-huang19f,\n title = \t {Addressing the Loss-Metric Mismatch with Adaptive Loss Alignment},\n author = {Huang, Chen and Zhai, Shuangfei and Talbott, Walter and Martin, Miguel Bautista and Sun, Shih-Yu and Guestrin, Carlos and Susskind, Josh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2891--2900},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/huang19f/huang19f.pdf},\n url = \t {https://proceedings.mlr.press/v97/huang19f.html},\n abstract = \t {In most machine learning training paradigms a fixed, often handcrafted, loss function is assumed to be a good proxy for an underlying evaluation metric. 
In this work we assess this assumption by meta-learning an adaptive loss function to directly optimize the evaluation metric. We propose a sample efficient reinforcement learning approach for adapting the loss dynamically during training. We empirically show how this formulation improves performance by simultaneously optimizing the evaluation metric and smoothing the loss landscape. We verify our method in metric learning and classification scenarios, showing considerable improvements over the state-of-the-art on a diverse set of tasks. Importantly, our method is applicable to a wide range of loss functions and evaluation metrics. Furthermore, the learned policies are transferable across tasks and data, demonstrating the versatility of the method.}\n}", "pdf": "http://proceedings.mlr.press/v97/huang19f/huang19f.pdf", "supp": "", "pdf_size": 1066983, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2816924928973399583&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Apple Inc., Cupertino, United States; Apple Inc., Cupertino, United States; Apple Inc., Cupertino, United States; Apple Inc., Cupertino, United States; Apple Inc., Cupertino, United States; Apple Inc., Cupertino, United States; Apple Inc., Cupertino, United States", "aff_domain": "apple.com; ; ; ; ; ; ", "email": "apple.com; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/huang19f.html", "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Apple", "aff_unique_dep": "Apple Inc.", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Cupertino", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Adjustment Criteria for Generalizing Experimental Findings", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4283", "id": "4283", "author_site": "Juan Correa, Jin Tian, Elias Bareinboim", "author": "Juan Correa; Jin Tian; Elias Bareinboim", "abstract": "Generalizing causal effects from a controlled experiment to settings beyond the particular study population is arguably one of the central tasks found in empirical circles. While a proper design and careful execution of the experiment would support, under mild conditions, the validity of inferences about the population in which the experiment was conducted, two challenges make the extrapolation step to different populations somewhat involved, namely, transportability and sampling selection bias. The former is concerned with disparities in the distributions and causal mechanisms between the domain (i.e., settings, population, environment) where the experiment is conducted and where the inferences are intended; the latter with distortions in the sample\u2019s proportions due to preferential selection of units into the study. 
In this paper, we investigate the assumptions and machinery necessary for using", "bibtex": "@InProceedings{pmlr-v97-correa19a,\n title = \t {Adjustment Criteria for Generalizing Experimental Findings},\n author = {Correa, Juan and Tian, Jin and Bareinboim, Elias},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1361--1369},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/correa19a/correa19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/correa19a.html},\n abstract = \t {Generalizing causal effects from a controlled experiment to settings beyond the particular study population is arguably one of the central tasks found in empirical circles. While a proper design and careful execution of the experiment would support, under mild conditions, the validity of inferences about the population in which the experiment was conducted, two challenges make the extrapolation step to different populations somewhat involved, namely, transportability and sampling selection bias. The former is concerned with disparities in the distributions and causal mechanisms between the domain (i.e., settings, population, environment) where the experiment is conducted and where the inferences are intended; the latter with distortions in the sample\u2019s proportions due to preferential selection of units into the study. In this paper, we investigate the assumptions and machinery necessary for using", "pdf": "http://proceedings.mlr.press/v97/correa19a/correa19a.pdf", "supp": "", "pdf_size": 402934, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=925008972912493957&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 15, "aff": "Department of Computer Science, Purdue University, Indiana, USA; Computer Science Department, Iowa State University, IA, USA; Department of Computer Science, Purdue University, Indiana, USA", "aff_domain": "purdue.edu; ;purdue.edu", "email": "purdue.edu; ;purdue.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/correa19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Purdue University;Iowa State University", "aff_unique_dep": "Department of Computer Science;Computer Science Department", "aff_unique_url": "https://www.purdue.edu;https://www.iastate.edu", "aff_unique_abbr": "Purdue;ISU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Indiana;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Adversarial Attacks on Node Embeddings via Graph Poisoning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4072", "id": "4072", "author_site": "Aleksandar Bojchevski, Stephan G\u00fcnnemann", "author": "Aleksandar Bojchevski; Stephan G\u00fcnnemann", "abstract": "The goal of network representation learning is to learn low-dimensional node embeddings that capture the graph structure and are useful for solving downstream tasks. However, despite the proliferation of such methods, there is currently no study of their robustness to adversarial attacks. We provide the first adversarial vulnerability analysis on the widely used family of methods based on random walks. 
We derive efficient adversarial perturbations that poison the network structure and have a negative effect on both the quality of the embeddings and the downstream tasks. We further show that our attacks are transferable since they generalize to many models and are successful even when the attacker is restricted.", "bibtex": "@InProceedings{pmlr-v97-bojchevski19a,\n title = \t {Adversarial Attacks on Node Embeddings via Graph Poisoning},\n author = {Bojchevski, Aleksandar and G{\\\"u}nnemann, Stephan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {695--704},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bojchevski19a/bojchevski19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bojchevski19a.html},\n abstract = \t {The goal of network representation learning is to learn low-dimensional node embeddings that capture the graph structure and are useful for solving downstream tasks. However, despite the proliferation of such methods, there is currently no study of their robustness to adversarial attacks. We provide the first adversarial vulnerability analysis on the widely used family of methods based on random walks. We derive efficient adversarial perturbations that poison the network structure and have a negative effect on both the quality of the embeddings and the downstream tasks. We further show that our attacks are transferable since they generalize to many models and are successful even when the attacker is restricted.}\n}", "pdf": "http://proceedings.mlr.press/v97/bojchevski19a/bojchevski19a.pdf", "supp": "", "pdf_size": 1694593, "gs_citation": 391, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10616006612016117412&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Technical University of Munich; Technical University of Munich", "aff_domain": "in.tum.de; ", "email": "in.tum.de; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/bojchevski19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Technical University of Munich", "aff_unique_dep": "", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Adversarial Examples Are a Natural Consequence of Test Error in Noise", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3917", "id": "3917", "author_site": "Justin Gilmer, Nicolas Ford, Nicholas Carlini, Ekin Dogus Cubuk", "author": "Justin Gilmer; Nicolas Ford; Nicholas Carlini; Ekin Cubuk", "abstract": "Over the last few years, the phenomenon of", "bibtex": "@InProceedings{pmlr-v97-gilmer19a,\n title = \t {Adversarial Examples Are a Natural Consequence of Test Error in Noise},\n author = {Gilmer, Justin and Ford, Nicolas and Carlini, Nicholas and Cubuk, Ekin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2280--2289},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gilmer19a/gilmer19a.pdf},\n url = \t 
{https://proceedings.mlr.press/v97/gilmer19a.html},\n abstract = \t {Over the last few years, the phenomenon of", "pdf": "http://proceedings.mlr.press/v97/gilmer19a/gilmer19a.pdf", "supp": "", "pdf_size": 1075775, "gs_citation": 202, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17742461639569030742&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Google Brain + Google AI Residency; Google Brain; Google Brain; Google Brain", "aff_domain": "google.com;google.com; ; ", "email": "google.com;google.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/gilmer19a.html", "aff_unique_index": "0+0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Brain", "aff_unique_url": "https://brain.google.com", "aff_unique_abbr": "Google Brain", "aff_campus_unique_index": "0+0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Adversarial Generation of Time-Frequency Features with application in audio synthesis", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4016", "id": "4016", "author_site": "Andr\u00e9s Marafioti, Nathana\u00ebl Perraudin, Nicki Holighaus, Piotr Majdak", "author": "Andr\u00e9s Marafioti; Nathana\u00ebl Perraudin; Nicki Holighaus; Piotr Majdak", "abstract": "Time-frequency (TF) representations provide powerful and intuitive features for the analysis of time series such as audio. But still, generative modeling of audio in the TF domain is a subtle matter. Consequently, neural audio synthesis widely relies on directly modeling the waveform and previous attempts at unconditionally synthesizing audio from neurally generated invertible TF features still struggle to produce audio at satisfying quality. In this article, focusing on the short-time Fourier transform, we discuss the challenges that arise in audio synthesis based on generated invertible TF features and how to overcome them. We demonstrate the potential of deliberate generative TF modeling by training a generative adversarial network (GAN) on short-time Fourier features. We show that by applying our guidelines, our TF-based network was able to outperform a state-of-the-art GAN generating waveforms directly, despite the similar architecture in the two networks.", "bibtex": "@InProceedings{pmlr-v97-marafioti19a,\n title = \t {Adversarial Generation of Time-Frequency Features with application in audio synthesis},\n author = {Marafioti, Andr{\\'e}s and Perraudin, Nathana{\\\"e}l and Holighaus, Nicki and Majdak, Piotr},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4352--4362},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/marafioti19a/marafioti19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/marafioti19a.html},\n abstract = \t {Time-frequency (TF) representations provide powerful and intuitive features for the analysis of time series such as audio. But still, generative modeling of audio in the TF domain is a subtle matter. Consequently, neural audio synthesis widely relies on directly modeling the waveform and previous attempts at unconditionally synthesizing audio from neurally generated invertible TF features still struggle to produce audio at satisfying quality. 
In this article, focusing on the short-time Fourier transform, we discuss the challenges that arise in audio synthesis based on generated invertible TF features and how to overcome them. We demonstrate the potential of deliberate generative TF modeling by training a generative adversarial network (GAN) on short-time Fourier features. We show that by applying our guidelines, our TF-based network was able to outperform a state-of-the-art GAN generating waveforms directly, despite the similar architecture in the two networks.}\n}", "pdf": "http://proceedings.mlr.press/v97/marafioti19a/marafioti19a.pdf", "supp": "", "pdf_size": 2480714, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7293234438017145749&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Acoustics Research Institute, Austrian Academy of Sciences; Acoustics Research Institute, Austrian Academy of Sciences; Swiss Data Science Center, ETH Z\u00fcrich; Acoustics Research Institute, Austrian Academy of Sciences", "aff_domain": "kfs.oeaw.ac.at; ;ethz.ch; ", "email": "kfs.oeaw.ac.at; ;ethz.ch; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/marafioti19a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Austrian Academy of Sciences;ETH Zurich", "aff_unique_dep": "Acoustics Research Institute;Swiss Data Science Center", "aff_unique_url": "https://www.oeaw.ac.at;https://www.ethz.ch", "aff_unique_abbr": ";ETHZ", "aff_campus_unique_index": "1", "aff_campus_unique": ";Z\u00fcrich", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Austria;Switzerland" }, { "title": "Adversarial Online Learning with noise", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3633", "id": "3633", "author_site": "Alon Resler, Yishay Mansour", "author": "Alon Resler; Yishay Mansour", "abstract": "We present and study models of adversarial online learning where the feedback observed by the learner is noisy, and the feedback is either full information feedback or bandit feedback. Specifically, we consider binary losses xored with the noise, which is a Bernoulli random variable. We consider both a constant noise rate and a variable noise rate. Our main results are tight regret bounds for learning with noise in the adversarial online learning model.", "bibtex": "@InProceedings{pmlr-v97-resler19a,\n title = \t {Adversarial Online Learning with noise},\n author = {Resler, Alon and Mansour, Yishay},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5429--5437},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/resler19a/resler19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/resler19a.html},\n abstract = \t {We present and study models of adversarial online learning where the feedback observed by the learner is noisy, and the feedback is either full information feedback or bandit feedback. Specifically, we consider binary losses xored with the noise, which is a Bernoulli random variable. We consider both a constant noise rate and a variable noise rate. 
Our main results are tight regret bounds for learning with noise in the adversarial online learning model.}\n}", "pdf": "http://proceedings.mlr.press/v97/resler19a/resler19a.pdf", "supp": "", "pdf_size": 273168, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6547880943751795299&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Blavatnik School of Computer Science, Tel Aviv University, Tel Aviv, Israel+Google Research, Israel; Blavatnik School of Computer Science, Tel Aviv University, Tel Aviv, Israel+Google Research, Israel", "aff_domain": "gmail.com;gmail.com", "email": "gmail.com;gmail.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/resler19a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Tel Aviv University;Google", "aff_unique_dep": "Blavatnik School of Computer Science;Google Research", "aff_unique_url": "https://www.tau.ac.il;https://research.google", "aff_unique_abbr": "TAU;Google", "aff_campus_unique_index": "0+1;0+1", "aff_campus_unique": "Tel Aviv;Israel", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "Israel" }, { "title": "Adversarial camera stickers: A physical camera-based attack on deep learning systems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4262", "id": "4262", "author_site": "Juncheng Li, Frank R Schmidt, Zico Kolter", "author": "Juncheng Li; Frank Schmidt; Zico Kolter", "abstract": "Recent work has documented the susceptibility of deep learning systems to adversarial examples, but most such attacks directly manipulate the digital input to a classifier. Although a smaller line of work considers physical adversarial attacks, in all cases these involve manipulating the object of interest, e.g., putting a physical sticker on an object to misclassify it, or manufacturing an object specifically intended to be misclassified. In this work, we consider an alternative question: is it possible to fool deep classifiers, over all perceived objects of a certain type, by physically manipulating the camera itself? We show that by placing a carefully crafted and mainly-translucent sticker over the lens of a camera, one can create universal perturbations of the observed images that are inconspicuous, yet misclassify target objects as a different (targeted) class. To accomplish this, we propose an iterative procedure for both updating the attack perturbation (to make it adversarial for a given classifier), and the threat model itself (to ensure it is physically realizable). For example, we show that we can achieve physically-realizable attacks that fool ImageNet classifiers in a targeted fashion 49.6% of the time. This presents a new class of physically-realizable threat models to consider in the context of adversarially robust machine learning. 
Our demo video can be viewed at: https://youtu.be/wUVmL33Fx54", "bibtex": "@InProceedings{pmlr-v97-li19j,\n title = \t {Adversarial camera stickers: A physical camera-based attack on deep learning systems},\n author = {Li, Juncheng and Schmidt, Frank and Kolter, Zico},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3896--3904},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19j/li19j.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19j.html},\n abstract = \t {Recent work has documented the susceptibility of deep learning systems to adversarial examples, but most such attacks directly manipulate the digital input to a classifier. Although a smaller line of work considers physical adversarial attacks, in all cases these involve manipulating the object of interest, e.g., putting a physical sticker on an object to misclassify it, or manufacturing an object specifically intended to be misclassified. In this work, we consider an alternative question: is it possible to fool deep classifiers, over all perceived objects of a certain type, by physically manipulating the camera itself? We show that by placing a carefully crafted and mainly-translucent sticker over the lens of a camera, one can create universal perturbations of the observed images that are inconspicuous, yet misclassify target objects as a different (targeted) class. To accomplish this, we propose an iterative procedure for both updating the attack perturbation (to make it adversarial for a given classifier), and the threat model itself (to ensure it is physically realizable). For example, we show that we can achieve physically-realizable attacks that fool ImageNet classifiers in a targeted fashion 49.6% of the time. This presents a new class of physically-realizable threat models to consider in the context of adversarially robust machine learning. 
Our demo video can be viewed at: https://youtu.be/wUVmL33Fx54}\n}", "pdf": "http://proceedings.mlr.press/v97/li19j/li19j.pdf", "supp": "", "pdf_size": 6349737, "gs_citation": 209, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8454184380086098103&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Bosch Center for Arti\ufb01cial Intelligence + School of Computer Science, Carnegie Mellon University; Bosch Center for Arti\ufb01cial Intelligence + School of Computer Science, Carnegie Mellon University; Bosch Center for Arti\ufb01cial Intelligence + School of Computer Science, Carnegie Mellon University", "aff_domain": "us.bosch.com;de.bosch.com;us.bosch.com", "email": "us.bosch.com;de.bosch.com;us.bosch.com", "github": "", "project": "https://youtu.be/wUVmL33Fx54", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/li19j.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "Bosch Center for Arti\ufb01cial Intelligence;Carnegie Mellon University", "aff_unique_dep": "Artificial Intelligence;School of Computer Science", "aff_unique_url": "https://www.bosch-ai.com;https://www.cmu.edu", "aff_unique_abbr": "BCAI;CMU", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0+1;0+1;0+1", "aff_country_unique": "Germany;United States" }, { "title": "Adversarial examples from computational constraints", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3956", "id": "3956", "author_site": "Sebastien Bubeck, Yin Tat Lee, Eric Price, Ilya Razenshteyn", "author": "Sebastien Bubeck; Yin Tat Lee; Eric Price; Ilya Razenshteyn", "abstract": "Why are classifiers in high dimension vulnerable to \u201cadversarial\u201d perturbations? We show that it is likely not due to information theoretic limitations, but rather it could be due to computational constraints. First we prove that, for a broad set of classification tasks, the mere existence of a robust classifier implies that it can be found by a possibly exponential-time algorithm with relatively few training examples. Then we give two particular classification tasks where learning a robust classifier is computationally intractable. More precisely we construct two binary classifications task in high dimensional space which are (i) information theoretically easy to learn robustly for large perturbations, (ii) efficiently learnable (non-robustly) by a simple linear separator, (iii) yet are not efficiently robustly learnable, even for small perturbations. Specifically, for the first task hardness holds for any efficient algorithm in the statistical query (SQ) model, while for the second task we rule out any efficient algorithm under a cryptographic assumption. These examples give an exponential separation between classical learning and robust learning in the statistical query model or under a cryptographic assumption. 
It suggests that adversarial examples may be an unavoidable byproduct of computational limitations of learning algorithms.", "bibtex": "@InProceedings{pmlr-v97-bubeck19a,\n title = \t {Adversarial examples from computational constraints},\n author = {Bubeck, Sebastien and Lee, Yin Tat and Price, Eric and Razenshteyn, Ilya},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {831--840},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bubeck19a/bubeck19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bubeck19a.html},\n abstract = \t {Why are classifiers in high dimension vulnerable to \u201cadversarial\u201d perturbations? We show that it is likely not due to information theoretic limitations, but rather it could be due to computational constraints. First we prove that, for a broad set of classification tasks, the mere existence of a robust classifier implies that it can be found by a possibly exponential-time algorithm with relatively few training examples. Then we give two particular classification tasks where learning a robust classifier is computationally intractable. More precisely we construct two binary classifications task in high dimensional space which are (i) information theoretically easy to learn robustly for large perturbations, (ii) efficiently learnable (non-robustly) by a simple linear separator, (iii) yet are not efficiently robustly learnable, even for small perturbations. Specifically, for the first task hardness holds for any efficient algorithm in the statistical query (SQ) model, while for the second task we rule out any efficient algorithm under a cryptographic assumption. These examples give an exponential separation between classical learning and robust learning in the statistical query model or under a cryptographic assumption. 
It suggests that adversarial examples may be an unavoidable byproduct of computational limitations of learning algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/bubeck19a/bubeck19a.pdf", "supp": "", "pdf_size": 230893, "gs_citation": 262, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7452233576224660837&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Microsoft Research, Redmond, Washington, USA+University of Washington, Seattle, Washington, USA; Microsoft Research, Redmond, Washington, USA+University of Washington, Seattle, Washington, USA; University of Texas, Austin, Texas, USA; Microsoft Research, Redmond, Washington, USA", "aff_domain": "microsoft.com; ; ;microsoft.com", "email": "microsoft.com; ; ;microsoft.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/bubeck19a.html", "aff_unique_index": "0+1;0+1;2;0", "aff_unique_norm": "Microsoft;University of Washington;University of Texas at Austin", "aff_unique_dep": "Microsoft Research;;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.washington.edu;https://www.utexas.edu", "aff_unique_abbr": "MSR;UW;UT Austin", "aff_campus_unique_index": "0+1;0+1;2;0", "aff_campus_unique": "Redmond;Seattle;Austin", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Adversarially Learned Representations for Information Obfuscation and Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4236", "id": "4236", "author_site": "Martin A Bertran, Natalia Martinez Gil, Afroditi Papadaki, Qiang Qiu, Miguel Rodrigues, Galen Reeves, Guillermo Sapiro", "author": "Martin Bertran; Natalia Martinez; Afroditi Papadaki; Qiang Qiu; Miguel Rodrigues; Galen Reeves; Guillermo Sapiro", "abstract": "Data collection and sharing are pervasive aspects of modern society. This process can either be voluntary, as in the case of a person taking a facial image to unlock his/her phone, or incidental, such as traffic cameras collecting videos on pedestrians. An undesirable side effect of these processes is that shared data can carry information about attributes that users might consider as sensitive, even when such information is of limited use for the task. It is therefore desirable for both data collectors and users to design procedures that minimize sensitive information leakage. Balancing the competing objectives of providing meaningful individualized service levels and inference while obfuscating sensitive information is still an open problem. In this work, we take an information theoretic approach that is implemented as an unconstrained adversarial game between Deep Neural Networks in a principled, data-driven manner. 
This approach enables us to learn domain-preserving stochastic transformations that maintain performance on existing algorithms while minimizing sensitive information leakage.", "bibtex": "@InProceedings{pmlr-v97-bertran19a,\n title = \t {Adversarially Learned Representations for Information Obfuscation and Inference},\n author = {Bertran, Martin and Martinez, Natalia and Papadaki, Afroditi and Qiu, Qiang and Rodrigues, Miguel and Reeves, Galen and Sapiro, Guillermo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {614--623},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bertran19a/bertran19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bertran19a.html},\n abstract = \t {Data collection and sharing are pervasive aspects of modern society. This process can either be voluntary, as in the case of a person taking a facial image to unlock his/her phone, or incidental, such as traffic cameras collecting videos on pedestrians. An undesirable side effect of these processes is that shared data can carry information about attributes that users might consider as sensitive, even when such information is of limited use for the task. It is therefore desirable for both data collectors and users to design procedures that minimize sensitive information leakage. Balancing the competing objectives of providing meaningful individualized service levels and inference while obfuscating sensitive information is still an open problem. In this work, we take an information theoretic approach that is implemented as an unconstrained adversarial game between Deep Neural Networks in a principled, data-driven manner. This approach enables us to learn domain-preserving stochastic transformations that maintain performance on existing algorithms while minimizing sensitive information leakage.}\n}", "pdf": "http://proceedings.mlr.press/v97/bertran19a/bertran19a.pdf", "supp": "", "pdf_size": 3464535, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6674388784568403343&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Duke University; Duke University; University College London; Duke University; University College London; Duke University; Duke University", "aff_domain": "duke.edu; ; ; ; ; ; ", "email": "duke.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/bertran19a.html", "aff_unique_index": "0;0;1;0;1;0;0", "aff_unique_norm": "Duke University;University College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.duke.edu;https://www.ucl.ac.uk", "aff_unique_abbr": "Duke;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Agnostic Federated Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4100", "id": "4100", "author_site": "Mehryar Mohri, Gary Sivek, Ananda Suresh", "author": "Mehryar Mohri; Gary Sivek; Ananda Theertha Suresh", "abstract": "A key learning scenario in large-scale applications is that of federated learning, where a centralized model is trained based on data originating from a large number of clients. 
We argue that, with the existing training and inference, federated models can be biased towards different clients. Instead, we propose a new framework of agnostic federated learning, where the centralized model is optimized for any target distribution formed by a mixture of the client distributions. We further show that this framework naturally yields a notion of fairness. We present data-dependent Rademacher complexity guarantees for learning with this objective, which guide the definition of an algorithm for agnostic federated learning. We also give a fast stochastic optimization algorithm for solving the corresponding optimization problem, for which we prove convergence bounds, assuming a convex loss function and a convex hypothesis set. We further empirically demonstrate the benefits of our approach in several datasets. Beyond federated learning, our framework and algorithm can be of interest to other learning scenarios such as cloud computing, domain adaptation, drifting, and other contexts where the training and test distributions do not coincide.", "bibtex": "@InProceedings{pmlr-v97-mohri19a,\n title = \t {Agnostic Federated Learning},\n author = {Mohri, Mehryar and Sivek, Gary and Suresh, Ananda Theertha},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4615--4625},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mohri19a/mohri19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mohri19a.html},\n abstract = \t {A key learning scenario in large-scale applications is that of federated learning, where a centralized model is trained based on data originating from a large number of clients. We argue that, with the existing training and inference, federated models can be biased towards different clients. Instead, we propose a new framework of agnostic federated learning, where the centralized model is optimized for any target distribution formed by a mixture of the client distributions. We further show that this framework naturally yields a notion of fairness. We present data-dependent Rademacher complexity guarantees for learning with this objective, which guide the definition of an algorithm for agnostic federated learning. We also give a fast stochastic optimization algorithm for solving the corresponding optimization problem, for which we prove convergence bounds, assuming a convex loss function and a convex hypothesis set. We further empirically demonstrate the benefits of our approach in several datasets. 
Beyond federated learning, our framework and algorithm can be of interest to other learning scenarios such as cloud computing, domain adaptation, drifting, and other contexts where the training and test distributions do not coincide.}\n}", "pdf": "http://proceedings.mlr.press/v97/mohri19a/mohri19a.pdf", "supp": "", "pdf_size": 531669, "gs_citation": 1180, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1576121636504228393&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Google Research, New York + Courant Institute of Mathematical Sciences, New York, NY; Google Research, New York; Google Research, New York", "aff_domain": "google.com;google.com;google.com", "email": "google.com;google.com;google.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mohri19a.html", "aff_unique_index": "0+1;0;0", "aff_unique_norm": "Google;Courant Institute of Mathematical Sciences", "aff_unique_dep": "Google Research;Mathematical Sciences", "aff_unique_url": "https://research.google;https://courant.nyu.edu", "aff_unique_abbr": "Google;Courant", "aff_campus_unique_index": "0+0;0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "Almost Unsupervised Text to Speech and Automatic Speech Recognition", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3849", "id": "3849", "author_site": "Yi Ren, Xu Tan, Tao Qin, Sheng Zhao, Zhou Zhao, Tie-Yan Liu", "author": "Yi Ren; Xu Tan; Tao Qin; Sheng Zhao; Zhou Zhao; Tie-Yan Liu", "abstract": "Text to speech (TTS) and automatic speech recognition (ASR) are two dual tasks in speech processing and both achieve impressive performance thanks to the recent advance in deep learning and large amount of aligned speech and text data. However, the lack of aligned data poses a major practical problem for TTS and ASR on low-resource languages. In this paper, by leveraging the dual nature of the two tasks, we propose an almost unsupervised learning method that only leverages few hundreds of paired data and extra unpaired data for TTS and ASR. Our method consists of the following components: (1) denoising auto-encoder, which reconstructs speech and text sequences respectively to develop the capability of language modeling both in speech and text domain; (2) dual transformation, where the TTS model transforms the text $y$ into speech $\\hat{x}$, and the ASR model leverages the transformed pair $(\\hat{x},y)$ for training, and vice versa, to boost the accuracy of the two tasks; (3) bidirectional sequence modeling, which address the error propagation problem especially in the long speech and text sequence when training with few paired data; (4) a unified model structure, which combines all the above components for TTS and ASR based on Transformer model. 
Our method achieves 99.84% in terms of word level intelligible rate and 2.68 MOS for TTS, and 11.7% PER for ASR on LJSpeech dataset, by leveraging only 200 paired speech and text data (about 20 minutes audio), together with extra unpaired speech and text data.", "bibtex": "@InProceedings{pmlr-v97-ren19a,\n title = \t {Almost Unsupervised Text to Speech and Automatic Speech Recognition},\n author = {Ren, Yi and Tan, Xu and Qin, Tao and Zhao, Sheng and Zhao, Zhou and Liu, Tie-Yan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5410--5419},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ren19a/ren19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ren19a.html},\n abstract = \t {Text to speech (TTS) and automatic speech recognition (ASR) are two dual tasks in speech processing and both achieve impressive performance thanks to the recent advance in deep learning and large amount of aligned speech and text data. However, the lack of aligned data poses a major practical problem for TTS and ASR on low-resource languages. In this paper, by leveraging the dual nature of the two tasks, we propose an almost unsupervised learning method that only leverages few hundreds of paired data and extra unpaired data for TTS and ASR. Our method consists of the following components: (1) denoising auto-encoder, which reconstructs speech and text sequences respectively to develop the capability of language modeling both in speech and text domain; (2) dual transformation, where the TTS model transforms the text $y$ into speech $\\hat{x}$, and the ASR model leverages the transformed pair $(\\hat{x},y)$ for training, and vice versa, to boost the accuracy of the two tasks; (3) bidirectional sequence modeling, which address the error propagation problem especially in the long speech and text sequence when training with few paired data; (4) a unified model structure, which combines all the above components for TTS and ASR based on Transformer model. 
Our method achieves 99.84% in terms of word level intelligible rate and 2.68 MOS for TTS, and 11.7% PER for ASR on LJSpeech dataset, by leveraging only 200 paired speech and text data (about 20 minutes audio), together with extra unpaired speech and text data.}\n}", "pdf": "http://proceedings.mlr.press/v97/ren19a/ren19a.pdf", "supp": "", "pdf_size": 948522, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16710802202249302730&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Zhejiang University; Microsoft Research; Microsoft Research; Microsoft STC Asia; Zhejiang University; Microsoft Research", "aff_domain": "zju.edu.cn;microsoft.com;microsoft.com;microsoft.com;zju.edu.cn;microsoft.com", "email": "zju.edu.cn;microsoft.com;microsoft.com;microsoft.com;zju.edu.cn;microsoft.com", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/ren19a.html", "aff_unique_index": "0;1;1;1;0;1", "aff_unique_norm": "Zhejiang University;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.zju.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "ZJU;MSR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Asia", "aff_country_unique_index": "0;1;1;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Almost surely constrained convex optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4012", "id": "4012", "author_site": "Olivier Fercoq, Ahmet Alacaoglu, Ion Necoara, Volkan Cevher", "author": "Olivier Fercoq; Ahmet Alacaoglu; Ion Necoara; Volkan Cevher", "abstract": "We propose a stochastic gradient framework for solving stochastic composite convex optimization problems with (possibly) infinite number of linear inclusion constraints that need to be satisfied almost surely. We use smoothing and homotopy techniques to handle constraints without the need for matrix-valued projections. We show for our stochastic gradient algorithm $\\mathcal{O}(\\log(k)/\\sqrt{k})$ convergence rate for general convex objectives and $\\mathcal{O}(\\log(k)/k)$ convergence rate for restricted strongly convex objectives. These rates are known to be optimal up to logarithmic factor, even without constraints. We conduct numerical experiments on basis pursuit, hard margin support vector machines and portfolio optimization problems and show that our algorithm achieves state-of-the-art practical performance.", "bibtex": "@InProceedings{pmlr-v97-fercoq19a,\n title = \t {Almost surely constrained convex optimization},\n author = {Fercoq, Olivier and Alacaoglu, Ahmet and Necoara, Ion and Cevher, Volkan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1910--1919},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/fercoq19a/fercoq19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/fercoq19a.html},\n abstract = \t {We propose a stochastic gradient framework for solving stochastic composite convex optimization problems with (possibly) infinite number of linear inclusion constraints that need to be satisfied almost surely. We use smoothing and homotopy techniques to handle constraints without the need for matrix-valued projections. 
We show for our stochastic gradient algorithm $\\mathcal{O}(\\log(k)/\\sqrt{k})$ convergence rate for general convex objectives and $\\mathcal{O}(\\log(k)/k)$ convergence rate for restricted strongly convex objectives. These rates are known to be optimal up to logarithmic factor, even without constraints. We conduct numerical experiments on basis pursuit, hard margin support vector machines and portfolio optimization problems and show that our algorithm achieves state-of-the-art practical performance.}\n}", "pdf": "http://proceedings.mlr.press/v97/fercoq19a/fercoq19a.pdf", "supp": "", "pdf_size": 409539, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17896433868274772913&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "LTCI, T\u00e9l\u00e9com ParisTech, Universit\u00e9 Paris-Saclay; Laboratory for Information and Inference Systems, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne; Department of Automatic Control and Systems Engineering, University Politehnica Bucharest; Laboratory for Information and Inference Systems, \u00c9cole Polytechnique F\u00e9d\u00e9rale de Lausanne", "aff_domain": "telecom-paristech.fr; ; ; ", "email": "telecom-paristech.fr; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/fercoq19a.html", "aff_unique_index": "0;1;2;1", "aff_unique_norm": "T\u00e9l\u00e9com ParisTech;EPFL;University Politehnica Bucharest", "aff_unique_dep": "LTCI;Laboratory for Information and Inference Systems;Department of Automatic Control and Systems Engineering", "aff_unique_url": "https://www.telecom-paris.fr;https://www.epfl.ch;https://www.upb.ro", "aff_unique_abbr": "T\u00e9l\u00e9com ParisTech;EPFL;", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;1;2;1", "aff_country_unique": "France;Switzerland;Romania" }, { "title": "Alternating Minimizations Converge to Second-Order Optimal Solutions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3745", "id": "3745", "author_site": "Qiuwei Li, Zhihui Zhu, Gongguo Tang", "author": "Qiuwei Li; Zhihui Zhu; Gongguo Tang", "abstract": "This work studies the second-order convergence for both standard alternating minimization and proximal alternating minimization. We show that under mild assumptions on the (nonconvex) objective function, both algorithms avoid strict saddles almost surely from random initialization. Together with known first-order convergence results, this implies both algorithms converge to a second-order stationary point. 
This solves an open problem for the second-order convergence of alternating minimization algorithms that have been widely used in practice to solve large-scale nonconvex problems due to their simple implementation, fast convergence, and superb empirical performance.", "bibtex": "@InProceedings{pmlr-v97-li19n,\n title = \t {Alternating Minimizations Converge to Second-Order Optimal Solutions},\n author = {Li, Qiuwei and Zhu, Zhihui and Tang, Gongguo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3935--3943},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19n/li19n.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19n.html},\n abstract = \t {This work studies the second-order convergence for both standard alternating minimization and proximal alternating minimization. We show that under mild assumptions on the (nonconvex) objective function, both algorithms avoid strict saddles almost surely from random initialization. Together with known first-order convergence results, this implies both algorithms converge to a second-order stationary point. This solves an open problem for the second-order convergence of alternating minimization algorithms that have been widely used in practice to solve large-scale nonconvex problems due to their simple implementation, fast convergence, and superb empirical performance.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19n/li19n.pdf", "supp": "", "pdf_size": 353320, "gs_citation": 74, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4946864975269758352&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Electrical Engineering, Colorado School of Mines; Mathematical Institute for Data Science, Johns Hopkins University; Department of Electrical Engineering, Colorado School of Mines", "aff_domain": "mines.edu;jhu.edu;mines.edu", "email": "mines.edu;jhu.edu;mines.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/li19n.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Colorado School of Mines;Johns Hopkins University", "aff_unique_dep": "Department of Electrical Engineering;Mathematical Institute for Data Science", "aff_unique_url": "https://www.mines.edu;https://www.jhu.edu", "aff_unique_abbr": "CSM;JHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Amortized Monte Carlo Integration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3869", "id": "3869", "author_site": "Adam Golinski, Frank Wood, Tom Rainforth", "author": "Adam Golinski; Frank Wood; Tom Rainforth", "abstract": "Current approaches to amortizing Bayesian inference focus solely on approximating the posterior distribution. Typically, this approximation is, in turn, used to calculate expectations for one or more target functions{\u2014}a computational pipeline which is inefficient when the target function(s) are known upfront. In this paper, we address this inefficiency by introducing AMCI, a method for amortizing Monte Carlo integration directly. AMCI operates similarly to amortized inference but produces three distinct amortized proposals, each tailored to a different component of the overall expectation calculation. 
At runtime, samples are produced separately from each amortized proposal, before being combined to an overall estimate of the expectation. We show that while existing approaches are fundamentally limited in the level of accuracy they can achieve, AMCI can theoretically produce arbitrarily small errors for any integrable target function using only a single sample from each proposal at runtime. We further show that it is able to empirically outperform the theoretically optimal selfnormalized importance sampler on a number of example problems. Furthermore, AMCI allows not only for amortizing over datasets but also amortizing over target functions.", "bibtex": "@InProceedings{pmlr-v97-golinski19a,\n title = \t {Amortized {M}onte {C}arlo Integration},\n author = {Golinski, Adam and Wood, Frank and Rainforth, Tom},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2309--2318},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/golinski19a/golinski19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/golinski19a.html},\n abstract = \t {Current approaches to amortizing Bayesian inference focus solely on approximating the posterior distribution. Typically, this approximation is, in turn, used to calculate expectations for one or more target functions{\u2014}a computational pipeline which is inefficient when the target function(s) are known upfront. In this paper, we address this inefficiency by introducing AMCI, a method for amortizing Monte Carlo integration directly. AMCI operates similarly to amortized inference but produces three distinct amortized proposals, each tailored to a different component of the overall expectation calculation. At runtime, samples are produced separately from each amortized proposal, before being combined to an overall estimate of the expectation. We show that while existing approaches are fundamentally limited in the level of accuracy they can achieve, AMCI can theoretically produce arbitrarily small errors for any integrable target function using only a single sample from each proposal at runtime. We further show that it is able to empirically outperform the theoretically optimal selfnormalized importance sampler on a number of example problems. 
Furthermore, AMCI allows not only for amortizing over datasets but also amortizing over target functions.}\n}", "pdf": "http://proceedings.mlr.press/v97/golinski19a/golinski19a.pdf", "supp": "", "pdf_size": 501318, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7430114062861179606&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Statistics, University of Oxford, United Kingdom+Department of Engineering Science, University of Oxford, United Kingdom; Department of Computer Science, University of British Columbia, Vancouver, Canada; Department of Statistics, University of Oxford, United Kingdom", "aff_domain": "robots.ox.ac.uk; ;stats.ox.ac.uk", "email": "robots.ox.ac.uk; ;stats.ox.ac.uk", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/golinski19a.html", "aff_unique_index": "0+0;1;0", "aff_unique_norm": "University of Oxford;University of British Columbia", "aff_unique_dep": "Department of Statistics;Department of Computer Science", "aff_unique_url": "https://www.ox.ac.uk;https://www.ubc.ca", "aff_unique_abbr": "Oxford;UBC", "aff_campus_unique_index": "0+0;1;0", "aff_campus_unique": "Oxford;Vancouver", "aff_country_unique_index": "0+0;1;0", "aff_country_unique": "United Kingdom;Canada" }, { "title": "An Instability in Variational Inference for Topic Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3966", "id": "3966", "author_site": "Behrooz Ghorbani, Hamidreza Hakim Javadi, Andrea Montanari", "author": "Behrooz Ghorbani; Hamid Javadi; Andrea Montanari", "abstract": "Naive mean field variational methods are the state of-the-art approach to inference in topic modeling. We show that these methods suffer from an instability that can produce misleading conclusions. Namely, for certain regimes of the model parameters, variational inference outputs a non-trivial decomposition into topics. However -for the same parameter values- the data contain no actual information about the true topic decomposition, and the output of the algorithm is uncorrelated with it. In particular, the estimated posterior mean is wrong, and estimated credible regions do not achieve the nominal coverage. We discuss how this instability is remedied by more accurate mean field approximations.", "bibtex": "@InProceedings{pmlr-v97-ghorbani19a,\n title = \t {An Instability in Variational Inference for Topic Models},\n author = {Ghorbani, Behrooz and Javadi, Hamid and Montanari, Andrea},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2221--2231},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ghorbani19a/ghorbani19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ghorbani19a.html},\n abstract = \t {Naive mean field variational methods are the state of-the-art approach to inference in topic modeling. We show that these methods suffer from an instability that can produce misleading conclusions. Namely, for certain regimes of the model parameters, variational inference outputs a non-trivial decomposition into topics. However -for the same parameter values- the data contain no actual information about the true topic decomposition, and the output of the algorithm is uncorrelated with it. 
In particular, the estimated posterior mean is wrong, and estimated credible regions do not achieve the nominal coverage. We discuss how this instability is remedied by more accurate mean field approximations.}\n}", "pdf": "http://proceedings.mlr.press/v97/ghorbani19a/ghorbani19a.pdf", "supp": "", "pdf_size": 874057, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11836025418522429778&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Electrical Engineering, Stanford University, CA; Digital Signal Processing Group, Rice University, TX; Department of Statistics, Stanford University, CA", "aff_domain": "stanford.edu; ; ", "email": "stanford.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ghorbani19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Stanford University;Rice University", "aff_unique_dep": "Department of Electrical Engineering;Digital Signal Processing Group", "aff_unique_url": "https://www.stanford.edu;https://www.rice.edu", "aff_unique_abbr": "Stanford;Rice", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "California;Houston;CA", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "An Investigation into Neural Net Optimization via Hessian Eigenvalue Density", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4303", "id": "4303", "author_site": "Behrooz Ghorbani, Shankar Krishnan, Ying Xiao", "author": "Behrooz Ghorbani; Shankar Krishnan; Ying Xiao", "abstract": "To understand the dynamics of training in deep neural networks, we study the evolution of the Hessian eigenvalue density throughout the optimization process. In non-batch normalized networks, we observe the rapid appearance of large isolated eigenvalues in the spectrum, along with a surprising concentration of the gradient in the corresponding eigenspaces. In a batch normalized network, these two effects are almost absent. We give a theoretical rationale to partially explain these phenomena. As part of this work, we adapt advanced tools from numerical linear algebra that allow scalable and accurate estimation of the entire Hessian spectrum of ImageNet-scale neural networks; this technique may be of independent interest in other applications.", "bibtex": "@InProceedings{pmlr-v97-ghorbani19b,\n title = \t {An Investigation into Neural Net Optimization via Hessian Eigenvalue Density},\n author = {Ghorbani, Behrooz and Krishnan, Shankar and Xiao, Ying},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2232--2241},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ghorbani19b/ghorbani19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/ghorbani19b.html},\n abstract = \t {To understand the dynamics of training in deep neural networks, we study the evolution of the Hessian eigenvalue density throughout the optimization process. In non-batch normalized networks, we observe the rapid appearance of large isolated eigenvalues in the spectrum, along with a surprising concentration of the gradient in the corresponding eigenspaces. In a batch normalized network, these two effects are almost absent. We give a theoretical rationale to partially explain these phenomena. 
As part of this work, we adapt advanced tools from numerical linear algebra that allow scalable and accurate estimation of the entire Hessian spectrum of ImageNet-scale neural networks; this technique may be of independent interest in other applications.}\n}", "pdf": "http://proceedings.mlr.press/v97/ghorbani19b/ghorbani19b.pdf", "supp": "", "pdf_size": 1282244, "gs_citation": 383, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6414005755575052194&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Electrical Engineering, Stanford University + Machine Perception, Google Inc.; Machine Perception, Google Inc.; Machine Perception, Google Inc.", "aff_domain": "stanford.edu; ; ", "email": "stanford.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ghorbani19b.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": "Department of Electrical Engineering;Machine Perception", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0+1;1;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "An Investigation of Model-Free Planning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3809", "id": "3809", "author_site": "Arthur Guez, Mehdi Mirza, Karol Gregor, Rishabh Kabra, Sebastien Racaniere, Theophane Weber, David Raposo, Adam Santoro, Laurent Orseau, Tom Eccles, Greg Wayne, David Silver, Timothy Lillicrap", "author": "Arthur Guez; Mehdi Mirza; Karol Gregor; Rishabh Kabra; Sebastien Racaniere; Theophane Weber; David Raposo; Adam Santoro; Laurent Orseau; Tom Eccles; Greg Wayne; David Silver; Timothy Lillicrap", "abstract": "The field of reinforcement learning (RL) is facing increasingly challenging domains with combinatorial complexity. For an RL agent to address these challenges, it is essential that it can plan effectively. Prior work has typically utilized an explicit model of the environment, combined with a specific planning algorithm (such as tree search). More recently, a new family of methods have been proposed that learn how to plan, by providing the structure for planning via an inductive bias in the function approximator (such as a tree structured neural network), trained end-to-end by a model-free RL algorithm. In this paper, we go even further, and demonstrate empirically that an entirely model-free approach, without special structure beyond standard neural network components such as convolutional networks and LSTMs, can learn to exhibit many of the characteristics typically associated with a model-based planner. We measure our agent\u2019s effectiveness at planning in terms of its ability to generalize across a combinatorial and irreversible state space, its data efficiency, and its ability to utilize additional thinking time. We find that our agent has many of the characteristics that one might expect to find in a planning algorithm. 
Furthermore, it exceeds the state-of-the-art in challenging combinatorial domains such as Sokoban and outperforms other model-free approaches that utilize strong inductive biases toward planning.", "bibtex": "@InProceedings{pmlr-v97-guez19a,\n title = \t {An Investigation of Model-Free Planning},\n author = {Guez, Arthur and Mirza, Mehdi and Gregor, Karol and Kabra, Rishabh and Racaniere, Sebastien and Weber, Theophane and Raposo, David and Santoro, Adam and Orseau, Laurent and Eccles, Tom and Wayne, Greg and Silver, David and Lillicrap, Timothy},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2464--2473},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/guez19a/guez19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/guez19a.html},\n abstract = \t {The field of reinforcement learning (RL) is facing increasingly challenging domains with combinatorial complexity. For an RL agent to address these challenges, it is essential that it can plan effectively. Prior work has typically utilized an explicit model of the environment, combined with a specific planning algorithm (such as tree search). More recently, a new family of methods have been proposed that learn how to plan, by providing the structure for planning via an inductive bias in the function approximator (such as a tree structured neural network), trained end-to-end by a model-free RL algorithm. In this paper, we go even further, and demonstrate empirically that an entirely model-free approach, without special structure beyond standard neural network components such as convolutional networks and LSTMs, can learn to exhibit many of the characteristics typically associated with a model-based planner. We measure our agent\u2019s effectiveness at planning in terms of its ability to generalize across a combinatorial and irreversible state space, its data efficiency, and its ability to utilize additional thinking time. We find that our agent has many of the characteristics that one might expect to find in a planning algorithm. 
Furthermore, it exceeds the state-of-the-art in challenging combinatorial domains such as Sokoban and outperforms other model-free approaches that utilize strong inductive biases toward planning.}\n}", "pdf": "http://proceedings.mlr.press/v97/guez19a/guez19a.pdf", "supp": "", "pdf_size": 1690239, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7566080617462830679&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK", "aff_domain": "google.com;google.com;google.com;google.com; ; ; ; ; ; ; ; ; ", "email": "google.com;google.com;google.com;google.com; ; ; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 13, "oa": "https://proceedings.mlr.press/v97/guez19a.html", "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "An Optimal Private Stochastic-MAB Algorithm based on Optimal Private Stopping Rule", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4245", "id": "4245", "author_site": "Touqir Sajed, Or Sheffet", "author": "Touqir Sajed; Or Sheffet", "abstract": "We present a provably optimal differentially private algorithm for the stochastic multi-arm bandit problem, as opposed to the private analogue of the UCB-algorithm (Mishra and Thakurta, 2015; Tossou and Dimitrakakis, 2016) which doesn\u2019t meet the recently discovered lower-bound of $\\Omega \\left(\\frac{K\\log(T)}{\\epsilon} \\right)$ (Shariff and Sheffet, 2018). Our construction is based on a different algorithm, Successive Elimination (Even-Dar et al., 2002), that repeatedly pulls all remaining arms until an arm is found to be suboptimal and is then eliminated. In order to devise a private analogue of Successive Elimination we visit the problem of private", "bibtex": "@InProceedings{pmlr-v97-sajed19a,\n title = \t {An Optimal Private Stochastic-{MAB} Algorithm based on Optimal Private Stopping Rule},\n author = {Sajed, Touqir and Sheffet, Or},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5579--5588},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/sajed19a/sajed19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/sajed19a.html},\n abstract = \t {We present a provably optimal differentially private algorithm for the stochastic multi-arm bandit problem, as opposed to the private analogue of the UCB-algorithm (Mishra and Thakurta, 2015; Tossou and Dimitrakakis, 2016) which doesn\u2019t meet the recently discovered lower-bound of $\\Omega \\left(\\frac{K\\log(T)}{\\epsilon} \\right)$ (Shariff and Sheffet, 2018). 
Our construction is based on a different algorithm, Successive Elimination (Even-Dar et al., 2002), that repeatedly pulls all remaining arms until an arm is found to be suboptimal and is then eliminated. In order to devise a private analogue of Successive Elimination we visit the problem of private", "pdf": "http://proceedings.mlr.press/v97/sajed19a/sajed19a.pdf", "supp": "", "pdf_size": 455481, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3020293759047137437&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computing Science, University of Alberta, Edmonton AB, Canada; Department of Computing Science, University of Alberta, Edmonton AB, Canada", "aff_domain": "ualberta.ca;ualberta.ca", "email": "ualberta.ca;ualberta.ca", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/sajed19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Alberta", "aff_unique_dep": "Department of Computing Science", "aff_unique_url": "https://www.ualberta.ca", "aff_unique_abbr": "UAlberta", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Edmonton", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Analogies Explained: Towards Understanding Word Embeddings", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4051", "id": "4051", "author_site": "Carl Allen, Timothy Hospedales", "author": "Carl Allen; Timothy Hospedales", "abstract": "Word embeddings generated by neural network methods such as word2vec (W2V) are well known to exhibit seemingly linear behaviour, e.g. the embeddings of analogy \u201cwoman is to queen as man is to king\u201d approximately describe a parallelogram. This property is particularly intriguing since the embeddings are not trained to achieve it. Several explanations have been proposed, but each introduces assumptions that do not hold in practice. We derive a probabilistically grounded definition of paraphrasing that we re-interpret as word transformation, a mathematical description of \u201c$w_x$ is to $w_y$\u201d. From these concepts we prove existence of linear relationship between W2V-type embeddings that underlie the analogical phenomenon, identifying explicit error terms.", "bibtex": "@InProceedings{pmlr-v97-allen19a,\n title = \t {Analogies Explained: Towards Understanding Word Embeddings},\n author = {Allen, Carl and Hospedales, Timothy},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {223--231},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/allen19a/allen19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/allen19a.html},\n abstract = \t {Word embeddings generated by neural network methods such as word2vec (W2V) are well known to exhibit seemingly linear behaviour, e.g. the embeddings of analogy \u201cwoman is to queen as man is to king\u201d approximately describe a parallelogram. This property is particularly intriguing since the embeddings are not trained to achieve it. Several explanations have been proposed, but each introduces assumptions that do not hold in practice. We derive a probabilistically grounded definition of paraphrasing that we re-interpret as word transformation, a mathematical description of \u201c$w_x$ is to $w_y$\u201d. 
From these concepts we prove existence of linear relationship between W2V-type embeddings that underlie the analogical phenomenon, identifying explicit error terms.}\n}", "pdf": "http://proceedings.mlr.press/v97/allen19a/allen19a.pdf", "supp": "", "pdf_size": 634245, "gs_citation": 206, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15445529659618849253&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "School of Informatics, University of Edinburgh; School of Informatics, University of Edinburgh", "aff_domain": "ed.ac.uk; ", "email": "ed.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/allen19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "School of Informatics", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Edinburgh", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Analyzing Federated Learning through an Adversarial Lens", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3853", "id": "3853", "author_site": "Arjun Nitin Bhagoji, Supriyo Chakraborty, Prateek Mittal, Seraphin Calo", "author": "Arjun Nitin Bhagoji; Supriyo Chakraborty; Prateek Mittal; Seraphin Calo", "abstract": "Federated learning distributes model training among a multitude of agents, who, guided by privacy concerns, perform training using their local data but share only model parameter updates, for iterative aggregation at the server to train an overall global model. In this work, we explore how the federated learning setting gives rise to a new threat, namely model poisoning, which differs from traditional data poisoning. Model poisoning is carried out by an adversary controlling a small number of malicious agents (usually 1) with the aim of causing the global model to misclassify a set of chosen inputs with high con\ufb01dence. We explore a number of strategies to carry out this attack on deep neural networks, starting with targeted model poisoning using a simple boosting of the malicious agent\u2019s update to overcome the effects of other agents. We also propose two critical notions of stealth to detect malicious updates. We bypass these by including them in the adversarial objective to carry out stealthy model poisoning. We improve its stealth with the use of an alternating minimization strategy which alternately optimizes for stealth and the adversarial objective. We also empirically demonstrate that Byzantine-resilient aggregation strategies are not robust to our attacks. 
Our results indicate that highly constrained adversaries can carry out model poisoning attacks while maintaining stealth, thus highlighting the vulnerability of the federated learning setting and the need to develop effective defense strategies.", "bibtex": "@InProceedings{pmlr-v97-bhagoji19a,\n title = \t {Analyzing Federated Learning through an Adversarial Lens},\n author = {Bhagoji, Arjun Nitin and Chakraborty, Supriyo and Mittal, Prateek and Calo, Seraphin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {634--643},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bhagoji19a/bhagoji19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bhagoji19a.html},\n abstract = \t {Federated learning distributes model training among a multitude of agents, who, guided by privacy concerns, perform training using their local data but share only model parameter updates, for iterative aggregation at the server to train an overall global model. In this work, we explore how the federated learning setting gives rise to a new threat, namely model poisoning, which differs from traditional data poisoning. Model poisoning is carried out by an adversary controlling a small number of malicious agents (usually 1) with the aim of causing the global model to misclassify a set of chosen inputs with high con\ufb01dence. We explore a number of strategies to carry out this attack on deep neural networks, starting with targeted model poisoning using a simple boosting of the malicious agent\u2019s update to overcome the effects of other agents. We also propose two critical notions of stealth to detect malicious updates. We bypass these by including them in the adversarial objective to carry out stealthy model poisoning. We improve its stealth with the use of an alternating minimization strategy which alternately optimizes for stealth and the adversarial objective. We also empirically demonstrate that Byzantine-resilient aggregation strategies are not robust to our attacks. Our results indicate that highly constrained adversaries can carry out model poisoning attacks while maintaining stealth, thus highlighting the vulnerability of the federated learning setting and the need to develop effective defense strategies.}\n}", "pdf": "http://proceedings.mlr.press/v97/bhagoji19a/bhagoji19a.pdf", "supp": "", "pdf_size": 1325442, "gs_citation": 1470, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16839948122426603319&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Princeton University + I.B.M. Research; I.B.M. T.J. Watson Research Center; Princeton University; I.B.M. T.J. 
Watson Research Center", "aff_domain": "princeton.edu; ; ; ", "email": "princeton.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/bhagoji19a.html", "aff_unique_index": "0+1;1;0;1", "aff_unique_norm": "Princeton University;IBM", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.princeton.edu;https://www.ibm.com/research", "aff_unique_abbr": "Princeton;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Analyzing and Improving Representations with the Soft Nearest Neighbor Loss", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4280", "id": "4280", "author_site": "Nicholas Frosst, Nicolas Papernot, Geoffrey Hinton", "author": "Nicholas Frosst; Nicolas Papernot; Geoffrey Hinton", "abstract": "We explore and expand the Soft Nearest Neighbor Loss to measure the entanglement of class manifolds in representation space: i.e., how close pairs of points from the same class are relative to pairs of points from different classes. We demonstrate several use cases of the loss. As an analytical tool, it provides insights into the evolution of class similarity structures during learning. Surprisingly, we find that maximizing the entanglement of representations of different classes in the hidden layers is beneficial for discrimination in the final layer, possibly because it encourages representations to identify class-independent similarity structures. Maximizing the soft nearest neighbor loss in the hidden layers leads not only to better-calibrated estimates of uncertainty on outlier data but also marginally improved generalization. Data that is not from the training distribution can be recognized by observing that in the hidden layers, it has fewer than the normal number of neighbors from the predicted class.", "bibtex": "@InProceedings{pmlr-v97-frosst19a,\n title = \t {Analyzing and Improving Representations with the Soft Nearest Neighbor Loss},\n author = {Frosst, Nicholas and Papernot, Nicolas and Hinton, Geoffrey},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2012--2020},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/frosst19a/frosst19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/frosst19a.html},\n abstract = \t {We explore and expand the Soft Nearest Neighbor Loss to measure the entanglement of class manifolds in representation space: i.e., how close pairs of points from the same class are relative to pairs of points from different classes. We demonstrate several use cases of the loss. As an analytical tool, it provides insights into the evolution of class similarity structures during learning. Surprisingly, we find that maximizing the entanglement of representations of different classes in the hidden layers is beneficial for discrimination in the final layer, possibly because it encourages representations to identify class-independent similarity structures. Maximizing the soft nearest neighbor loss in the hidden layers leads not only to better-calibrated estimates of uncertainty on outlier data but also marginally improved generalization. 
Data that is not from the training distribution can be recognized by observing that in the hidden layers, it has fewer than the normal number of neighbors from the predicted class.}\n}", "pdf": "http://proceedings.mlr.press/v97/frosst19a/frosst19a.pdf", "supp": "", "pdf_size": 664861, "gs_citation": 190, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11092372353656200973&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Google Brain; Google Brain; Google Brain", "aff_domain": "google.com;google.com; ", "email": "google.com;google.com; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/frosst19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Brain", "aff_unique_url": "https://brain.google.com", "aff_unique_abbr": "Google Brain", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Anomaly Detection With Multiple-Hypotheses Predictions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3586", "id": "3586", "author_site": "Duc Tam Nguyen, Zhongyu Lou, Michael Klar, Thomas Brox", "author": "Duc Tam Nguyen; Zhongyu Lou; Michael Klar; Thomas Brox", "abstract": "In one-class-learning tasks, only the normal case (foreground) can be modeled with data, whereas the variation of all possible anomalies is too erratic to be described by samples. Thus, due to the lack of representative data, the wide-spread discriminative approaches cannot cover such learning tasks, and rather generative models,which attempt to learn the input density of the foreground, are used. However, generative models suffer from a large input dimensionality (as in images) and are typically inefficient learners.We propose to learn the data distribution of the foreground more efficiently with a multi-hypotheses autoencoder. Moreover, the model is criticized by a discriminator, which prevents artificial data modes not supported by data, and which enforces diversity across hypotheses. Our multiple-hypotheses-based anomaly detection framework allows the reliable identification of out-of-distribution samples. For anomaly detection on CIFAR-10, it yields up to 3.9% points improvement over previously reported results. On a real anomaly detection task, the approach reduces the error of the baseline models from 6.8% to 1.5%.", "bibtex": "@InProceedings{pmlr-v97-nguyen19b,\n title = \t {Anomaly Detection With Multiple-Hypotheses Predictions},\n author = {Nguyen, Duc Tam and Lou, Zhongyu and Klar, Michael and Brox, Thomas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4800--4809},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nguyen19b/nguyen19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/nguyen19b.html},\n abstract = \t {In one-class-learning tasks, only the normal case (foreground) can be modeled with data, whereas the variation of all possible anomalies is too erratic to be described by samples. Thus, due to the lack of representative data, the wide-spread discriminative approaches cannot cover such learning tasks, and rather generative models,which attempt to learn the input density of the foreground, are used. 
However, generative models suffer from a large input dimensionality (as in images) and are typically inefficient learners.We propose to learn the data distribution of the foreground more efficiently with a multi-hypotheses autoencoder. Moreover, the model is criticized by a discriminator, which prevents artificial data modes not supported by data, and which enforces diversity across hypotheses. Our multiple-hypotheses-based anomaly detection framework allows the reliable identification of out-of-distribution samples. For anomaly detection on CIFAR-10, it yields up to 3.9% points improvement over previously reported results. On a real anomaly detection task, the approach reduces the error of the baseline models from 6.8% to 1.5%.}\n}", "pdf": "http://proceedings.mlr.press/v97/nguyen19b/nguyen19b.pdf", "supp": "", "pdf_size": 1724936, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1179800469792326390&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Computer Vision Group, University of Freiburg, Freiburg, Germany+Corporate Research, Robert Bosch GmbH, Renningen, Germany; Corporate Research, Robert Bosch GmbH, Renningen, Germany; Corporate Research, Robert Bosch GmbH, Renningen, Germany; Computer Vision Group, University of Freiburg, Freiburg, Germany", "aff_domain": "informatik.uni-freiburg.de;de.bosch.com;de.bosch.com;informatik.uni-freiburg.de", "email": "informatik.uni-freiburg.de;de.bosch.com;de.bosch.com;informatik.uni-freiburg.de", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/nguyen19b.html", "aff_unique_index": "0+1;1;1;0", "aff_unique_norm": "University of Freiburg;Robert Bosch GmbH", "aff_unique_dep": "Computer Vision Group;Corporate Research", "aff_unique_url": "https://www.uni-freiburg.de;https://www.bosch.com", "aff_unique_abbr": ";Bosch", "aff_campus_unique_index": "0+1;1;1;0", "aff_campus_unique": "Freiburg;Renningen", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Anytime Online-to-Batch, Optimism and Acceleration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4147", "id": "4147", "author": "Ashok Cutkosky", "abstract": "A standard way to obtain convergence guarantees in stochastic convex optimization is to run an online learning algorithm and then output the average of its iterates: the actual iterates of the online learning algorithm do not come with individual guarantees. We close this gap by introducing a black-box modification to any online learning algorithm whose iterates converge to the optimum in stochastic scenarios. We then consider the case of smooth losses, and show that combining our approach with optimistic online learning algorithms immediately yields a fast convergence rate of $O(L/T^{3/2}+\\sigma/\\sqrt{T})$ on $L$-smooth problems with $\\sigma^2$ variance in the gradients. Finally, we provide a reduction that converts any adaptive online algorithm into one that obtains the optimal accelerated rate of $\\tilde O(L/T^2 + \\sigma/\\sqrt{T})$, while still maintaining $\\tilde O(1/\\sqrt{T})$ convergence in the non-smooth setting. 
Importantly, our algorithms adapt to $L$ and $\\sigma$ automatically: they do not need to know either to obtain these rates.", "bibtex": "@InProceedings{pmlr-v97-cutkosky19a,\n title = \t {Anytime Online-to-Batch, Optimism and Acceleration},\n author = {Cutkosky, Ashok},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1446--1454},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cutkosky19a/cutkosky19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cutkosky19a.html},\n abstract = \t {A standard way to obtain convergence guarantees in stochastic convex optimization is to run an online learning algorithm and then output the average of its iterates: the actual iterates of the online learning algorithm do not come with individual guarantees. We close this gap by introducing a black-box modification to any online learning algorithm whose iterates converge to the optimum in stochastic scenarios. We then consider the case of smooth losses, and show that combining our approach with optimistic online learning algorithms immediately yields a fast convergence rate of $O(L/T^{3/2}+\\sigma/\\sqrt{T})$ on $L$-smooth problems with $\\sigma^2$ variance in the gradients. Finally, we provide a reduction that converts any adaptive online algorithm into one that obtains the optimal accelerated rate of $\\tilde O(L/T^2 + \\sigma/\\sqrt{T})$, while still maintaining $\\tilde O(1/\\sqrt{T})$ convergence in the non-smooth setting. Importantly, our algorithms adapt to $L$ and $\\sigma$ automatically: they do not need to know either to obtain these rates.}\n}", "pdf": "http://proceedings.mlr.press/v97/cutkosky19a/cutkosky19a.pdf", "supp": "", "pdf_size": 262130, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13500922681156091245&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Google Research, California, USA", "aff_domain": "cutkosky.com", "email": "cutkosky.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/cutkosky19a.html", "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0", "aff_campus_unique": "California", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Approximated Oracle Filter Pruning for Destructive CNN Width Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3835", "id": "3835", "author_site": "XIAOHAN DING, guiguang ding, Yuchen Guo, Jungong Han, Chenggang Yan", "author": "Xiaohan Ding; Guiguang Ding; Yuchen Guo; Jungong Han; Chenggang Yan", "abstract": "It is not easy to design and run Convolutional Neural Networks (CNNs) due to: 1) finding the optimal number of filters (i.e., the width) at each layer is tricky, given an architecture; and 2) the computational intensity of CNNs impedes the deployment on computationally limited devices. 
Oracle Pruning is designed to remove the unimportant filters from a well-trained CNN, which estimates the filters\u2019 importance by ablating them in turn and evaluating the model, thus delivers high accuracy but suffers from intolerable time complexity, and requires a given resulting width but cannot automatically find it. To address these problems, we propose Approximated Oracle Filter Pruning (AOFP), which keeps searching for the least important filters in a binary search manner, makes pruning attempts by masking out filters randomly, accumulates the resulting errors, and finetunes the model via a multi-path framework. As AOFP enables simultaneous pruning on multiple layers, we can prune an existing very deep CNN with acceptable time cost, negligible accuracy drop, and no heuristic knowledge, or re-design a model which exerts higher accuracy and faster inference.", "bibtex": "@InProceedings{pmlr-v97-ding19a,\n title = \t {Approximated Oracle Filter Pruning for Destructive {CNN} Width Optimization},\n author = {Ding, Xiaohan and Ding, Guiguang and Guo, Yuchen and Han, Jungong and Yan, Chenggang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1607--1616},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ding19a/ding19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ding19a.html},\n abstract = \t {It is not easy to design and run Convolutional Neural Networks (CNNs) due to: 1) finding the optimal number of filters (i.e., the width) at each layer is tricky, given an architecture; and 2) the computational intensity of CNNs impedes the deployment on computationally limited devices. Oracle Pruning is designed to remove the unimportant filters from a well-trained CNN, which estimates the filters\u2019 importance by ablating them in turn and evaluating the model, thus delivers high accuracy but suffers from intolerable time complexity, and requires a given resulting width but cannot automatically find it. To address these problems, we propose Approximated Oracle Filter Pruning (AOFP), which keeps searching for the least important filters in a binary search manner, makes pruning attempts by masking out filters randomly, accumulates the resulting errors, and finetunes the model via a multi-path framework. As AOFP enables simultaneous pruning on multiple layers, we can prune an existing very deep CNN with acceptable time cost, negligible accuracy drop, and no heuristic knowledge, or re-design a model which exerts higher accuracy and faster inference.}\n}", "pdf": "http://proceedings.mlr.press/v97/ding19a/ding19a.pdf", "supp": "", "pdf_size": 2649472, "gs_citation": 151, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=979238780615518812&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/ding19a.html" }, { "title": "Approximating Orthogonal Matrices with Effective Givens Factorization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3791", "id": "3791", "author_site": "Thomas Frerix, Joan Bruna", "author": "Thomas Frerix; Joan Bruna", "abstract": "We analyze effective approximation of unitary matrices. 
In our formulation, a unitary matrix is represented as a product of rotations in two-dimensional subspaces, so-called Givens rotations. Instead of the quadratic dimension dependence when applying a dense matrix, applying such an approximation scales with the number of factors, each of which can be implemented efficiently. Consequently, in settings where an approximation is once computed and then applied many times, such a representation becomes advantageous. Although effective Givens factorization is not possible for generic unitary operators, we show that minimizing a sparsity-inducing objective with a coordinate descent algorithm on the unitary group yields good factorizations for structured matrices. Canonical applications of such a setup are orthogonal basis transforms. We demonstrate numerical results of approximating the graph Fourier transform, which is the matrix obtained when diagonalizing a graph Laplacian.", "bibtex": "@InProceedings{pmlr-v97-frerix19a,\n title = \t {Approximating Orthogonal Matrices with Effective Givens Factorization},\n author = {Frerix, Thomas and Bruna, Joan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1993--2001},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/frerix19a/frerix19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/frerix19a.html},\n abstract = \t {We analyze effective approximation of unitary matrices. In our formulation, a unitary matrix is represented as a product of rotations in two-dimensional subspaces, so-called Givens rotations. Instead of the quadratic dimension dependence when applying a dense matrix, applying such an approximation scales with the number of factors, each of which can be implemented efficiently. Consequently, in settings where an approximation is once computed and then applied many times, such a representation becomes advantageous. Although effective Givens factorization is not possible for generic unitary operators, we show that minimizing a sparsity-inducing objective with a coordinate descent algorithm on the unitary group yields good factorizations for structured matrices. Canonical applications of such a setup are orthogonal basis transforms.
We demonstrate numerical results of approximating the graph Fourier transform, which is the matrix obtained when diagonalizing a graph Laplacian.}\n}", "pdf": "http://proceedings.mlr.press/v97/frerix19a/frerix19a.pdf", "supp": "", "pdf_size": 352075, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16649468225264145943&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Technical University of Munich; New York University", "aff_domain": "tum.de;cims.nyu.edu", "email": "tum.de;cims.nyu.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/frerix19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Technical University of Munich;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.nyu.edu", "aff_unique_abbr": "TUM;NYU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Germany;United States" }, { "title": "Approximation and non-parametric estimation of ResNet-type convolutional neural networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3724", "id": "3724", "author_site": "Kenta Oono, Taiji Suzuki", "author": "Kenta Oono; Taiji Suzuki", "abstract": "Convolutional neural networks (CNNs) have been shown to achieve optimal approximation and estimation error rates (in minimax sense) in several function classes. However, previous analyzed optimal CNNs are unrealistically wide and difficult to obtain via optimization due to sparse constraints in important function classes, including the H\u00f6lder class. We show a ResNet-type CNN can attain the minimax optimal error rates in these classes in more plausible situations \u2013 it can be dense, and its width, channel size, and filter size are constant with respect to sample size. The key idea is that we can replicate the learning ability of Fully-connected neural networks (FNNs) by tailored CNNs, as long as the FNNs have", "bibtex": "@InProceedings{pmlr-v97-oono19a,\n title = \t {Approximation and non-parametric estimation of {R}es{N}et-type convolutional neural networks},\n author = {Oono, Kenta and Suzuki, Taiji},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4922--4931},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/oono19a/oono19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/oono19a.html},\n abstract = \t {Convolutional neural networks (CNNs) have been shown to achieve optimal approximation and estimation error rates (in minimax sense) in several function classes. However, previous analyzed optimal CNNs are unrealistically wide and difficult to obtain via optimization due to sparse constraints in important function classes, including the H\u00f6lder class. We show a ResNet-type CNN can attain the minimax optimal error rates in these classes in more plausible situations \u2013 it can be dense, and its width, channel size, and filter size are constant with respect to sample size. 
The key idea is that we can replicate the learning ability of Fully-connected neural networks (FNNs) by tailored CNNs, as long as the FNNs have", "pdf": "http://proceedings.mlr.press/v97/oono19a/oono19a.pdf", "supp": "", "pdf_size": 392160, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7569493455271789411&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Graduate School of Information Science and Technology, The University of Tokyo, Tokyo, Japan+Preferred Networks, Inc. (PFN), Tokyo, Japan; Graduate School of Information Science and Technology, The University of Tokyo, Tokyo, Japan+Center for Advanced Intelligence Project (AIP), RIKEN, Tokyo, Japan", "aff_domain": "mist.i.u-tokyo.ac.jp; ", "email": "mist.i.u-tokyo.ac.jp; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/oono19a.html", "aff_unique_index": "0+1;0+2", "aff_unique_norm": "University of Tokyo;Preferred Networks, Inc.;RIKEN", "aff_unique_dep": "Graduate School of Information Science and Technology;;Center for Advanced Intelligence Project (AIP)", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.preferred-networks.com;https://www.riken.jp", "aff_unique_abbr": "UTokyo;PFN;RIKEN", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Tokyo", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "Japan" }, { "title": "Are Generative Classifiers More Robust to Adversarial Attacks?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3897", "id": "3897", "author_site": "Yingzhen Li, John Bradshaw, Yash Sharma", "author": "Yingzhen Li; John Bradshaw; Yash Sharma", "abstract": "There is a rising interest in studying the robustness of deep neural network classifiers against adversaries, with both advanced attack and defence techniques being actively developed. However, most recent work focuses on discriminative classifiers, which only model the conditional distribution of the labels given the inputs. In this paper, we propose and investigate the deep Bayes classifier, which improves classical naive Bayes with conditional deep generative models. We further develop detection methods for adversarial examples, which reject inputs with low likelihood under the generative model. Experimental results suggest that deep Bayes classifiers are more robust than deep discriminative classifiers, and that the proposed detection methods are effective against many recently proposed attacks.", "bibtex": "@InProceedings{pmlr-v97-li19a,\n title = \t {Are Generative Classifiers More Robust to Adversarial Attacks?},\n author = {Li, Yingzhen and Bradshaw, John and Sharma, Yash},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3804--3814},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19a/li19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19a.html},\n abstract = \t {There is a rising interest in studying the robustness of deep neural network classifiers against adversaries, with both advanced attack and defence techniques being actively developed. However, most recent work focuses on discriminative classifiers, which only model the conditional distribution of the labels given the inputs. 
In this paper, we propose and investigate the deep Bayes classifier, which improves classical naive Bayes with conditional deep generative models. We further develop detection methods for adversarial examples, which reject inputs with low likelihood under the generative model. Experimental results suggest that deep Bayes classifiers are more robust than deep discriminative classifiers, and that the proposed detection methods are effective against many recently proposed attacks.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19a/li19a.pdf", "supp": "", "pdf_size": 1531025, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10770378244624939531&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "aff": "Microsoft Research Cambridge, UK; University of Cambridge, UK + Max Planck Institute for Intelligent Systems, Germany; Eberhard Karls University of T\u00fcbingen, Germany", "aff_domain": "microsoft.com; ; ", "email": "microsoft.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/li19a.html", "aff_unique_index": "0;1+2;3", "aff_unique_norm": "Microsoft;University of Cambridge;Max Planck Institute for Intelligent Systems;Eberhard Karls University of T\u00fcbingen", "aff_unique_dep": "Research;;;", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/cambridge;https://www.cam.ac.uk;https://www.mpi-is.mpg.de;https://www.uni-tuebingen.de/", "aff_unique_abbr": "MSR Cambridge;Cambridge;MPI-IS;Uni T\u00fcbingen", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0+1;1", "aff_country_unique": "United Kingdom;Germany" }, { "title": "Area Attention", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3744", "id": "3744", "author_site": "Yang Li, Lukasz Kaiser, Samy Bengio, Si Si", "author": "Yang Li; Lukasz Kaiser; Samy Bengio; Si Si", "abstract": "Existing attention mechanisms are trained to attend to individual items in a collection (the memory) with a predefined, fixed granularity, e.g., a word token or an image grid. We propose area attention: a way to attend to areas in the memory, where each area contains a group of items that are structurally adjacent, e.g., spatially for a 2D memory such as images, or temporally for a 1D memory such as natural language sentences. Importantly, the shape and the size of an area are dynamically determined via learning, which enables a model to attend to information with varying granularity. Area attention can easily work with existing model architectures such as multi-head attention for simultaneously attending to multiple areas in the memory. We evaluate area attention on two tasks: neural machine translation (both character and token-level) and image captioning, and improve upon strong (state-of-the-art) baselines in all the cases. 
These improvements are obtainable with a basic form of area attention that is parameter free.", "bibtex": "@InProceedings{pmlr-v97-li19e,\n title = \t {Area Attention},\n author = {Li, Yang and Kaiser, Lukasz and Bengio, Samy and Si, Si},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3846--3855},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19e/li19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19e.html},\n abstract = \t {Existing attention mechanisms are trained to attend to individual items in a collection (the memory) with a predefined, fixed granularity, e.g., a word token or an image grid. We propose area attention: a way to attend to areas in the memory, where each area contains a group of items that are structurally adjacent, e.g., spatially for a 2D memory such as images, or temporally for a 1D memory such as natural language sentences. Importantly, the shape and the size of an area are dynamically determined via learning, which enables a model to attend to information with varying granularity. Area attention can easily work with existing model architectures such as multi-head attention for simultaneously attending to multiple areas in the memory. We evaluate area attention on two tasks: neural machine translation (both character and token-level) and image captioning, and improve upon strong (state-of-the-art) baselines in all the cases. These improvements are obtainable with a basic form of area attention that is parameter free.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19e/li19e.pdf", "supp": "", "pdf_size": 501337, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8914327285913451682&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "aff": "Google Research, Mountain View, CA, USA; Google Research, Mountain View, CA, USA; Google Research, Mountain View, CA, USA; Google Research, Mountain View, CA, USA", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/li19e.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Asynchronous Batch Bayesian Optimisation with Improved Local Penalisation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4152", "id": "4152", "author_site": "Ahsan Alvi, Binxin Ru, Jan-Peter Calliess, Stephen Roberts, Michael A Osborne", "author": "Ahsan Alvi; Binxin Ru; Jan-Peter Calliess; Stephen Roberts; Michael A. Osborne", "abstract": "Batch Bayesian optimisation (BO) has been successfully applied to hyperparameter tuning using parallel computing, but it is wasteful of resources: workers that complete jobs ahead of others are left idle. We address this problem by developing an approach, Penalising Locally for Asynchronous Bayesian Optimisation on K Workers (PLAyBOOK), for asynchronous parallel BO. We demonstrate empirically the efficacy of PLAyBOOK and its variants on synthetic tasks and a real-world problem. 
We undertake a comparison between synchronous and asynchronous BO, and show that asynchronous BO often outperforms synchronous batch BO in both wall-clock time and sample efficiency.", "bibtex": "@InProceedings{pmlr-v97-alvi19a,\n title = \t {Asynchronous Batch {B}ayesian Optimisation with Improved Local Penalisation},\n author = {Alvi, Ahsan and Ru, Binxin and Calliess, Jan-Peter and Roberts, Stephen and Osborne, Michael A.},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {253--262},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/alvi19a/alvi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/alvi19a.html},\n abstract = \t {Batch Bayesian optimisation (BO) has been successfully applied to hyperparameter tuning using parallel computing, but it is wasteful of resources: workers that complete jobs ahead of others are left idle. We address this problem by developing an approach, Penalising Locally for Asynchronous Bayesian Optimisation on K Workers (PLAyBOOK), for asynchronous parallel BO. We demonstrate empirically the efficacy of PLAyBOOK and its variants on synthetic tasks and a real-world problem. We undertake a comparison between synchronous and asynchronous BO, and show that asynchronous BO often outperforms synchronous batch BO in both wall-clock time and sample efficiency.}\n}", "pdf": "http://proceedings.mlr.press/v97/alvi19a/alvi19a.pdf", "supp": "", "pdf_size": 7513808, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17891210137592442168&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Engineering Science, University of Oxford+Mind Foundry Ltd., Oxford, UK+Oxford-Man Institute of Quantitative Finance; Department of Engineering Science, University of Oxford+Mind Foundry Ltd., Oxford, UK+Oxford-Man Institute of Quantitative Finance; Department of Engineering Science, University of Oxford; Department of Engineering Science, University of Oxford+Mind Foundry Ltd., Oxford, UK+Oxford-Man Institute of Quantitative Finance; Department of Engineering Science, University of Oxford+Mind Foundry Ltd., Oxford, UK+Oxford-Man Institute of Quantitative Finance", "aff_domain": "robots.ox.ac.uk;robots.ox.ac.uk; ;robots.ox.ac.uk;robots.ox.ac.uk", "email": "robots.ox.ac.uk;robots.ox.ac.uk; ;robots.ox.ac.uk;robots.ox.ac.uk", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/alvi19a.html", "aff_unique_index": "0+1+0;0+1+0;0;0+1+0;0+1+0", "aff_unique_norm": "University of Oxford;Mind Foundry Ltd.", "aff_unique_dep": "Department of Engineering Science;", "aff_unique_url": "https://www.ox.ac.uk;https://www.mindfoundry.co.uk", "aff_unique_abbr": "Oxford;", "aff_campus_unique_index": "0+0+0;0+0+0;0;0+0+0;0+0+0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0+0+0;0+0+0;0;0+0+0;0+0+0", "aff_country_unique": "United Kingdom" }, { "title": "AutoVC: Zero-Shot Voice Style Transfer with Only Autoencoder Loss", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4214", "id": "4214", "author_site": "Kaizhi Qian, Yang Zhang, Shiyu Chang, Xuesong Yang, Mark Hasegawa-Johnson", "author": "Kaizhi Qian; Yang Zhang; Shiyu Chang; Xuesong Yang; Mark Hasegawa-Johnson", "abstract": "Despite the progress in voice conversion, 
many-to-many voice conversion trained on non-parallel data, as well as zero-shot voice conversion, remains under-explored. Deep style transfer algorithms, generative adversarial networks (GAN) in particular, are being applied as new solutions in this field. However, GAN training is very sophisticated and difficult, and there is no strong evidence that its generated speech is of good perceptual quality. In this paper, we propose a new style transfer scheme that involves only an autoencoder with a carefully designed bottleneck. We formally show that this scheme can achieve distribution-matching style transfer by training only on self-reconstruction loss. Based on this scheme, we proposed AutoVC, which achieves state-of-the-art results in many-to-many voice conversion with non-parallel data, and which is the first to perform zero-shot voice conversion.", "bibtex": "@InProceedings{pmlr-v97-qian19c,\n title = \t {{A}uto{VC}: Zero-Shot Voice Style Transfer with Only Autoencoder Loss},\n author = {Qian, Kaizhi and Zhang, Yang and Chang, Shiyu and Yang, Xuesong and Hasegawa-Johnson, Mark},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5210--5219},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/qian19c/qian19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/qian19c.html},\n abstract = \t {Despite the progress in voice conversion, many-to-many voice conversion trained on non-parallel data, as well as zero-shot voice conversion, remains under-explored. Deep style transfer algorithms, generative adversarial networks (GAN) in particular, are being applied as new solutions in this field. However, GAN training is very sophisticated and difficult, and there is no strong evidence that its generated speech is of good perceptual quality. In this paper, we propose a new style transfer scheme that involves only an autoencoder with a carefully designed bottleneck. We formally show that this scheme can achieve distribution-matching style transfer by training only on self-reconstruction loss. 
Based on this scheme, we proposed AutoVC, which achieves state-of-the-art results in many-to-many voice conversion with non-parallel data, and which is the first to perform zero-shot voice conversion.}\n}", "pdf": "http://proceedings.mlr.press/v97/qian19c/qian19c.pdf", "supp": "", "pdf_size": 606601, "gs_citation": 632, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16861313448156905141&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "University of Illinois at Urbana-Champaign, IL, USA+MIT-IBM Watson AI Lab, Cambridge, MA, USA+IBM Research, Cambridge, MA, USA; University of Illinois at Urbana-Champaign, IL, USA+MIT-IBM Watson AI Lab, Cambridge, MA, USA+IBM Research, Cambridge, MA, USA; MIT-IBM Watson AI Lab, Cambridge, MA, USA+IBM Research, Cambridge, MA, USA; University of Illinois at Urbana-Champaign, IL, USA; University of Illinois at Urbana-Champaign, IL, USA", "aff_domain": "illinois.edu; ; ; ; ", "email": "illinois.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/qian19c.html", "aff_unique_index": "0+1+1;0+1+1;1+1;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;IBM", "aff_unique_dep": ";AI Lab", "aff_unique_url": "https://illinois.edu;https://www.ibmwatson.com/", "aff_unique_abbr": "UIUC;MIT-IBM AI Lab", "aff_campus_unique_index": "0+1+1;0+1+1;1+1;0;0", "aff_campus_unique": "Urbana-Champaign;Cambridge", "aff_country_unique_index": "0+0+0;0+0+0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Automated Model Selection with Bayesian Quadrature", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3784", "id": "3784", "author_site": "Henry Chai, Jean-Francois Ton, Michael A Osborne, Roman Garnett", "author": "Henry Chai; Jean-Francois Ton; Michael A. Osborne; Roman Garnett", "abstract": "We present a novel technique for tailoring Bayesian quadrature (BQ) to model selection. The state-of-the-art for comparing the evidence of multiple models relies on Monte Carlo methods, which converge slowly and are unreliable for computationally expensive models. Although previous research has shown that BQ offers sample efficiency superior to Monte Carlo in computing the evidence of an individual model, applying BQ directly to model comparison may waste computation producing an overly-accurate estimate for the evidence of a clearly poor model. We propose an automated and efficient algorithm for computing the most-relevant quantity for model selection: the posterior model probability. Our technique maximizes the mutual information between this quantity and observations of the models\u2019 likelihoods, yielding efficient sample acquisition across disparate model spaces when likelihood observations are limited. Our method produces more-accurate posterior estimates using fewer likelihood evaluations than standard Bayesian quadrature and Monte Carlo estimators, as we demonstrate on synthetic and real-world examples.", "bibtex": "@InProceedings{pmlr-v97-chai19a,\n title = \t {Automated Model Selection with {B}ayesian Quadrature},\n author = {Chai, Henry and Ton, Jean-Francois and Osborne, Michael A. 
and Garnett, Roman},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {931--940},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chai19a/chai19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chai19a.html},\n abstract = \t {We present a novel technique for tailoring Bayesian quadrature (BQ) to model selection. The state-of-the-art for comparing the evidence of multiple models relies on Monte Carlo methods, which converge slowly and are unreliable for computationally expensive models. Although previous research has shown that BQ offers sample efficiency superior to Monte Carlo in computing the evidence of an individual model, applying BQ directly to model comparison may waste computation producing an overly-accurate estimate for the evidence of a clearly poor model. We propose an automated and efficient algorithm for computing the most-relevant quantity for model selection: the posterior model probability. Our technique maximizes the mutual information between this quantity and observations of the models\u2019 likelihoods, yielding efficient sample acquisition across disparate model spaces when likelihood observations are limited. Our method produces more-accurate posterior estimates using fewer likelihood evaluations than standard Bayesian quadrature and Monte Carlo estimators, as we demonstrate on synthetic and real-world examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/chai19a/chai19a.pdf", "supp": "", "pdf_size": 320293, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7204864902061428147&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science and Engineering, Washington University in St. Louis, Saint Louis, MO, USA; Department of Statistics, University of Oxford, Oxford, United Kingdom; Department of Engineering Science, University of Oxford, Oxford, United Kingdom+Mind Foundry, Oxford, United Kingdom; Department of Computer Science and Engineering, Washington University in St. Louis, Saint Louis, MO, USA", "aff_domain": "wustl.edu; ; ; ", "email": "wustl.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/chai19a.html", "aff_unique_index": "0;1;1+2;0", "aff_unique_norm": "Washington University in St. Louis;University of Oxford;Mind Foundry", "aff_unique_dep": "Department of Computer Science and Engineering;Department of Statistics;", "aff_unique_url": "https://wustl.edu;https://www.ox.ac.uk;https://www.mindfoundry.ai", "aff_unique_abbr": "WUSTL;Oxford;", "aff_campus_unique_index": "0;1;1+1;0", "aff_campus_unique": "Saint Louis;Oxford", "aff_country_unique_index": "0;1;1+1;0", "aff_country_unique": "United States;United Kingdom" }, { "title": "Automatic Classifiers as Scientific Instruments: One Step Further Away from Ground-Truth", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3562", "id": "3562", "author_site": "Jacob Whitehill, Anand Ramakrishnan", "author": "Jacob Whitehill; Anand Ramakrishnan", "abstract": "Automatic machine learning-based detectors of various psychological and social phenomena (e.g., emotion, stress, engagement) have great potential to advance basic science. 
However, when a detector d is trained to approximate an existing measurement tool (e.g., a questionnaire, observation protocol), then care must be taken when interpreting measurements collected using d since they are one step further removed from the underlying construct. We examine how the accuracy of d, as quantified by the correlation q of d\u2019s outputs with the ground-truth construct U, impacts the estimated correlation between U (e.g., stress) and some other phenomenon V (e.g., academic performance). In particular: (1) We show that if the true correlation between U and V is r, then the expected sample correlation, over all vectors T^n whose correlation with U is q, is qr. (2) We derive a formula for the probability that the sample correlation (over n subjects) using d is positive given that the true correlation is negative (and vice-versa); this probability can be substantial (around 20 - 30%) for values of n and q that have been used in recent affective computing studies. (3) With the goal to reduce the variance of correlations estimated by an automatic detector, we show that training multiple neural networks d(1) , . . . , d(m) using different training architectures and hyperparameters for the same detection task provides only limited \u201ccoverage\u201d of T^n.", "bibtex": "@InProceedings{pmlr-v97-whitehill19a,\n title = \t {Automatic Classifiers as Scientific Instruments: One Step Further Away from Ground-Truth},\n author = {Whitehill, Jacob and Ramakrishnan, Anand},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6757--6765},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/whitehill19a/whitehill19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/whitehill19a.html},\n abstract = \t {Automatic machine learning-based detectors of various psychological and social phenomena (e.g., emotion, stress, engagement) have great potential to advance basic science. However, when a detector d is trained to approximate an existing measurement tool (e.g., a questionnaire, observation protocol), then care must be taken when interpreting measurements collected using d since they are one step further removed from the underlying construct. We examine how the accuracy of d, as quantified by the correlation q of d\u2019s outputs with the ground-truth construct U, impacts the estimated correlation between U (e.g., stress) and some other phenomenon V (e.g., academic performance). In particular: (1) We show that if the true correlation between U and V is r, then the expected sample correlation, over all vectors T^n whose correlation with U is q, is qr. (2) We derive a formula for the probability that the sample correlation (over n subjects) using d is positive given that the true correlation is negative (and vice-versa); this probability can be substantial (around 20 - 30%) for values of n and q that have been used in recent affective computing studies. (3) With the goal to reduce the variance of correlations estimated by an automatic detector, we show that training multiple neural networks d(1) , . . .
, d(m) using different training architectures and hyperparameters for the same detection task provides only limited \u201ccoverage\u201d of T^n.}\n}", "pdf": "http://proceedings.mlr.press/v97/whitehill19a/whitehill19a.pdf", "supp": "", "pdf_size": 1921183, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15728781370198115085&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Worcester Polytechnic Institute (WPI), MA, USA; Department of Computer Science, Worcester Polytechnic Institute (WPI), MA, USA", "aff_domain": "wpi.edu; ", "email": "wpi.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/whitehill19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Worcester Polytechnic Institute", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.wpi.edu", "aff_unique_abbr": "WPI", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Worcester", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Automatic Posterior Transformation for Likelihood-Free Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4268", "id": "4268", "author_site": "David Greenberg, Marcel Nonnenmacher, Jakob Macke", "author": "David Greenberg; Marcel Nonnenmacher; Jakob Macke", "abstract": "How can one perform Bayesian inference on stochastic simulators with intractable likelihoods? A recent approach is to learn the posterior from adaptively proposed simulations using neural network-based conditional density estimators. However, existing methods are limited to a narrow range of proposal distributions or require importance weighting that can limit performance in practice. Here we present automatic posterior transformation (APT), a new sequential neural posterior estimation method for simulation-based inference. APT can modify the posterior estimate using arbitrary, dynamically updated proposals, and is compatible with powerful flow-based density estimators. It is more flexible, scalable and efficient than previous simulation-based inference techniques. APT can operate directly on high-dimensional time series and image data, opening up new applications for likelihood-free inference.", "bibtex": "@InProceedings{pmlr-v97-greenberg19a,\n title = \t {Automatic Posterior Transformation for Likelihood-Free Inference},\n author = {Greenberg, David and Nonnenmacher, Marcel and Macke, Jakob},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2404--2414},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/greenberg19a/greenberg19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/greenberg19a.html},\n abstract = \t {How can one perform Bayesian inference on stochastic simulators with intractable likelihoods? A recent approach is to learn the posterior from adaptively proposed simulations using neural network-based conditional density estimators. However, existing methods are limited to a narrow range of proposal distributions or require importance weighting that can limit performance in practice. Here we present automatic posterior transformation (APT), a new sequential neural posterior estimation method for simulation-based inference. 
APT can modify the posterior estimate using arbitrary, dynamically updated proposals, and is compatible with powerful flow-based density estimators. It is more flexible, scalable and efficient than previous simulation-based inference techniques. APT can operate directly on high-dimensional time series and image data, opening up new applications for likelihood-free inference.}\n}", "pdf": "http://proceedings.mlr.press/v97/greenberg19a/greenberg19a.pdf", "supp": "", "pdf_size": 1139868, "gs_citation": 392, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9520658637115522401&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Computational Neuroengineering, Department of Electrical and Computer Engineering, Technical University of Munich, Munich, Germany; Computational Neuroengineering, Department of Electrical and Computer Engineering, Technical University of Munich, Munich, Germany; Computational Neuroengineering, Department of Electrical and Computer Engineering, Technical University of Munich, Munich, Germany", "aff_domain": "tum.de;tum.de;tum.de", "email": "tum.de;tum.de;tum.de", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/greenberg19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Technical University of Munich", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Munich", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "title": "Autoregressive Energy Machines", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3871", "id": "3871", "author_site": "Conor Durkan, Charlie Nash", "author": "Charlie Nash; Conor Durkan", "abstract": "Neural density estimators are flexible families of parametric models which have seen widespread use in unsupervised machine learning in recent years. Maximum-likelihood training typically dictates that these models be constrained to specify an explicit density. However, this limitation can be overcome by instead using a neural network to specify an energy function, or unnormalized density, which can subsequently be normalized to obtain a valid distribution. The challenge with this approach lies in accurately estimating the normalizing constant of the high-dimensional energy function. We propose the Autoregressive Energy Machine, an energy-based model which simultaneously learns an unnormalized density and computes an importance-sampling estimate of the normalizing constant for each conditional in an autoregressive decomposition. The Autoregressive Energy Machine achieves state-of-the-art performance on a suite of density-estimation tasks.", "bibtex": "@InProceedings{pmlr-v97-durkan19a,\n title = \t {Autoregressive Energy Machines},\n author = {Nash, Charlie and Durkan, Conor},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1735--1744},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/durkan19a/durkan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/durkan19a.html},\n abstract = \t {Neural density estimators are flexible families of parametric models which have seen widespread use in unsupervised machine learning in recent years. 
Maximum-likelihood training typically dictates that these models be constrained to specify an explicit density. However, this limitation can be overcome by instead using a neural network to specify an energy function, or unnormalized density, which can subsequently be normalized to obtain a valid distribution. The challenge with this approach lies in accurately estimating the normalizing constant of the high-dimensional energy function. We propose the Autoregressive Energy Machine, an energy-based model which simultaneously learns an unnormalized density and computes an importance-sampling estimate of the normalizing constant for each conditional in an autoregressive decomposition. The Autoregressive Energy Machine achieves state-of-the-art performance on a suite of density-estimation tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/durkan19a/durkan19a.pdf", "supp": "", "pdf_size": 4236135, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6729811760374247021&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "School of Informatics, University of Edinburgh, United Kingdom; School of Informatics, University of Edinburgh, United Kingdom", "aff_domain": "ed.ac.uk;ed.ac.uk", "email": "ed.ac.uk;ed.ac.uk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/durkan19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "School of Informatics", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Edinburgh", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "BERT and PALs: Projected Attention Layers for Efficient Adaptation in Multi-Task Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4070", "id": "4070", "author_site": "Asa Cooper Stickland, Iain Murray", "author": "Asa Cooper Stickland; Iain Murray", "abstract": "Multi-task learning shares information between related tasks, sometimes reducing the number of parameters required. State-of-the-art results across multiple natural language understanding tasks in the GLUE benchmark have previously used transfer from a single large task: unsupervised pre-training with BERT, where a separate BERT model was fine-tuned for each task. We explore multi-task approaches that share a \\hbox{single} BERT model with a small number of additional task-specific parameters. 
Using new adaptation modules, PALs or \u2018projected attention layers\u2019, we match the performance of separately fine-tuned models on the GLUE benchmark with $\\approx$7 times fewer parameters, and obtain state-of-the-art results on the Recognizing Textual Entailment dataset.", "bibtex": "@InProceedings{pmlr-v97-stickland19a,\n title = \t {{BERT} and {PAL}s: Projected Attention Layers for Efficient Adaptation in Multi-Task Learning},\n author = {Stickland, Asa Cooper and Murray, Iain},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5986--5995},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/stickland19a/stickland19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/stickland19a.html},\n abstract = \t {Multi-task learning shares information between related tasks, sometimes reducing the number of parameters required. State-of-the-art results across multiple natural language understanding tasks in the GLUE benchmark have previously used transfer from a single large task: unsupervised pre-training with BERT, where a separate BERT model was fine-tuned for each task. We explore multi-task approaches that share a \\hbox{single} BERT model with a small number of additional task-specific parameters. Using new adaptation modules, PALs or \u2018projected attention layers\u2019, we match the performance of separately fine-tuned models on the GLUE benchmark with $\\approx$7 times fewer parameters, and obtain state-of-the-art results on the Recognizing Textual Entailment dataset.}\n}", "pdf": "http://proceedings.mlr.press/v97/stickland19a/stickland19a.pdf", "supp": "", "pdf_size": 257844, "gs_citation": 324, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3136454913064441910&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "School of Informatics, University of Edinburgh; School of Informatics, University of Edinburgh", "aff_domain": "ed.ac.uk; ", "email": "ed.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/stickland19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "School of Informatics", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Edinburgh", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Band-limited Training and Inference for Convolutional Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3587", "id": "3587", "author_site": "Adam Dziedzic, John Paparrizos, Sanjay Krishnan, Aaron Elmore, Michael Franklin", "author": "Adam Dziedzic; John Paparrizos; Sanjay Krishnan; Aaron Elmore; Michael Franklin", "abstract": "The convolutional layers are core building blocks of neural network architectures. In general, a convolutional filter applies to the entire frequency spectrum of the input data. We explore artificially constraining the frequency spectra of these filters and data, called band-limiting, during training. The frequency domain constraints apply to both the feed-forward and back-propagation steps. 
Experimentally, we observe that Convolutional Neural Networks (CNNs) are resilient to this compression scheme and results suggest that CNNs learn to leverage lower-frequency components. In particular, we found: (1) band-limited training can effectively control the resource usage (GPU and memory); (2) models trained with band-limited layers retain high prediction accuracy; and (3) requires no modification to existing training algorithms or neural network architectures to use unlike other compression schemes.", "bibtex": "@InProceedings{pmlr-v97-dziedzic19a,\n title = \t {Band-limited Training and Inference for Convolutional Neural Networks},\n author = {Dziedzic, Adam and Paparrizos, John and Krishnan, Sanjay and Elmore, Aaron and Franklin, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1745--1754},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/dziedzic19a/dziedzic19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/dziedzic19a.html},\n abstract = \t {The convolutional layers are core building blocks of neural network architectures. In general, a convolutional filter applies to the entire frequency spectrum of the input data. We explore artificially constraining the frequency spectra of these filters and data, called band-limiting, during training. The frequency domain constraints apply to both the feed-forward and back-propagation steps. Experimentally, we observe that Convolutional Neural Networks (CNNs) are resilient to this compression scheme and results suggest that CNNs learn to leverage lower-frequency components. 
In particular, we found: (1) band-limited training can effectively control the resource usage (GPU and memory); (2) models trained with band-limited layers retain high prediction accuracy; and (3) requires no modification to existing training algorithms or neural network architectures to use unlike other compression schemes.}\n}", "pdf": "http://proceedings.mlr.press/v97/dziedzic19a/dziedzic19a.pdf", "supp": "", "pdf_size": 2762848, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13311225323219881337&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, University of Chicago; Department of Computer Science, University of Chicago; Department of Computer Science, University of Chicago; Department of Computer Science, University of Chicago; Department of Computer Science, University of Chicago", "aff_domain": "uchicago.edu;uchicago.edu; ; ; ", "email": "uchicago.edu;uchicago.edu; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/dziedzic19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Chicago", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.uchicago.edu", "aff_unique_abbr": "UChicago", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bandit Multiclass Linear Classification: Efficient Algorithms for the Separable Case", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3672", "id": "3672", "author_site": "Alina Beygelzimer, David Pal, Balazs Szorenyi, Devanathan Thiruvenkatachari, Chen-Yu Wei, Chicheng Zhang", "author": "Alina Beygelzimer; David Pal; Balazs Szorenyi; Devanathan Thiruvenkatachari; Chen-Yu Wei; Chicheng Zhang", "abstract": "We study the problem of efficient online multiclass linear classification with bandit feedback, where all examples belong to one of $K$ classes and lie in the $d$-dimensional Euclidean space. Previous works have left open the challenge of designing efficient algorithms with finite mistake bounds when the data is linearly separable by a margin $\\gamma$. In this work, we take a first step towards this problem. We consider two notions of linear separability: strong and weak. 1. Under the strong linear separability condition, we design an efficient algorithm that achieves a near-optimal mistake bound of $O\\left(\\frac{K}{\\gamma^2} \\right)$. 2. Under the more challenging weak linear separability condition, we design an efficient algorithm with a mistake bound of $2^{\\widetilde{O}(\\min(K \\log^2 \\frac{1}{\\gamma}, \\sqrt{\\frac{1}{\\gamma}} \\log K))}$. 
Our algorithm is based on kernel Perceptron, which is inspired by the work of Klivans & Servedio (2008) on improperly learning intersection of halfspaces.", "bibtex": "@InProceedings{pmlr-v97-beygelzimer19a,\n title = \t {Bandit Multiclass Linear Classification: Efficient Algorithms for the Separable Case},\n author = {Beygelzimer, Alina and Pal, David and Szorenyi, Balazs and Thiruvenkatachari, Devanathan and Wei, Chen-Yu and Zhang, Chicheng},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {624--633},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/beygelzimer19a/beygelzimer19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/beygelzimer19a.html},\n abstract = \t {We study the problem of efficient online multiclass linear classification with bandit feedback, where all examples belong to one of $K$ classes and lie in the $d$-dimensional Euclidean space. Previous works have left open the challenge of designing efficient algorithms with finite mistake bounds when the data is linearly separable by a margin $\\gamma$. In this work, we take a first step towards this problem. We consider two notions of linear separability: strong and weak. 1. Under the strong linear separability condition, we design an efficient algorithm that achieves a near-optimal mistake bound of $O\\left(\\frac{K}{\\gamma^2} \\right)$. 2. Under the more challenging weak linear separability condition, we design an efficient algorithm with a mistake bound of $2^{\\widetilde{O}(\\min(K \\log^2 \\frac{1}{\\gamma}, \\sqrt{\\frac{1}{\\gamma}} \\log K))}$. 
Our algorithm is based on kernel Perceptron, which is inspired by the work of Klivans & Servedio (2008) on improperly learning intersection of halfspaces.}\n}", "pdf": "http://proceedings.mlr.press/v97/beygelzimer19a/beygelzimer19a.pdf", "supp": "", "pdf_size": 462482, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11422241676475211596&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Yahoo Research; Yahoo Research; Yahoo Research; New York University; University of Southern California; Microsoft Research", "aff_domain": "gmail.com; ; ; ; ; ", "email": "gmail.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/beygelzimer19a.html", "aff_unique_index": "0;0;0;1;2;3", "aff_unique_norm": "Yahoo;New York University;University of Southern California;Microsoft", "aff_unique_dep": "Yahoo Research;;;Microsoft Research", "aff_unique_url": "https://research.yahoo.com;https://www.nyu.edu;https://www.usc.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Yahoo Research;NYU;USC;MSR", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Batch Policy Learning under Constraints", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3576", "id": "3576", "author_site": "Hoang Le, Cameron Voloshin, Yisong Yue", "author": "Hoang Le; Cameron Voloshin; Yisong Yue", "abstract": "When learning policies for real-world domains, two important questions arise: (i) how to efficiently use pre-collected off-policy, non-optimal behavior data; and (ii) how to mediate among different competing objectives and constraints. We thus study the problem of batch policy learning under multiple constraints, and offer a systematic solution. We first propose a flexible meta-algorithm that admits any batch reinforcement learning and online learning procedure as subroutines. We then present a specific algorithmic instantiation and provide performance guarantees for the main objective and all constraints. As part of off-policy learning, we propose a simple method for off-policy policy evaluation (OPE) and derive PAC-style bounds. Our algorithm achieves strong empirical results in different domains, including in a challenging problem of simulated car driving subject to multiple constraints such as lane keeping and smooth driving. We also show experimentally that our OPE method outperforms other popular OPE techniques on a standalone basis, especially in a high-dimensional setting.", "bibtex": "@InProceedings{pmlr-v97-le19a,\n title = \t {Batch Policy Learning under Constraints},\n author = {Le, Hoang and Voloshin, Cameron and Yue, Yisong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3703--3712},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/le19a/le19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/le19a.html},\n abstract = \t {When learning policies for real-world domains, two important questions arise: (i) how to efficiently use pre-collected off-policy, non-optimal behavior data; and (ii) how to mediate among different competing objectives and constraints. 
We thus study the problem of batch policy learning under multiple constraints, and offer a systematic solution. We first propose a flexible meta-algorithm that admits any batch reinforcement learning and online learning procedure as subroutines. We then present a specific algorithmic instantiation and provide performance guarantees for the main objective and all constraints. As part of off-policy learning, we propose a simple method for off-policy policy evaluation (OPE) and derive PAC-style bounds. Our algorithm achieves strong empirical results in different domains, including in a challenging problem of simulated car driving subject to multiple constraints such as lane keeping and smooth driving. We also show experimentally that our OPE method outperforms other popular OPE techniques on a standalone basis, especially in a high-dimensional setting.}\n}", "pdf": "http://proceedings.mlr.press/v97/le19a/le19a.pdf", "supp": "", "pdf_size": 1166589, "gs_citation": 396, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13735809935554561658&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14, "aff": "California Institute of Technology; California Institute of Technology; California Institute of Technology", "aff_domain": "caltech.edu; ; ", "email": "caltech.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/le19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "California Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.caltech.edu", "aff_unique_abbr": "Caltech", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pasadena", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "BayesNAS: A Bayesian Approach for Neural Architecture Search", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4231", "id": "4231", "author_site": "Hongpeng Zhou, Minghao Yang, Jun Wang, Wei Pan", "author": "Hongpeng Zhou; Minghao Yang; Jun Wang; Wei Pan", "abstract": "One-Shot Neural Architecture Search (NAS) is a promising method to significantly reduce search time without any separate training. It can be treated as a Network Compression problem on the architecture parameters from an over-parameterized network. However, there are two issues associated with most one-shot NAS methods. First, dependencies between a node and its predecessors and successors are often disregarded which result in improper treatment over zero operations. Second, architecture parameters pruning based on their magnitude is questionable. In this paper, we employ the classic Bayesian learning approach to alleviate these two issues by modeling architecture parameters using hierarchical automatic relevance determination (HARD) priors. Unlike other NAS methods, we train the over-parameterized network for only one epoch then update the architecture. Impressively, this enabled us to find the architecture in both proxy and proxyless tasks on CIFAR-10 within only 0.2 GPU days using a single GPU. 
As a byproduct, our approach can be transferred directly to compress convolutional neural networks by enforcing structural sparsity which achieves extremely sparse networks without accuracy deterioration.", "bibtex": "@InProceedings{pmlr-v97-zhou19e,\n title = \t {{B}ayes{NAS}: A {B}ayesian Approach for Neural Architecture Search},\n author = {Zhou, Hongpeng and Yang, Minghao and Wang, Jun and Pan, Wei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7603--7613},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhou19e/zhou19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhou19e.html},\n abstract = \t {One-Shot Neural Architecture Search (NAS) is a promising method to significantly reduce search time without any separate training. It can be treated as a Network Compression problem on the architecture parameters from an over-parameterized network. However, there are two issues associated with most one-shot NAS methods. First, dependencies between a node and its predecessors and successors are often disregarded which result in improper treatment over zero operations. Second, architecture parameters pruning based on their magnitude is questionable. In this paper, we employ the classic Bayesian learning approach to alleviate these two issues by modeling architecture parameters using hierarchical automatic relevance determination (HARD) priors. Unlike other NAS methods, we train the over-parameterized network for only one epoch then update the architecture. Impressively, this enabled us to find the architecture in both proxy and proxyless tasks on CIFAR-10 within only 0.2 GPU days using a single GPU. 
As a byproduct, our approach can be transferred directly to compress convolutional neural networks by enforcing structural sparsity which achieves extremely sparse networks without accuracy deterioration.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhou19e/zhou19e.pdf", "supp": "", "pdf_size": 1300446, "gs_citation": 236, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14284560481999949717&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Department of Cognitive Robotics, Delft University of Technology, Netherlands; Department of Cognitive Robotics, Delft University of Technology, Netherlands; Department of Computer Science, University College London, UK; Department of Cognitive Robotics, Delft University of Technology, Netherlands", "aff_domain": "tudelft.nl;tudelft.nl;ucl.ac.uk;tudelft.nl", "email": "tudelft.nl;tudelft.nl;ucl.ac.uk;tudelft.nl", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/zhou19e.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Delft University of Technology;University College London", "aff_unique_dep": "Department of Cognitive Robotics;Department of Computer Science", "aff_unique_url": "https://www.tudelft.nl;https://www.ucl.ac.uk", "aff_unique_abbr": "TU Delft;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Netherlands;United Kingdom" }, { "title": "Bayesian Action Decoder for Deep Multi-Agent Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3726", "id": "3726", "author_site": "Jakob Foerster, Francis Song, Edward Hughes, Neil Burch, Iain Dunning, Shimon Whiteson, Matthew Botvinick, Michael Bowling", "author": "Jakob Foerster; Francis Song; Edward Hughes; Neil Burch; Iain Dunning; Shimon Whiteson; Matthew Botvinick; Michael Bowling", "abstract": "When observing the actions of others, humans make inferences about why they acted as they did, and what this implies about the world; humans also use the fact that their actions will be interpreted in this manner, allowing them to act informatively and thereby communicate efficiently with others. Although learning algorithms have recently achieved superhuman performance in a number of two-player, zero-sum games, scalable multi-agent reinforcement learning algorithms that can discover effective strategies and conventions in complex, partially observable settings have proven elusive. 
We present the", "bibtex": "@InProceedings{pmlr-v97-foerster19a,\n title = \t {{B}ayesian Action Decoder for Deep Multi-Agent Reinforcement Learning},\n author = {Foerster, Jakob and Song, Francis and Hughes, Edward and Burch, Neil and Dunning, Iain and Whiteson, Shimon and Botvinick, Matthew and Bowling, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1942--1951},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/foerster19a/foerster19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/foerster19a.html},\n abstract = \t {When observing the actions of others, humans make inferences about why they acted as they did, and what this implies about the world; humans also use the fact that their actions will be interpreted in this manner, allowing them to act informatively and thereby communicate efficiently with others. Although learning algorithms have recently achieved superhuman performance in a number of two-player, zero-sum games, scalable multi-agent reinforcement learning algorithms that can discover effective strategies and conventions in complex, partially observable settings have proven elusive. We present the", "pdf": "http://proceedings.mlr.press/v97/foerster19a/foerster19a.pdf", "supp": "", "pdf_size": 613159, "gs_citation": 200, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9798514568859664649&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/foerster19a.html" }, { "title": "Bayesian Counterfactual Risk Minimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3876", "id": "3876", "author_site": "Ben London, Ted Sandler", "author": "Ben London; Ted Sandler", "abstract": "We present a Bayesian view of counterfactual risk minimization (CRM) for offline learning from logged bandit feedback. Using PAC-Bayesian analysis, we derive a new generalization bound for the truncated inverse propensity score estimator. We apply the bound to a class of Bayesian policies, which motivates a novel, potentially data-dependent, regularization technique for CRM. Experimental results indicate that this technique outperforms standard $L_2$ regularization, and that it is competitive with variance regularization while being both simpler to implement and more computationally efficient.", "bibtex": "@InProceedings{pmlr-v97-london19a,\n title = \t {{B}ayesian Counterfactual Risk Minimization},\n author = {London, Ben and Sandler, Ted},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4125--4133},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/london19a/london19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/london19a.html},\n abstract = \t {We present a Bayesian view of counterfactual risk minimization (CRM) for offline learning from logged bandit feedback. Using PAC-Bayesian analysis, we derive a new generalization bound for the truncated inverse propensity score estimator. 
We apply the bound to a class of Bayesian policies, which motivates a novel, potentially data-dependent, regularization technique for CRM. Experimental results indicate that this technique outperforms standard $L_2$ regularization, and that it is competitive with variance regularization while being both simpler to implement and more computationally efficient.}\n}", "pdf": "http://proceedings.mlr.press/v97/london19a/london19a.pdf", "supp": "", "pdf_size": 396430, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=526491441804145700&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Amazon, Seattle, WA, USA; Amazon, Seattle, WA, USA", "aff_domain": "amazon.com; ", "email": "amazon.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/london19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon", "aff_unique_url": "https://www.amazon.com", "aff_unique_abbr": "Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seattle", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Deconditional Kernel Mean Embeddings", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3983", "id": "3983", "author_site": "Kelvin Hsu, Fabio Ramos", "author": "Kelvin Hsu; Fabio Ramos", "abstract": "Conditional kernel mean embeddings form an attractive nonparametric framework for representing conditional means of functions, describing the observation processes for many complex models. However, the recovery of the original underlying function of interest whose conditional mean was observed is a challenging inference task. We formalize deconditional kernel mean embeddings as a solution to this inverse problem, and show that it can be naturally viewed as a nonparametric Bayes' rule. Critically, we introduce the notion of task transformed Gaussian processes and establish deconditional kernel means as their posterior predictive mean. This connection provides Bayesian interpretations and uncertainty estimates for deconditional kernel mean embeddings, explains their regularization hyperparameters, and reveals a marginal likelihood for kernel hyperparameter learning. These revelations further enable practical applications such as likelihood-free inference and learning sparse representations for big data.", "bibtex": "@InProceedings{pmlr-v97-hsu19a,\n title = \t {{B}ayesian Deconditional Kernel Mean Embeddings},\n author = {Hsu, Kelvin and Ramos, Fabio},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2830--2838},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hsu19a/hsu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hsu19a.html},\n abstract = \t {Conditional kernel mean embeddings form an attractive nonparametric framework for representing conditional means of functions, describing the observation processes for many complex models. However, the recovery of the original underlying function of interest whose conditional mean was observed is a challenging inference task. We formalize deconditional kernel mean embeddings as a solution to this inverse problem, and show that it can be naturally viewed as a nonparametric Bayes' rule. 
Critically, we introduce the notion of task transformed Gaussian processes and establish deconditional kernel means as their posterior predictive mean. This connection provides Bayesian interpretations and uncertainty estimates for deconditional kernel mean embeddings, explains their regularization hyperparameters, and reveals a marginal likelihood for kernel hyperparameter learning. These revelations further enable practical applications such as likelihood-free inference and learning sparse representations for big data.}\n}", "pdf": "http://proceedings.mlr.press/v97/hsu19a/hsu19a.pdf", "supp": "", "pdf_size": 2952449, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5413730934184143183&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "University of Sydney+CSIRO, Sydney; University of Sydney+NVIDIA, Seattle", "aff_domain": "sydney.edu.au; ", "email": "sydney.edu.au; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/hsu19a.html", "aff_unique_index": "0+1;0+2", "aff_unique_norm": "University of Sydney;Commonwealth Scientific and Industrial Research Organisation;NVIDIA", "aff_unique_dep": ";;NVIDIA", "aff_unique_url": "https://www.sydney.edu.au;https://www.csiro.au;https://www.nvidia.com", "aff_unique_abbr": "USYD;CSIRO;NV", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Sydney;Seattle", "aff_country_unique_index": "0+0;0+1", "aff_country_unique": "Australia;United States" }, { "title": "Bayesian Generative Active Deep Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3619", "id": "3619", "author_site": "Toan Tran, Thanh-Toan Do, Ian Reid, Gustavo Carneiro", "author": "Toan Tran; Thanh-Toan Do; Ian Reid; Gustavo Carneiro", "abstract": "Deep learning models have demonstrated outstanding performance in several problems, but their training process tends to require immense amounts of computational and human resources for training and labeling, constraining the types of problems that can be tackled. Therefore, the design of effective training methods that require small labeled training sets is an important research direction that will allow a more effective use of resources. Among current approaches designed to address this issue, two are particularly interesting: data augmentation and active learning. Data augmentation achieves this goal by artificially generating new training points, while active learning relies on the selection of the \u201cmost informative\u201d subset of unlabeled training samples to be labelled by an oracle. Although successful in practice, data augmentation can waste computational resources because it indiscriminately generates samples that are not guaranteed to be informative, and active learning selects a small subset of informative samples (from a large un-annotated set) that may be insufficient for the training process. 
In this paper, we propose a Bayesian generative active deep learning approach that combines active learning with data augmentation \u2013 we provide theoretical and empirical evidence (MNIST, CIFAR-$\\{10,100\\}$, and SVHN) that our approach has more efficient training and better classification results than data augmentation and active learning.", "bibtex": "@InProceedings{pmlr-v97-tran19a,\n title = \t {{B}ayesian Generative Active Deep Learning},\n author = {Tran, Toan and Do, Thanh-Toan and Reid, Ian and Carneiro, Gustavo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6295--6304},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tran19a/tran19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tran19a.html},\n abstract = \t {Deep learning models have demonstrated outstanding performance in several problems, but their training process tends to require immense amounts of computational and human resources for training and labeling, constraining the types of problems that can be tackled. Therefore, the design of effective training methods that require small labeled training sets is an important research direction that will allow a more effective use of resources. Among current approaches designed to address this issue, two are particularly interesting: data augmentation and active learning. Data augmentation achieves this goal by artificially generating new training points, while active learning relies on the selection of the \u201cmost informative\u201d subset of unlabeled training samples to be labelled by an oracle. Although successful in practice, data augmentation can waste computational resources because it indiscriminately generates samples that are not guaranteed to be informative, and active learning selects a small subset of informative samples (from a large un-annotated set) that may be insufficient for the training process. 
In this paper, we propose a Bayesian generative active deep learning approach that combines active learning with data augmentation \u2013 we provide theoretical and empirical evidence (MNIST, CIFAR-$\\{10,100\\}$, and SVHN) that our approach has more efficient training and better classification results than data augmentation and active learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/tran19a/tran19a.pdf", "supp": "", "pdf_size": 1044297, "gs_citation": 181, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3794698160994325836&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": "University of Adelaide, Australia; University of Liverpool; University of Adelaide, Australia; University of Adelaide, Australia", "aff_domain": "adelaide.edu.au; ; ; ", "email": "adelaide.edu.au; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/tran19a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Adelaide;University of Liverpool", "aff_unique_dep": ";", "aff_unique_url": "https://www.adelaide.edu.au;https://www.liverpool.ac.uk", "aff_unique_abbr": "Adelaide;Liv Uni", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Australia;United Kingdom" }, { "title": "Bayesian Joint Spike-and-Slab Graphical Lasso", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3635", "id": "3635", "author_site": "Zehang Li, Tyler Mccormick, Samuel Clark", "author": "Zehang Li; Tyler Mccormick; Samuel Clark", "abstract": "In this article, we propose a new class of priors for Bayesian inference with multiple Gaussian graphical models. We introduce Bayesian treatments of two popular procedures, the group graphical lasso and the fused graphical lasso, and extend them to a continuous spike-and-slab framework to allow self-adaptive shrinkage and model selection simultaneously. We develop an EM algorithm that performs fast and dynamic explorations of posterior modes. Our approach selects sparse models efficiently and automatically with substantially smaller bias than would be induced by alternative regularization procedures. The performance of the proposed methods are demonstrated through simulation and two real data examples.", "bibtex": "@InProceedings{pmlr-v97-li19h,\n title = \t {{B}ayesian Joint Spike-and-Slab Graphical Lasso},\n author = {Li, Zehang and Mccormick, Tyler and Clark, Samuel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3877--3885},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19h/li19h.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19h.html},\n abstract = \t {In this article, we propose a new class of priors for Bayesian inference with multiple Gaussian graphical models. We introduce Bayesian treatments of two popular procedures, the group graphical lasso and the fused graphical lasso, and extend them to a continuous spike-and-slab framework to allow self-adaptive shrinkage and model selection simultaneously. We develop an EM algorithm that performs fast and dynamic explorations of posterior modes. Our approach selects sparse models efficiently and automatically with substantially smaller bias than would be induced by alternative regularization procedures. 
The performance of the proposed methods are demonstrated through simulation and two real data examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19h/li19h.pdf", "supp": "", "pdf_size": 500946, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11980207298770096957&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Biostatistics, Yale School of Public Health, New Haven, Connecticut, USA+Department of Statistics, University of Washington, Seattle, Washington, USA+Department of Sociology, University of Washington, Seattle, Washington, USA; Department of Statistics, University of Washington, Seattle, Washington, USA+Department of Sociology, University of Washington, Seattle, Washington, USA; Department of Sociology, Ohio State University, Columbus, Ohio, USA", "aff_domain": "yale.edu; ; ", "email": "yale.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/li19h.html", "aff_unique_index": "0+1+1;1+1;2", "aff_unique_norm": "Yale School of Public Health;University of Washington;Ohio State University", "aff_unique_dep": "Department of Biostatistics;Department of Statistics;Department of Sociology", "aff_unique_url": "https://publichealth.yale.edu;https://www.washington.edu;https://www.osu.edu", "aff_unique_abbr": "Yale SPH;UW;OSU", "aff_campus_unique_index": "0+1+1;1+1;2", "aff_campus_unique": "New Haven;Seattle;Columbus", "aff_country_unique_index": "0+0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Bayesian Nonparametric Federated Learning of Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3892", "id": "3892", "author_site": "Mikhail Yurochkin, Mayank Agarwal, Soumya Ghosh, Kristjan Greenewald, Nghia Hoang, Yasaman Khazaeni", "author": "Mikhail Yurochkin; Mayank Agarwal; Soumya Ghosh; Kristjan Greenewald; Nghia Hoang; Yasaman Khazaeni", "abstract": "In federated learning problems, data is scattered across different servers and exchanging or pooling it is often impractical or prohibited. We develop a Bayesian nonparametric framework for federated learning with neural networks. Each data server is assumed to provide local neural network weights, which are modeled through our framework. We then develop an inference approach that allows us to synthesize a more expressive global network without additional supervision, data pooling and with as few as a single communication round. We then demonstrate the efficacy of our approach on federated learning problems simulated from two popular image classification datasets.", "bibtex": "@InProceedings{pmlr-v97-yurochkin19a,\n title = \t {{B}ayesian Nonparametric Federated Learning of Neural Networks},\n author = {Yurochkin, Mikhail and Agarwal, Mayank and Ghosh, Soumya and Greenewald, Kristjan and Hoang, Nghia and Khazaeni, Yasaman},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7252--7261},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yurochkin19a/yurochkin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yurochkin19a.html},\n abstract = \t {In federated learning problems, data is scattered across different servers and exchanging or pooling it is often impractical or prohibited. 
We develop a Bayesian nonparametric framework for federated learning with neural networks. Each data server is assumed to provide local neural network weights, which are modeled through our framework. We then develop an inference approach that allows us to synthesize a more expressive global network without additional supervision, data pooling and with as few as a single communication round. We then demonstrate the efficacy of our approach on federated learning problems simulated from two popular image classification datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/yurochkin19a/yurochkin19a.pdf", "supp": "", "pdf_size": 6578583, "gs_citation": 932, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14489502397862024393&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "IBM Research, Cambridge+MIT-IBM Watson AI Lab; IBM Research, Cambridge+MIT-IBM Watson AI Lab; IBM Research, Cambridge+MIT-IBM Watson AI Lab+Center for Computational Health; IBM Research, Cambridge+MIT-IBM Watson AI Lab; IBM Research, Cambridge+MIT-IBM Watson AI Lab; IBM Research, Cambridge+MIT-IBM Watson AI Lab", "aff_domain": "ibm.com; ; ; ; ; ", "email": "ibm.com; ; ; ; ; ", "github": "https://github.com/IBM/probabilistic-federated-neural-matching", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/yurochkin19a.html", "aff_unique_index": "0+1;0+1;0+1+2;0+1;0+1;0+1", "aff_unique_norm": "IBM;Massachusetts Institute of Technology;Center for Computational Health", "aff_unique_dep": "IBM Research;IBM Watson AI Lab;Computational Health", "aff_unique_url": "https://www.ibm.com/research;https://www.mitibmwatsonailab.org;", "aff_unique_abbr": "IBM;MIT-IBM AI Lab;", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0;0+0", "aff_country_unique": "United States;" }, { "title": "Bayesian Optimization Meets Bayesian Optimal Stopping", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3919", "id": "3919", "author_site": "Zhongxiang Dai, Haibin Yu, Bryan Kian Hsiang Low, Patrick Jaillet", "author": "Zhongxiang Dai; Haibin Yu; Bryan Kian Hsiang Low; Patrick Jaillet", "abstract": "Bayesian optimization (BO) is a popular paradigm for optimizing the hyperparameters of machine learning (ML) models due to its sample efficiency. Many ML models require running an iterative training procedure (e.g., stochastic gradient descent). This motivates the question whether information available during the training process (e.g., validation accuracy after each epoch) can be exploited for improving the epoch efficiency of BO algorithms by early-stopping model training under hyperparameter settings that will end up under-performing and hence eliminating unnecessary training epochs. This paper proposes to unify BO (specifically, Gaussian process-upper confidence bound (GP-UCB)) with Bayesian optimal stopping (BO-BOS) to boost the epoch efficiency of BO. To achieve this, while GP-UCB is sample-efficient in the number of function evaluations, BOS complements it with epoch efficiency for each function evaluation by providing a principled optimal stopping mechanism for early stopping. BO-BOS preserves the (asymptotic) no-regret performance of GP-UCB using our specified choice of BOS parameters that is amenable to an elegant interpretation in terms of the exploration-exploitation trade-off. 
We empirically evaluate the performance of BO-BOS and demonstrate its generality in hyperparameter optimization of ML models and two other interesting applications.", "bibtex": "@InProceedings{pmlr-v97-dai19a,\n title = \t {{B}ayesian Optimization Meets {B}ayesian Optimal Stopping},\n author = {Dai, Zhongxiang and Yu, Haibin and Low, Bryan Kian Hsiang and Jaillet, Patrick},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1496--1506},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/dai19a/dai19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/dai19a.html},\n abstract = \t {Bayesian optimization (BO) is a popular paradigm for optimizing the hyperparameters of machine learning (ML) models due to its sample efficiency. Many ML models require running an iterative training procedure (e.g., stochastic gradient descent). This motivates the question whether information available during the training process (e.g., validation accuracy after each epoch) can be exploited for improving the epoch efficiency of BO algorithms by early-stopping model training under hyperparameter settings that will end up under-performing and hence eliminating unnecessary training epochs. This paper proposes to unify BO (specifically, Gaussian process-upper confidence bound (GP-UCB)) with Bayesian optimal stopping (BO-BOS) to boost the epoch efficiency of BO. To achieve this, while GP-UCB is sample-efficient in the number of function evaluations, BOS complements it with epoch efficiency for each function evaluation by providing a principled optimal stopping mechanism for early stopping. BO-BOS preserves the (asymptotic) no-regret performance of GP-UCB using our specified choice of BOS parameters that is amenable to an elegant interpretation in terms of the exploration-exploitation trade-off. 
We empirically evaluate the performance of BO-BOS and demonstrate its generality in hyperparameter optimization of ML models and two other interesting applications.}\n}", "pdf": "http://proceedings.mlr.press/v97/dai19a/dai19a.pdf", "supp": "", "pdf_size": 910613, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=262415177762177545&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, National University of Singapore, Republic of Singapore; Department of Computer Science, National University of Singapore, Republic of Singapore; Department of Computer Science, National University of Singapore, Republic of Singapore; Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, USA", "aff_domain": "comp.nus.edu.sg;comp.nus.edu.sg;comp.nus.edu.sg;mit.edu", "email": "comp.nus.edu.sg;comp.nus.edu.sg;comp.nus.edu.sg;mit.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/dai19a.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "National University of Singapore;Massachusetts Institute of Technology", "aff_unique_dep": "Department of Computer Science;Department of Electrical Engineering and Computer Science", "aff_unique_url": "https://www.nus.edu.sg;https://web.mit.edu", "aff_unique_abbr": "NUS;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Singapore;United States" }, { "title": "Bayesian Optimization of Composite Functions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3963", "id": "3963", "author_site": "Raul Astudillo, Peter I Frazier", "author": "Raul Astudillo; Peter Frazier", "abstract": "We consider optimization of composite objective functions, i.e., of the form $f(x)=g(h(x))$, where $h$ is a black-box derivative-free expensive-to-evaluate function with vector-valued outputs, and $g$ is a cheap-to-evaluate real-valued function. While these problems can be solved with standard Bayesian optimization, we propose a novel approach that exploits the composite structure of the objective function to substantially improve sampling efficiency. Our approach models $h$ using a multi-output Gaussian process and chooses where to sample using the expected improvement evaluated on the implied non-Gaussian posterior on $f$, which we call expected improvement for composite functions (EI-CF). Although EI-CF cannot be computed in closed form, we provide a novel stochastic gradient estimator that allows its efficient maximization. We also show that our approach is asymptotically consistent, i.e., that it recovers a globally optimal solution as sampling effort grows to infinity, generalizing previous convergence results for classical expected improvement. 
Numerical experiments show that our approach dramatically outperforms standard Bayesian optimization benchmarks, reducing simple regret by several orders of magnitude.", "bibtex": "@InProceedings{pmlr-v97-astudillo19a,\n title = \t {{B}ayesian Optimization of Composite Functions},\n author = {Astudillo, Raul and Frazier, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {354--363},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/astudillo19a/astudillo19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/astudillo19a.html},\n abstract = \t {We consider optimization of composite objective functions, i.e., of the form $f(x)=g(h(x))$, where $h$ is a black-box derivative-free expensive-to-evaluate function with vector-valued outputs, and $g$ is a cheap-to-evaluate real-valued function. While these problems can be solved with standard Bayesian optimization, we propose a novel approach that exploits the composite structure of the objective function to substantially improve sampling efficiency. Our approach models $h$ using a multi-output Gaussian process and chooses where to sample using the expected improvement evaluated on the implied non-Gaussian posterior on $f$, which we call expected improvement for composite functions (EI-CF). Although EI-CF cannot be computed in closed form, we provide a novel stochastic gradient estimator that allows its efficient maximization. We also show that our approach is asymptotically consistent, i.e., that it recovers a globally optimal solution as sampling effort grows to infinity, generalizing previous convergence results for classical expected improvement. 
Numerical experiments show that our approach dramatically outperforms standard Bayesian optimization benchmarks, reducing simple regret by several orders of magnitude.}\n}", "pdf": "http://proceedings.mlr.press/v97/astudillo19a/astudillo19a.pdf", "supp": "", "pdf_size": 4565488, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17436887215746606899&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "School of Operations Research and Information Engineering, Cornell University, Ithaca, NY, USA+Uber, San Francisco, CA, USA; School of Operations Research and Information Engineering, Cornell University, Ithaca, NY, USA+Uber, San Francisco, CA, USA", "aff_domain": "cornell.edu;cornell.edu", "email": "cornell.edu;cornell.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/astudillo19a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Cornell University;Uber", "aff_unique_dep": "School of Operations Research and Information Engineering;", "aff_unique_url": "https://www.cornell.edu;https://www.uber.com", "aff_unique_abbr": "Cornell;Uber", "aff_campus_unique_index": "0+1;0+1", "aff_campus_unique": "Ithaca;San Francisco", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "United States" }, { "title": "Bayesian leave-one-out cross-validation for large data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3825", "id": "3825", "author_site": "M\u00e5ns Magnusson, Michael Andersen, Johan Jonasson, Aki Vehtari", "author": "M\u00e5ns Magnusson; Michael Andersen; Johan Jonasson; Aki Vehtari", "abstract": "Model inference, such as model comparison, model checking, and model selection, is an important part of model development. Leave-one-out cross-validation (LOO) is a general approach for assessing the generalizability of a model, but unfortunately, LOO does not scale well to large datasets. We propose a combination of using approximate inference techniques and probability-proportional-to-size-sampling (PPS) for fast LOO model evaluation for large datasets. We provide both theoretical and empirical results showing good properties for large data.", "bibtex": "@InProceedings{pmlr-v97-magnusson19a,\n title = \t {{B}ayesian leave-one-out cross-validation for large data},\n author = {Magnusson, M{\\aa}ns and Andersen, Michael and Jonasson, Johan and Vehtari, Aki},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4244--4253},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/magnusson19a/magnusson19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/magnusson19a.html},\n abstract = \t {Model inference, such as model comparison, model checking, and model selection, is an important part of model development. Leave-one-out cross-validation (LOO) is a general approach for assessing the generalizability of a model, but unfortunately, LOO does not scale well to large datasets. We propose a combination of using approximate inference techniques and probability-proportional-to-size-sampling (PPS) for fast LOO model evaluation for large datasets. 
We provide both theoretical and empirical results showing good properties for large data.}\n}", "pdf": "http://proceedings.mlr.press/v97/magnusson19a/magnusson19a.pdf", "supp": "", "pdf_size": 290289, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8231064948439410794&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Department of Computer Science, Aalto University, Finland; Department of Computer Science, Aalto University, Finland + Department of Applied Mathematics and Computer Science, Technical University of Denmark, Denmark; Department of Mathematical Sciences, Chalmers University of Technology and University of Gothenburg, Sweden; Department of Computer Science, Aalto University, Finland", "aff_domain": "aalto.fi; ; ; ", "email": "aalto.fi; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/magnusson19a.html", "aff_unique_index": "0;0+1;2;0", "aff_unique_norm": "Aalto University;Technical University of Denmark;Chalmers University of Technology", "aff_unique_dep": "Department of Computer Science;Department of Applied Mathematics and Computer Science;Department of Mathematical Sciences", "aff_unique_url": "https://www.aalto.fi;https://www.tud.dk;https://www.chalmers.se", "aff_unique_abbr": "Aalto;DTU;Chalmers", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;2;0", "aff_country_unique": "Finland;Denmark;Sweden" }, { "title": "Beating Stochastic and Adversarial Semi-bandits Optimally and Simultaneously", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3810", "id": "3810", "author_site": "Julian Zimmert, Haipeng Luo, Chen-Yu Wei", "author": "Julian Zimmert; Haipeng Luo; Chen-Yu Wei", "abstract": "We develop the first general semi-bandit algorithm that simultaneously achieves $\\mathcal{O}(\\log T)$ regret for stochastic environments and $\\mathcal{O}(\\sqrt{T})$ regret for adversarial environments without knowledge of the regime or the number of rounds $T$. The leading problem-dependent constants of our bounds are not only optimal in some worst-case sense studied previously, but also optimal for two concrete instances of semi-bandit problems. Our algorithm and analysis extend the recent work of (Zimmert & Seldin, 2019) for the special case of multi-armed bandits, but importantly requires a novel hybrid regularizer designed specifically for semi-bandit. Experimental results on synthetic data show that our algorithm indeed performs well uniformly over different environments. 
We finally provide a preliminary extension of our results to the full bandit feedback.", "bibtex": "@InProceedings{pmlr-v97-zimmert19a,\n title = \t {Beating Stochastic and Adversarial Semi-bandits Optimally and Simultaneously},\n author = {Zimmert, Julian and Luo, Haipeng and Wei, Chen-Yu},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7683--7692},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zimmert19a/zimmert19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zimmert19a.html},\n abstract = \t {We develop the first general semi-bandit algorithm that simultaneously achieves $\\mathcal{O}(\\log T)$ regret for stochastic environments and $\\mathcal{O}(\\sqrt{T})$ regret for adversarial environments without knowledge of the regime or the number of rounds $T$. The leading problem-dependent constants of our bounds are not only optimal in some worst-case sense studied previously, but also optimal for two concrete instances of semi-bandit problems. Our algorithm and analysis extend the recent work of (Zimmert & Seldin, 2019) for the special case of multi-armed bandits, but importantly requires a novel hybrid regularizer designed specifically for semi-bandit. Experimental results on synthetic data show that our algorithm indeed performs well uniformly over different environments. We finally provide a preliminary extension of our results to the full bandit feedback.}\n}", "pdf": "http://proceedings.mlr.press/v97/zimmert19a/zimmert19a.pdf", "supp": "", "pdf_size": 449513, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8341679914934022750&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, University of Copenhagen, Copenhagen, Denmark; Department of Computer Science, University of Southern California, United States; Department of Computer Science, University of Southern California, United States", "aff_domain": "di.ku.dk;usc.edu;usc.edu", "email": "di.ku.dk;usc.edu;usc.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zimmert19a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Copenhagen;University of Southern California", "aff_unique_dep": "Department of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.ku.dk;https://www.usc.edu", "aff_unique_abbr": "UCPH;USC", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Copenhagen;Los Angeles", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Denmark;United States" }, { "title": "Benefits and Pitfalls of the Exponential Mechanism with Applications to Hilbert Spaces and Functional PCA", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3732", "id": "3732", "author_site": "Jordan Awan, Ana Kenney, Matthew Reimherr, Aleksandra Slavkovi\u0107", "author": "Jordan Awan; Ana Kenney; Matthew Reimherr; Aleksandra Slavkovi\u0107", "abstract": "The exponential mechanism is a fundamental tool of Differential Privacy (DP) due to its strong privacy guarantees and flexibility. We study its extension to settings with summaries based on infinite dimensional outputs such as with functional data analysis, shape analysis, and nonparametric statistics. 
We show that the mechanism must be designed with respect to a specific base measure over the output space, such as a Gaussian process. We provide a positive result that establishes a Central Limit Theorem for the exponential mechanism quite broadly. We also provide a negative result, showing that the magnitude of noise introduced for privacy is asymptotically non-negligible relative to the statistical estimation error. We develop an $\\ep$-DP mechanism for functional principal component analysis, applicable in separable Hilbert spaces, and demonstrate its performance via simulations and applications to two datasets.", "bibtex": "@InProceedings{pmlr-v97-awan19a,\n title = \t {Benefits and Pitfalls of the Exponential Mechanism with Applications to {H}ilbert Spaces and Functional {PCA}},\n author = {Awan, Jordan and Kenney, Ana and Reimherr, Matthew and Slavkovi{\\'c}, Aleksandra},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {374--384},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/awan19a/awan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/awan19a.html},\n abstract = \t {The exponential mechanism is a fundamental tool of Differential Privacy (DP) due to its strong privacy guarantees and flexibility. We study its extension to settings with summaries based on infinite dimensional outputs such as with functional data analysis, shape analysis, and nonparametric statistics. We show that the mechanism must be designed with respect to a specific base measure over the output space, such as a Gaussian process. We provide a positive result that establishes a Central Limit Theorem for the exponential mechanism quite broadly. We also provide a negative result, showing that the magnitude of noise introduced for privacy is asymptotically non-negligible relative to the statistical estimation error. We develop an $\\ep$-DP mechanism for functional principal component analysis, applicable in separable Hilbert spaces, and demonstrate its performance via simulations and applications to two datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/awan19a/awan19a.pdf", "supp": "", "pdf_size": 457060, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5922431302989512803&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Statistics, Pennsylvania State University; Department of Statistics, Pennsylvania State University; Department of Statistics, Pennsylvania State University; Department of Statistics, Pennsylvania State University", "aff_domain": "psu.edu; ; ; ", "email": "psu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/awan19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Pennsylvania State University", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.psu.edu", "aff_unique_abbr": "PSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Better generalization with less data using robust gradient descent", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3604", "id": "3604", "author_site": "Matthew J. 
Holland, Kazushi Ikeda", "author": "Matthew Holland; Kazushi Ikeda", "abstract": "For learning tasks where the data (or losses) may be heavy-tailed, algorithms based on empirical risk minimization may require a substantial number of observations in order to perform well off-sample. In pursuit of stronger performance under weaker assumptions, we propose a technique which uses a cheap and robust iterative estimate of the risk gradient, which can be easily fed into any steepest descent procedure. Finite-sample risk bounds are provided under weak moment assumptions on the loss gradient. The algorithm is simple to implement, and empirical tests using simulations and real-world data illustrate that more efficient and reliable learning is possible without prior knowledge of the loss tails.", "bibtex": "@InProceedings{pmlr-v97-holland19a,\n title = \t {Better generalization with less data using robust gradient descent},\n author = {Holland, Matthew and Ikeda, Kazushi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2761--2770},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/holland19a/holland19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/holland19a.html},\n abstract = \t {For learning tasks where the data (or losses) may be heavy-tailed, algorithms based on empirical risk minimization may require a substantial number of observations in order to perform well off-sample. In pursuit of stronger performance under weaker assumptions, we propose a technique which uses a cheap and robust iterative estimate of the risk gradient, which can be easily fed into any steepest descent procedure. Finite-sample risk bounds are provided under weak moment assumptions on the loss gradient. 
The algorithm is simple to implement, and empirical tests using simulations and real-world data illustrate that more efficient and reliable learning is possible without prior knowledge of the loss tails.}\n}", "pdf": "http://proceedings.mlr.press/v97/holland19a/holland19a.pdf", "supp": "", "pdf_size": 4568370, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7328878873565177954&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Institute of Scientific and Industrial Research, Osaka University; Division of Information Science, Nara Institute of Science and Technology", "aff_domain": "ar.sanken.osaka-u.ac.jp; ", "email": "ar.sanken.osaka-u.ac.jp; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/holland19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Osaka University;Nara Institute of Science and Technology", "aff_unique_dep": "Institute of Scientific and Industrial Research;Division of Information Science", "aff_unique_url": "https://www.osaka-u.ac.jp;https://www.nist.go.jp", "aff_unique_abbr": "OSU;NIST", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Osaka;Nara", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Beyond Adaptive Submodularity: Approximation Guarantees of Greedy Policy with Adaptive Submodularity Ratio", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3723", "id": "3723", "author_site": "Kaito Fujii, Shinsaku Sakaue", "author": "Kaito Fujii; Shinsaku Sakaue", "abstract": "We propose a new concept named adaptive submodularity ratio to study the greedy policy for sequential decision making. While the greedy policy is known to perform well for a wide variety of adaptive stochastic optimization problems in practice, its theoretical properties have been analyzed only for a limited class of problems. We narrow the gap between theory and practice by using adaptive submodularity ratio, which enables us to prove approximation guarantees of the greedy policy for a substantially wider class of problems. Examples of newly analyzed problems include important applications such as adaptive influence maximization and adaptive feature selection. Our adaptive submodularity ratio also provides bounds of adaptivity gaps. Experiments confirm that the greedy policy performs well with the applications being considered compared to standard heuristics.", "bibtex": "@InProceedings{pmlr-v97-fujii19a,\n title = \t {Beyond Adaptive Submodularity: Approximation Guarantees of Greedy Policy with Adaptive Submodularity Ratio},\n author = {Fujii, Kaito and Sakaue, Shinsaku},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2042--2051},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/fujii19a/fujii19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/fujii19a.html},\n abstract = \t {We propose a new concept named adaptive submodularity ratio to study the greedy policy for sequential decision making. While the greedy policy is known to perform well for a wide variety of adaptive stochastic optimization problems in practice, its theoretical properties have been analyzed only for a limited class of problems. 
We narrow the gap between theory and practice by using adaptive submodularity ratio, which enables us to prove approximation guarantees of the greedy policy for a substantially wider class of problems. Examples of newly analyzed problems include important applications such as adaptive influence maximization and adaptive feature selection. Our adaptive submodularity ratio also provides bounds of adaptivity gaps. Experiments confirm that the greedy policy performs well with the applications being considered compared to standard heuristics.}\n}", "pdf": "http://proceedings.mlr.press/v97/fujii19a/fujii19a.pdf", "supp": "", "pdf_size": 2034718, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6128197979654494099&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of Tokyo; NTT Communication Science Laboratories", "aff_domain": "mist.i.u-tokyo.ac.jp; ", "email": "mist.i.u-tokyo.ac.jp; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/fujii19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of Tokyo;NTT Communication Science Laboratories", "aff_unique_dep": ";", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.ntt-csl.com", "aff_unique_abbr": "UTokyo;NTT CSL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Japan" }, { "title": "Beyond Backprop: Online Alternating Minimization with Auxiliary Variables", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4325", "id": "4325", "author_site": "Anna Choromanska, Benjamin Cowen, Sadhana Kumaravel, Ronny Luss, Mattia Rigotti, Irina Rish, Paolo DiAchille, Viatcheslav Gurev, Brian Kingsbury, Ravi Tejwani, Djallel Bouneffouf", "author": "Anna Choromanska; Benjamin Cowen; Sadhana Kumaravel; Ronny Luss; Mattia Rigotti; Irina Rish; Paolo Diachille; Viatcheslav Gurev; Brian Kingsbury; Ravi Tejwani; Djallel Bouneffouf", "abstract": "Despite significant recent advances in deep neural networks, training them remains a challenge due to the highly non-convex nature of the objective function. State-of-the-art methods rely on error backpropagation, which suffers from several well-known issues, such as vanishing and exploding gradients, inability to handle non-differentiable nonlinearities and to parallelize weight-updates across layers, and biological implausibility. These limitations continue to motivate exploration of alternative training algorithms, including several recently proposed auxiliary-variable methods which break the complex nested objective function into local subproblems. However, those techniques are mainly offline (batch), which limits their applicability to extremely large datasets, as well as to online, continual or reinforcement learning. 
The main contribution of our work is a novel online (stochastic/mini-batch) alternating minimization (AM) approach for training deep neural networks, together with the first theoretical convergence guarantees for AM in stochastic settings and promising empirical results on a variety of architectures and datasets.", "bibtex": "@InProceedings{pmlr-v97-choromanska19a,\n title = \t {Beyond Backprop: Online Alternating Minimization with Auxiliary Variables},\n author = {Choromanska, Anna and Cowen, Benjamin and Kumaravel, Sadhana and Luss, Ronny and Rigotti, Mattia and Rish, Irina and Diachille, Paolo and Gurev, Viatcheslav and Kingsbury, Brian and Tejwani, Ravi and Bouneffouf, Djallel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1193--1202},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/choromanska19a/choromanska19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/choromanska19a.html},\n abstract = \t {Despite significant recent advances in deep neural networks, training them remains a challenge due to the highly non-convex nature of the objective function. State-of-the-art methods rely on error backpropagation, which suffers from several well-known issues, such as vanishing and exploding gradients, inability to handle non-differentiable nonlinearities and to parallelize weight-updates across layers, and biological implausibility. These limitations continue to motivate exploration of alternative training algorithms, including several recently proposed auxiliary-variable methods which break the complex nested objective function into local subproblems. However, those techniques are mainly offline (batch), which limits their applicability to extremely large datasets, as well as to online, continual or reinforcement learning. The main contribution of our work is a novel online (stochastic/mini-batch) alternating minimization (AM) approach for training deep neural networks, together with the first theoretical convergence guarantees for AM in stochastic settings and promising empirical results on a variety of architectures and datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/choromanska19a/choromanska19a.pdf", "supp": "", "pdf_size": 2014485, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13143560607415133217&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "ECE NYU Tandon; ECE NYU Tandon; IBM T.J. Watson Research Center; IBM T.J. Watson Research Center; IBM T.J. Watson Research Center; IBM T.J. Watson Research Center; IBM T.J. Watson Research Center; IBM T.J. Watson Research Center; IBM T.J. Watson Research Center; MIT; IBM T.J. 
Watson Research Center", "aff_domain": "ibm.com; ; ; ; ; ; ; ; ; ; ", "email": "ibm.com; ; ; ; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 11, "oa": "https://proceedings.mlr.press/v97/choromanska19a.html", "aff_unique_index": "0;0;1;1;1;1;1;1;1;2;1", "aff_unique_norm": "New York University Tandon School of Engineering;IBM;Massachusetts Institute of Technology", "aff_unique_dep": "Department of Electrical and Computer Engineering;Research Center;", "aff_unique_url": "https://engineering.nyu.edu;https://www.ibm.com/research/watson;https://web.mit.edu", "aff_unique_abbr": "NYU Tandon;IBM;MIT", "aff_campus_unique_index": "0;0;1;1;1;1;1;1;1;1", "aff_campus_unique": "Brooklyn;T.J. Watson;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Beyond the Chinese Restaurant and Pitman-Yor processes: Statistical Models with double power-law behavior", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4022", "id": "4022", "author_site": "Fadhel Ayed, Juho Lee, Francois Caron", "author": "Fadhel Ayed; Juho Lee; Francois Caron", "abstract": "Bayesian nonparametric approaches, in particular the Pitman-Yor process and the associated two-parameter Chinese Restaurant process, have been successfully used in applications where the data exhibit a power-law behavior. Examples include natural language processing, natural images or networks. There is also growing empirical evidence suggesting that some datasets exhibit a two-regime power-law behavior: one regime for small frequencies, and a second regime, with a different exponent, for high frequencies. In this paper, we introduce a class of completely random measures which are doubly regularly-varying. Contrary to the Pitman-Yor process, we show that when completely random measures in this class are normalized to obtain random probability measures and associated random partitions, such partitions exhibit a double power-law behavior. We present two general constructions and discuss in particular two models within this class: the beta prime process (Broderick et al. (2015, 2018) and a novel process called generalized BFRY process. We derive efficient Markov chain Monte Carlo algorithms to estimate the parameters of these models. Finally, we show that the proposed models provide a better fit than the Pitman-Yor process on various datasets.", "bibtex": "@InProceedings{pmlr-v97-ayed19a,\n title = \t {Beyond the Chinese Restaurant and Pitman-Yor processes: Statistical Models with double power-law behavior},\n author = {Ayed, Fadhel and Lee, Juho and Caron, Francois},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {395--404},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ayed19a/ayed19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ayed19a.html},\n abstract = \t {Bayesian nonparametric approaches, in particular the Pitman-Yor process and the associated two-parameter Chinese Restaurant process, have been successfully used in applications where the data exhibit a power-law behavior. Examples include natural language processing, natural images or networks. 
There is also growing empirical evidence suggesting that some datasets exhibit a two-regime power-law behavior: one regime for small frequencies, and a second regime, with a different exponent, for high frequencies. In this paper, we introduce a class of completely random measures which are doubly regularly-varying. Contrary to the Pitman-Yor process, we show that when completely random measures in this class are normalized to obtain random probability measures and associated random partitions, such partitions exhibit a double power-law behavior. We present two general constructions and discuss in particular two models within this class: the beta prime process (Broderick et al. (2015, 2018)) and a novel process called generalized BFRY process. We derive efficient Markov chain Monte Carlo algorithms to estimate the parameters of these models. Finally, we show that the proposed models provide a better fit than the Pitman-Yor process on various datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/ayed19a/ayed19a.pdf", "supp": "", "pdf_size": 844143, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7805425707346893329&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Statistics, University of Oxford; Department of Statistics, University of Oxford + AITRICS, Seoul, Republic of Korea; Department of Statistics, University of Oxford", "aff_domain": "stats.ox.ac.uk; ; ", "email": "stats.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ayed19a.html", "aff_unique_index": "0;0+1;0", "aff_unique_norm": "University of Oxford;AITRICS", "aff_unique_dep": "Department of Statistics;", "aff_unique_url": "https://www.ox.ac.uk;", "aff_unique_abbr": "Oxford;", "aff_campus_unique_index": "0;0+1;0", "aff_campus_unique": "Oxford;Seoul", "aff_country_unique_index": "0;0+1;0", "aff_country_unique": "United Kingdom;South Korea" }, { "title": "Bias Also Matters: Bias Attribution for Deep Neural Network Explanation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3905", "id": "3905", "author_site": "Shengjie Wang, Tianyi Zhou, Jeff Bilmes", "author": "Shengjie Wang; Tianyi Zhou; Jeff Bilmes", "abstract": "The gradient of a deep neural network (DNN) w.r.t. the input provides information that can be used to explain the output prediction in terms of the input features and has been widely studied to assist in interpreting DNNs. In a linear model (i.e., g(x) = wx + b), the gradient corresponds to the weights w. Such a model can reasonably locally-linearly approximate a smooth nonlinear DNN, and hence the weights of this local model are the gradient. The bias b, however, is usually overlooked in attribution methods. In this paper, we observe that since the bias in a DNN also has a non-negligible contribution to the correctness of predictions, it can also play a significant role in understanding DNN behavior. We propose a backpropagation-type algorithm \u201cbias back-propagation (BBp)\u201d that starts at the output layer and iteratively attributes the bias of each layer to its input nodes as well as combining the resulting bias term of the previous layer. Together with the backpropagation of the gradient generating w, we can fully recover the locally linear model g(x) = wx + b. 
In experiments, we show that BBp can generate complementary and highly interpretable explanations.", "bibtex": "@InProceedings{pmlr-v97-wang19p,\n title = \t {Bias Also Matters: Bias Attribution for Deep Neural Network Explanation},\n author = {Wang, Shengjie and Zhou, Tianyi and Bilmes, Jeff},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6659--6667},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19p/wang19p.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19p.html},\n abstract = \t {The gradient of a deep neural network (DNN) w.r.t. the input provides information that can be used to explain the output prediction in terms of the input features and has been widely studied to assist in interpreting DNNs. In a linear model (i.e., g(x) = wx + b), the gradient corresponds to the weights w. Such a model can reasonably locally-linearly approximate a smooth nonlinear DNN, and hence the weights of this local model are the gradient. The bias b, however, is usually overlooked in attribution methods. In this paper, we observe that since the bias in a DNN also has a non-negligible contribution to the correctness of predictions, it can also play a significant role in understanding DNN behavior. We propose a backpropagation-type algorithm \u201cbias back-propagation (BBp)\u201d that starts at the output layer and iteratively attributes the bias of each layer to its input nodes as well as combining the resulting bias term of the previous layer. Together with the backpropagation of the gradient generating w, we can fully recover the locally linear model g(x) = wx + b. In experiments, we show that BBp can generate complementary and highly interpretable explanations.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19p/wang19p.pdf", "supp": "", "pdf_size": 3852817, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8475068318232695769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Paul G. Allen School of Computer Science & Engineering; Paul G. Allen School of Computer Science & Engineering; Department of Electrical & Computer Engineering, University of Washington, Seattle, USA", "aff_domain": "uw.edu;uw.edu;uw.edu", "email": "uw.edu;uw.edu;uw.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/wang19p.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "Paul G. Allen School of Computer Science & Engineering", "aff_unique_url": "https://www.cs.washington.edu", "aff_unique_abbr": "UW CSE", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seattle", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Bilinear Bandits with Low-rank Structure", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3780", "id": "3780", "author_site": "Kwang-Sung Jun, Rebecca Willett, Stephen Wright, Robert Nowak", "author": "Kwang-Sung Jun; Rebecca Willett; Stephen Wright; Robert Nowak", "abstract": "We introduce the bilinear bandit problem with low-rank structure in which an action takes the form of a pair of arms from two different entity types, and the reward is a bilinear function of the known feature vectors of the arms. 
The unknown in the problem is a $d_1$ by $d_2$ matrix $\\mathbf{\\Theta}^*$ that defines the reward, and has low rank $r \\ll \\min\\{d_1,d_2\\}$. Determination of $\\mathbf{\\Theta}^*$ with this low-rank structure poses a significant challenge in finding the right exploration-exploitation tradeoff. In this work, we propose a new two-stage algorithm called \u201cExplore-Subspace-Then-Refine\u201d (ESTR). The first stage is an explicit subspace exploration, while the second stage is a linear bandit algorithm called \u201calmost-low-dimensional OFUL\u201d (LowOFUL) that exploits and further refines the estimated subspace via a regularization technique. We show that the regret of ESTR is $\\widetilde{\\mathcal{O}}((d_1+d_2)^{3/2} \\sqrt{r T})$ where $\\widetilde{\\mathcal{O}}$ hides logarithmic factors and $T$ is the time horizon, which improves upon the regret of $\\widetilde{\\mathcal{O}}(d_1d_2\\sqrt{T})$ attained for a na\u00efve linear bandit reduction. We conjecture that the regret bound of ESTR is unimprovable up to polylogarithmic factors, and our preliminary experiment shows that ESTR outperforms a na\u00efve linear bandit reduction.", "bibtex": "@InProceedings{pmlr-v97-jun19a,\n title = \t {Bilinear Bandits with Low-rank Structure},\n author = {Jun, Kwang-Sung and Willett, Rebecca and Wright, Stephen and Nowak, Robert},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3163--3172},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jun19a/jun19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jun19a.html},\n abstract = \t {We introduce the bilinear bandit problem with low-rank structure in which an action takes the form of a pair of arms from two different entity types, and the reward is a bilinear function of the known feature vectors of the arms. The unknown in the problem is a $d_1$ by $d_2$ matrix $\\mathbf{\\Theta}^*$ that defines the reward, and has low rank $r \\ll \\min\\{d_1,d_2\\}$. Determination of $\\mathbf{\\Theta}^*$ with this low-rank structure poses a significant challenge in finding the right exploration-exploitation tradeoff. In this work, we propose a new two-stage algorithm called \u201cExplore-Subspace-Then-Refine\u201d (ESTR). The first stage is an explicit subspace exploration, while the second stage is a linear bandit algorithm called \u201calmost-low-dimensional OFUL\u201d (LowOFUL) that exploits and further refines the estimated subspace via a regularization technique. We show that the regret of ESTR is $\\widetilde{\\mathcal{O}}((d_1+d_2)^{3/2} \\sqrt{r T})$ where $\\widetilde{\\mathcal{O}}$ hides logarithmic factors and $T$ is the time horizon, which improves upon the regret of $\\widetilde{\\mathcal{O}}(d_1d_2\\sqrt{T})$ attained for a na\u00efve linear bandit reduction. 
We conjecture that the regret bound of ESTR is unimprovable up to polylogarithmic factors, and our preliminary experiment shows that ESTR outperforms a na\u00efve linear bandit reduction.}\n}", "pdf": "http://proceedings.mlr.press/v97/jun19a/jun19a.pdf", "supp": "", "pdf_size": 1225406, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9045050767842834407&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Boston University; University of Chicago; University of Wisconsin-Madison; University of Wisconsin-Madison", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/jun19a.html", "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Boston University;University of Chicago;University of Wisconsin-Madison", "aff_unique_dep": ";;", "aff_unique_url": "https://www.bu.edu;https://www.uchicago.edu;https://www.wisc.edu", "aff_unique_abbr": "BU;UChicago;UW-Madison", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Madison", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bit-Swap: Recursive Bits-Back Coding for Lossless Compression with Hierarchical Latent Variables", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4305", "id": "4305", "author_site": "Friso Kingma, Pieter Abbeel, Jonathan Ho", "author": "Friso Kingma; Pieter Abbeel; Jonathan Ho", "abstract": "The bits-back argument suggests that latent variable models can be turned into lossless compression schemes. Translating the bits-back argument into efficient and practical lossless compression schemes for general latent variable models, however, is still an open problem. Bits-Back with Asymmetric Numeral Systems (BB-ANS), recently proposed by Townsend et al., 2019, makes bits-back coding practically feasible for latent variable models with one latent layer, but it is inefficient for hierarchical latent variable models. In this paper we propose Bit-Swap, a new compression scheme that generalizes BB-ANS and achieves strictly better compression rates for hierarchical latent variable models with Markov chain structure. Through experiments we verify that Bit-Swap results in lossless compression rates that are empirically superior to existing techniques.", "bibtex": "@InProceedings{pmlr-v97-kingma19a,\n title = \t {Bit-Swap: Recursive Bits-Back Coding for Lossless Compression with Hierarchical Latent Variables},\n author = {Kingma, Friso and Abbeel, Pieter and Ho, Jonathan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3408--3417},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kingma19a/kingma19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kingma19a.html},\n abstract = \t {The bits-back argument suggests that latent variable models can be turned into lossless compression schemes. Translating the bits-back argument into efficient and practical lossless compression schemes for general latent variable models, however, is still an open problem. Bits-Back with Asymmetric Numeral Systems (BB-ANS), recently proposed by Townsend et al., 
2019, makes bits-back coding practically feasible for latent variable models with one latent layer, but it is inefficient for hierarchical latent variable models. In this paper we propose Bit-Swap, a new compression scheme that generalizes BB-ANS and achieves strictly better compression rates for hierarchical latent variable models with Markov chain structure. Through experiments we verify that Bit-Swap results in lossless compression rates that are empirically superior to existing techniques.}\n}", "pdf": "http://proceedings.mlr.press/v97/kingma19a/kingma19a.pdf", "supp": "", "pdf_size": 1768733, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12443881008782599419&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "University of California, Berkeley, California, USA; University of California, Berkeley, California, USA; University of California, Berkeley, California, USA", "aff_domain": "gmail.com; ;berkeley.edu", "email": "gmail.com; ;berkeley.edu", "github": "https://github.com/fhkingma/bitswap", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/kingma19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Blended Conditonal Gradients", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4183", "id": "4183", "author_site": "G\u00e1bor Braun, Sebastian Pokutta, Dan Tu, Stephen Wright", "author": "G\u00e1bor Braun; Sebastian Pokutta; Dan Tu; Stephen Wright", "abstract": "We present a blended conditional gradient approach for minimizing a smooth convex function over a polytope P, combining the Frank{\u2013}Wolfe algorithm (also called conditional gradient) with gradient-based steps, different from away steps and pairwise steps, but still achieving linear convergence for strongly convex functions, along with good practical performance. Our approach retains all favorable properties of conditional gradient algorithms, notably avoidance of projections onto P and maintenance of iterates as sparse convex combinations of a limited number of extreme points of P. The algorithm is lazy, making use of inexpensive inexact solutions of the linear programming subproblem that characterizes the conditional gradient approach. It decreases measures of optimality (primal and dual gaps) rapidly, both in the number of iterations and in wall-clock time, outperforming even the lazy conditional gradient algorithms of Braun et al. 2017. 
We also present a streamlined version of the algorithm that applies when P is the probability simplex.", "bibtex": "@InProceedings{pmlr-v97-braun19a,\n title = \t {Blended Conditonal Gradients},\n author = {Braun, G{\\'a}bor and Pokutta, Sebastian and Tu, Dan and Wright, Stephen},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {735--743},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/braun19a/braun19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/braun19a.html},\n abstract = \t {We present a blended conditional gradient approach for minimizing a smooth convex function over a polytope P, combining the Frank{\u2013}Wolfe algorithm (also called conditional gradient) with gradient-based steps, different from away steps and pairwise steps, but still achieving linear convergence for strongly convex functions, along with good practical performance. Our approach retains all favorable properties of conditional gradient algorithms, notably avoidance of projections onto P and maintenance of iterates as sparse convex combinations of a limited number of extreme points of P. The algorithm is lazy, making use of inexpensive inexact solutions of the linear programming subproblem that characterizes the conditional gradient approach. It decreases measures of optimality (primal and dual gaps) rapidly, both in the number of iterations and in wall-clock time, outperforming even the lazy conditional gradient algorithms of Braun et al. 2017. We also present a streamlined version of the algorithm that applies when P is the probability simplex.}\n}", "pdf": "http://proceedings.mlr.press/v97/braun19a/braun19a.pdf", "supp": "", "pdf_size": 2228171, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6610472350258988994&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "ISyE, Georgia Institute of Technology; ISyE, Georgia Institute of Technology; ISyE, Georgia Institute of Technology; Computer Sciences Department, University of Wisconsin", "aff_domain": "isye.gatech.edu;isye.gatech.edu;gatech.edu;cs.wisc.edu", "email": "isye.gatech.edu;isye.gatech.edu;gatech.edu;cs.wisc.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/braun19a.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Georgia Institute of Technology;University of Wisconsin", "aff_unique_dep": "Industrial and Systems Engineering;Computer Sciences Department", "aff_unique_url": "https://www.gatech.edu;https://www.wisc.edu", "aff_unique_abbr": "Georgia Tech;UW", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Atlanta;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Boosted Density Estimation Remastered", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3788", "id": "3788", "author_site": "Zac Cranko, Richard Nock", "author": "Zac Cranko; Richard Nock", "abstract": "There has recently been a steady increase in the number iterative approaches to density estimation. However, an accompanying burst of formal convergence guarantees has not followed; all results pay the price of heavy assumptions which are often unrealistic or hard to check. 
The", "bibtex": "@InProceedings{pmlr-v97-cranko19b,\n title = \t {Boosted Density Estimation Remastered},\n author = {Cranko, Zac and Nock, Richard},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1416--1425},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cranko19b/cranko19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/cranko19b.html},\n abstract = \t {There has recently been a steady increase in the number iterative approaches to density estimation. However, an accompanying burst of formal convergence guarantees has not followed; all results pay the price of heavy assumptions which are often unrealistic or hard to check. The", "pdf": "http://proceedings.mlr.press/v97/cranko19b/cranko19b.pdf", "supp": "", "pdf_size": 965831, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17228246928161756997&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "The Australian National University; The University of Sydney", "aff_domain": "anu.edu.au; ", "email": "anu.edu.au; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/cranko19b.html", "aff_unique_index": "0;1", "aff_unique_norm": "Australian National University;University of Sydney", "aff_unique_dep": ";", "aff_unique_url": "https://www.anu.edu.au;https://www.sydney.edu.au", "aff_unique_abbr": "ANU;USYD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Australia" }, { "title": "Bounding User Contributions: A Bias-Variance Trade-off in Differential Privacy", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4220", "id": "4220", "author_site": "Kareem Amin, Alex Kulesza, andres munoz, Sergei Vassilvitskii", "author": "Kareem Amin; Alex Kulesza; Andres Munoz; Sergei Vassilvtiskii", "abstract": "Differentially private learning algorithms protect individual participants in the training dataset by guaranteeing that their presence does not significantly change the resulting model. In order to make this promise, such algorithms need to know the maximum contribution that can be made by a single user: the more data an individual can contribute, the more noise will need to be added to protect them. While most existing analyses assume that the maximum contribution is known and fixed in advance{\u2014}indeed, it is often assumed that each user contributes only a single example{\u2014}we argue that in practice there is a meaningful choice to be made. On the one hand, if we allow users to contribute large amounts of data, we may end up adding excessive noise to protect a few outliers, even when the majority contribute only modestly. On the other hand, limiting users to small contributions keeps noise levels low at the cost of potentially discarding significant amounts of excess data, thus introducing bias. 
Here, we characterize this trade-off for an empirical risk minimization setting, showing that in general there is a \u201csweet spot\u201d that depends on measurable properties of the dataset, but that there is also a concrete cost to privacy that cannot be avoided simply by collecting more data.", "bibtex": "@InProceedings{pmlr-v97-amin19a,\n title = \t {Bounding User Contributions: A Bias-Variance Trade-off in Differential Privacy},\n author = {Amin, Kareem and Kulesza, Alex and Munoz, Andres and Vassilvtiskii, Sergei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {263--271},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/amin19a/amin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/amin19a.html},\n abstract = \t {Differentially private learning algorithms protect individual participants in the training dataset by guaranteeing that their presence does not significantly change the resulting model. In order to make this promise, such algorithms need to know the maximum contribution that can be made by a single user: the more data an individual can contribute, the more noise will need to be added to protect them. While most existing analyses assume that the maximum contribution is known and fixed in advance{\u2014}indeed, it is often assumed that each user contributes only a single example{\u2014}we argue that in practice there is a meaningful choice to be made. On the one hand, if we allow users to contribute large amounts of data, we may end up adding excessive noise to protect a few outliers, even when the majority contribute only modestly. On the other hand, limiting users to small contributions keeps noise levels low at the cost of potentially discarding significant amounts of excess data, thus introducing bias. 
Here, we characterize this trade-off for an empirical risk minimization setting, showing that in general there is a \u201csweet spot\u201d that depends on measurable properties of the dataset, but that there is also a concrete cost to privacy that cannot be avoided simply by collecting more data.}\n}", "pdf": "http://proceedings.mlr.press/v97/amin19a/amin19a.pdf", "supp": "", "pdf_size": 901733, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5449859677081737678&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Google Research New York, NY, USA; Google Research New York, NY, USA; Google Research New York, NY, USA; Google Research New York, NY, USA", "aff_domain": "google.com;google.com;google.com;google.com", "email": "google.com;google.com;google.com;google.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/amin19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Breaking Inter-Layer Co-Adaptation by Classifier Anonymization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3713", "id": "3713", "author_site": "Ikuro Sato, Kohta Ishikawa, Guoqing Liu, Masayuki Tanaka", "author": "Ikuro Sato; Kohta Ishikawa; Guoqing Liu; Masayuki Tanaka", "abstract": "This study addresses an issue of co-adaptation between a feature extractor and a classifier in a neural network. A naive joint optimization of a feature extractor and a classifier often brings situations in which an excessively complex feature distribution adapted to a very specific classifier degrades the test performance. We introduce a method called Feature-extractor Optimization through Classifier Anonymization (FOCA), which is designed to avoid an explicit co-adaptation between a feature extractor and a particular classifier by using many randomly-generated, weak classifiers during optimization. We put forth a mathematical proposition that states the FOCA features form a point-like distribution within the same class in a class-separable fashion under special conditions. Real-data experiments under more general conditions provide supportive evidences.", "bibtex": "@InProceedings{pmlr-v97-sato19a,\n title = \t {Breaking Inter-Layer Co-Adaptation by Classifier Anonymization},\n author = {Sato, Ikuro and Ishikawa, Kohta and Liu, Guoqing and Tanaka, Masayuki},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5619--5627},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/sato19a/sato19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/sato19a.html},\n abstract = \t {This study addresses an issue of co-adaptation between a feature extractor and a classifier in a neural network. A naive joint optimization of a feature extractor and a classifier often brings situations in which an excessively complex feature distribution adapted to a very specific classifier degrades the test performance. 
We introduce a method called Feature-extractor Optimization through Classifier Anonymization (FOCA), which is designed to avoid an explicit co-adaptation between a feature extractor and a particular classifier by using many randomly-generated, weak classifiers during optimization. We put forth a mathematical proposition that states the FOCA features form a point-like distribution within the same class in a class-separable fashion under special conditions. Real-data experiments under more general conditions provide supportive evidences.}\n}", "pdf": "http://proceedings.mlr.press/v97/sato19a/sato19a.pdf", "supp": "", "pdf_size": 10036592, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13217006044020508935&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "aff": "Denso IT Laboratory, Inc., Japan; Denso IT Laboratory, Inc., Japan; Denso IT Laboratory, Inc., Japan; National Institute of Advanced Industrial Science and Technology, Japan", "aff_domain": "d-itlab.co.jp; ; ; ", "email": "d-itlab.co.jp; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/sato19a.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Denso IT Laboratory, Inc.;National Institute of Advanced Industrial Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.denso.com;https://www.aist.go.jp", "aff_unique_abbr": "Denso IT;AIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Japan" }, { "title": "Breaking the Softmax Bottleneck via Learnable Monotonic Pointwise Non-linearities", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4329", "id": "4329", "author_site": "Octavian-Eugen Ganea, Sylvain Gelly, Gary Becigneul, Aliaksei Severyn", "author": "Octavian Ganea; Sylvain Gelly; Gary Becigneul; Aliaksei Severyn", "abstract": "The Softmax function on top of a final linear layer is the de facto method to output probability distributions in neural networks. In many applications such as language models or text generation, this model has to produce distributions over large output vocabularies. Recently, this has been shown to have limited representational capacity due to its connection with the rank bottleneck in matrix factorization. However, little is known about the limitations of Linear-Softmax for quantities of practical interest such as cross entropy or mode estimation, a direction that we explore here. As an efficient and effective solution to alleviate this issue, we propose to learn parametric monotonic functions on top of the logits. We theoretically investigate the rank increasing capabilities of such monotonic functions. 
Empirically, our method improves in two different quality metrics over the traditional Linear-Softmax layer in synthetic and real language model experiments, adding little time or memory overhead, while being comparable to the more computationally expensive mixture of Softmaxes.", "bibtex": "@InProceedings{pmlr-v97-ganea19a,\n title = \t {Breaking the Softmax Bottleneck via Learnable Monotonic Pointwise Non-linearities},\n author = {Ganea, Octavian and Gelly, Sylvain and Becigneul, Gary and Severyn, Aliaksei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2073--2082},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ganea19a/ganea19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ganea19a.html},\n abstract = \t {The Softmax function on top of a final linear layer is the de facto method to output probability distributions in neural networks. In many applications such as language models or text generation, this model has to produce distributions over large output vocabularies. Recently, this has been shown to have limited representational capacity due to its connection with the rank bottleneck in matrix factorization. However, little is known about the limitations of Linear-Softmax for quantities of practical interest such as cross entropy or mode estimation, a direction that we explore here. As an efficient and effective solution to alleviate this issue, we propose to learn parametric monotonic functions on top of the logits. We theoretically investigate the rank increasing capabilities of such monotonic functions. 
Empirically, our method improves in two different quality metrics over the traditional Linear-Softmax layer in synthetic and real language model experiments, adding little time or memory overhead, while being comparable to the more computationally expensive mixture of Softmaxes.}\n}", "pdf": "http://proceedings.mlr.press/v97/ganea19a/ganea19a.pdf", "supp": "", "pdf_size": 3192099, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16879940404602302277&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, ETH Z\u00fcrich, Switzerland+Google Brain; Google Brain; Google Research; Google Research", "aff_domain": "inf.ethz.ch;google.com; ; ", "email": "inf.ethz.ch;google.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/ganea19a.html", "aff_unique_index": "0+1;1;1;1", "aff_unique_norm": "ETH Zurich;Google", "aff_unique_dep": "Department of Computer Science;Google Brain", "aff_unique_url": "https://www.ethz.ch;https://brain.google.com", "aff_unique_abbr": "ETHZ;Google Brain", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+1;1;1;1", "aff_country_unique": "Switzerland;United States" }, { "title": "Breaking the gridlock in Mixture-of-Experts: Consistent and Efficient Algorithms", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3718", "id": "3718", "author_site": "Ashok Vardhan Makkuva, Pramod Viswanath, Sreeram Kannan, Sewoong Oh", "author": "Ashok Makkuva; Pramod Viswanath; Sreeram Kannan; Sewoong Oh", "abstract": "Mixture-of-Experts (MoE) is a widely popular model for ensemble learning and is a basic building block of highly successful modern neural networks as well as a component in Gated Recurrent Units (GRU) and Attention networks. However, present algorithms for learning MoE, including the EM algorithm and gradient descent, are known to get stuck in local optima. From a theoretical viewpoint, finding an efficient and provably consistent algorithm to learn the parameters remains a long standing open problem for more than two decades. In this paper, we introduce the first algorithm that learns the true parameters of a MoE model for a wide class of non-linearities with global consistency guarantees. While existing algorithms jointly or iteratively estimate the expert parameters and the gating parameters in the MoE, we propose a novel algorithm that breaks the deadlock and can directly estimate the expert parameters by sensing its echo in a carefully designed cross-moment tensor between the inputs and the output. Once the experts are known, the recovery of gating parameters still requires an EM algorithm; however, we show that the EM algorithm for this simplified problem, unlike the joint EM algorithm, converges to the true parameters. 
We empirically validate our algorithm on both the synthetic and real data sets in a variety of settings, and show superior performance to standard baselines.", "bibtex": "@InProceedings{pmlr-v97-makkuva19a,\n title = \t {Breaking the gridlock in Mixture-of-Experts: Consistent and Efficient Algorithms},\n author = {Makkuva, Ashok and Viswanath, Pramod and Kannan, Sreeram and Oh, Sewoong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4304--4313},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/makkuva19a/makkuva19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/makkuva19a.html},\n abstract = \t {Mixture-of-Experts (MoE) is a widely popular model for ensemble learning and is a basic building block of highly successful modern neural networks as well as a component in Gated Recurrent Units (GRU) and Attention networks. However, present algorithms for learning MoE, including the EM algorithm and gradient descent, are known to get stuck in local optima. From a theoretical viewpoint, finding an efficient and provably consistent algorithm to learn the parameters remains a long standing open problem for more than two decades. In this paper, we introduce the first algorithm that learns the true parameters of a MoE model for a wide class of non-linearities with global consistency guarantees. While existing algorithms jointly or iteratively estimate the expert parameters and the gating parameters in the MoE, we propose a novel algorithm that breaks the deadlock and can directly estimate the expert parameters by sensing its echo in a carefully designed cross-moment tensor between the inputs and the output. Once the experts are known, the recovery of gating parameters still requires an EM algorithm; however, we show that the EM algorithm for this simplified problem, unlike the joint EM algorithm, converges to the true parameters. 
We empirically validate our algorithm on both the synthetic and real data sets in a variety of settings, and show superior performance to standard baselines.}\n}", "pdf": "http://proceedings.mlr.press/v97/makkuva19a/makkuva19a.pdf", "supp": "", "pdf_size": 423573, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13449993823356572630&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Electrical and Computer Engineering, Coordinated Science Laboratory, University of Illinois at Urbana-Champaign, IL, USA; Allen School of Computer Science & Engineering, University of Washington, Seattle, USA; Department of Electrical Engineering, University of Washington, Seattle, USA; Department of Electrical and Computer Engineering, Coordinated Science Laboratory, University of Illinois at Urbana-Champaign, IL, USA", "aff_domain": "illinois.edu; ; ; ", "email": "illinois.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/makkuva19a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Washington", "aff_unique_dep": "Department of Electrical and Computer Engineering;Allen School of Computer Science & Engineering", "aff_unique_url": "https://illinois.edu;https://www.cs.washington.edu", "aff_unique_abbr": "UIUC;UW", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Urbana-Champaign;Seattle", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Bridging Theory and Algorithm for Domain Adaptation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3630", "id": "3630", "author_site": "Yuchen Zhang, Tianle Liu, Mingsheng Long, Michael Jordan", "author": "Yuchen Zhang; Tianle Liu; Mingsheng Long; Michael Jordan", "abstract": "This paper addresses the problem of unsupervised domain adaption from theoretical and algorithmic perspectives. Existing domain adaptation theories naturally imply minimax optimization algorithms, which connect well with the domain adaptation methods based on adversarial learning. However, several disconnections still exist and form the gap between theory and algorithm. We extend previous theories (Mansour et al., 2009c; Ben-David et al., 2010) to multiclass classification in domain adaptation, where classifiers based on the scoring functions and margin loss are standard choices in algorithm design. We introduce Margin Disparity Discrepancy, a novel measurement with rigorous generalization bounds, tailored to the distribution comparison with the asymmetric margin loss, and to the minimax optimization for easier training. Our theory can be seamlessly transformed into an adversarial learning algorithm for domain adaptation, successfully bridging the gap between theory and algorithm. 
A series of empirical studies show that our algorithm achieves the state of the art accuracies on challenging domain adaptation tasks.", "bibtex": "@InProceedings{pmlr-v97-zhang19i,\n title = \t {Bridging Theory and Algorithm for Domain Adaptation},\n author = {Zhang, Yuchen and Liu, Tianle and Long, Mingsheng and Jordan, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7404--7413},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19i/zhang19i.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19i.html},\n abstract = \t {This paper addresses the problem of unsupervised domain adaption from theoretical and algorithmic perspectives. Existing domain adaptation theories naturally imply minimax optimization algorithms, which connect well with the domain adaptation methods based on adversarial learning. However, several disconnections still exist and form the gap between theory and algorithm. We extend previous theories (Mansour et al., 2009c; Ben-David et al., 2010) to multiclass classification in domain adaptation, where classifiers based on the scoring functions and margin loss are standard choices in algorithm design. We introduce Margin Disparity Discrepancy, a novel measurement with rigorous generalization bounds, tailored to the distribution comparison with the asymmetric margin loss, and to the minimax optimization for easier training. Our theory can be seamlessly transformed into an adversarial learning algorithm for domain adaptation, successfully bridging the gap between theory and algorithm. 
A series of empirical studies show that our algorithm achieves the state of the art accuracies on challenging domain adaptation tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19i/zhang19i.pdf", "supp": "", "pdf_size": 3058037, "gs_citation": 945, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12036658661059863941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Software+Research Center for Big Data, BNRist; School of Software+Department of Mathematical Science, Tsinghua University, China; School of Software+Research Center for Big Data, BNRist; University of California, Berkeley, USA", "aff_domain": "mails.tsinghua.edu.cn; ;tsinghua.edu.cn; ", "email": "mails.tsinghua.edu.cn; ;tsinghua.edu.cn; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/zhang19i.html", "aff_unique_index": "0+1;0+2;0+1;3", "aff_unique_norm": "School of Software;BNRist;Tsinghua University;University of California, Berkeley", "aff_unique_dep": "Software;Research Center for Big Data;Department of Mathematical Science;", "aff_unique_url": ";;https://www.tsinghua.edu.cn;https://www.berkeley.edu", "aff_unique_abbr": ";;Tsinghua;UC Berkeley", "aff_campus_unique_index": ";;;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": ";1;;2", "aff_country_unique": ";China;United States" }, { "title": "CAB: Continuous Adaptive Blending for Policy Evaluation and Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4208", "id": "4208", "author_site": "Yi Su, Luke Lequn Wang, Michele Santacatterina, Thorsten Joachims", "author": "Yi Su; Lequn Wang; Michele Santacatterina; Thorsten Joachims", "abstract": "The ability to perform offline A/B-testing and off-policy learning using logged contextual bandit feedback is highly desirable in a broad range of applications, including recommender systems, search engines, ad placement, and personalized health care. Both offline A/B-testing and off-policy learning require a counterfactual estimator that evaluates how some new policy would have performed, if it had been used instead of the logging policy. In this paper, we identify a family of counterfactual estimators which subsumes most such estimators proposed to date. Our analysis of this family identifies a new estimator - called Continuous Adaptive Blending (CAB) - which enjoys many advantageous theoretical and practical properties. In particular, it can be substantially less biased than clipped Inverse Propensity Score (IPS) weighting and the Direct Method, and it can have less variance than Doubly Robust and IPS estimators. In addition, it is sub-differentiable such that it can be used for learning, unlike the SWITCH estimator. 
Experimental results show that CAB provides excellent evaluation accuracy and outperforms other counterfactual estimators in terms of learning performance.", "bibtex": "@InProceedings{pmlr-v97-su19a,\n title = \t {{CAB}: Continuous Adaptive Blending for Policy Evaluation and Learning},\n author = {Su, Yi and Wang, Lequn and Santacatterina, Michele and Joachims, Thorsten},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6005--6014},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/su19a/su19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/su19a.html},\n abstract = \t {The ability to perform offline A/B-testing and off-policy learning using logged contextual bandit feedback is highly desirable in a broad range of applications, including recommender systems, search engines, ad placement, and personalized health care. Both offline A/B-testing and off-policy learning require a counterfactual estimator that evaluates how some new policy would have performed, if it had been used instead of the logging policy. In this paper, we identify a family of counterfactual estimators which subsumes most such estimators proposed to date. Our analysis of this family identifies a new estimator - called Continuous Adaptive Blending (CAB) - which enjoys many advantageous theoretical and practical properties. In particular, it can be substantially less biased than clipped Inverse Propensity Score (IPS) weighting and the Direct Method, and it can have less variance than Doubly Robust and IPS estimators. In addition, it is sub-differentiable such that it can be used for learning, unlike the SWITCH estimator. 
Experimental results show that CAB provides excellent evaluation accuracy and outperforms other counterfactual estimators in terms of learning performance.}\n}", "pdf": "http://proceedings.mlr.press/v97/su19a/su19a.pdf", "supp": "", "pdf_size": 1175405, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10358496187319137735&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": "Cornell University, Ithaca, USA+Cornell TRIPODS Center for Data Science, Ithaca, USA; Cornell University, Ithaca, USA+Cornell TRIPODS Center for Data Science, Ithaca, USA; Cornell University, Ithaca, USA; Cornell University, Ithaca, USA", "aff_domain": "cornell.edu;cornell.edu;cornell.edu;cs.cornell.edu", "email": "cornell.edu;cornell.edu;cornell.edu;cs.cornell.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/su19a.html", "aff_unique_index": "0+0;0+0;0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "0+0;0+0;0;0", "aff_campus_unique": "Ithaca", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "CHiVE: Varying Prosody in Speech Synthesis with a Linguistically Driven Dynamic Hierarchical Conditional Variational Network", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3864", "id": "3864", "author_site": "Tom Kenter, Vincent Wan, Chun-an Chan, Robert Clark, Jakub Vit", "author": "Tom Kenter; Vincent Wan; Chun-An Chan; Rob Clark; Jakub Vit", "abstract": "The prosodic aspects of speech signals produced by current text-to-speech systems are typically averaged over training material, and as such lack the variety and liveliness found in natural speech. To avoid monotony and averaged prosody contours, it is desirable to have a way of modeling the variation in the prosodic aspects of speech, so audio signals can be synthesized in multiple ways for a given text. We present a new, hierarchically structured conditional variational auto-encoder to generate prosodic features (fundamental frequency, energy and duration) suitable for use with a vocoder or a generative model like WaveNet. At inference time, an embedding representing the prosody of a sentence may be sampled from the variational layer to allow for prosodic variation. To efficiently capture the hierarchical nature of the linguistic input (words, syllables and phones), both the encoder and decoder parts of the auto-encoder are hierarchical, in line with the linguistic structure, with layers being clocked dynamically at the respective rates. 
We show in our experiments that our dynamic hierarchical network outperforms a non-hierarchical state-of-the-art baseline, and, additionally, that prosody transfer across sentences is possible by employing the prosody embedding of one sentence to generate the speech signal of another.", "bibtex": "@InProceedings{pmlr-v97-kenter19a,\n title = \t {{CH}i{VE}: Varying Prosody in Speech Synthesis with a Linguistically Driven Dynamic Hierarchical Conditional Variational Network},\n author = {Kenter, Tom and Wan, Vincent and Chan, Chun-An and Clark, Rob and Vit, Jakub},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3331--3340},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kenter19a/kenter19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kenter19a.html},\n abstract = \t {The prosodic aspects of speech signals produced by current text-to-speech systems are typically averaged over training material, and as such lack the variety and liveliness found in natural speech. To avoid monotony and averaged prosody contours, it is desirable to have a way of modeling the variation in the prosodic aspects of speech, so audio signals can be synthesized in multiple ways for a given text. We present a new, hierarchically structured conditional variational auto-encoder to generate prosodic features (fundamental frequency, energy and duration) suitable for use with a vocoder or a generative model like WaveNet. At inference time, an embedding representing the prosody of a sentence may be sampled from the variational layer to allow for prosodic variation. To efficiently capture the hierarchical nature of the linguistic input (words, syllables and phones), both the encoder and decoder parts of the auto-encoder are hierarchical, in line with the linguistic structure, with layers being clocked dynamically at the respective rates. 
We show in our experiments that our dynamic hierarchical network outperforms a non-hierarchical state-of-the-art baseline, and, additionally, that prosody transfer across sentences is possible by employing the prosody embedding of one sentence to generate the speech signal of another.}\n}", "pdf": "http://proceedings.mlr.press/v97/kenter19a/kenter19a.pdf", "supp": "", "pdf_size": 3898303, "gs_citation": 111, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13818984800980951519&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "TTS Research, Google UK, London; TTS Research, Google UK, London; TTS Research, Google UK, London; University of West Bohemia, work carried out whilst at Google; TTS Research, Google UK, London", "aff_domain": "google.com;google.com;google.com;google.com;google.com", "email": "google.com;google.com;google.com;google.com;google.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/kenter19a.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Google;University of West Bohemia", "aff_unique_dep": "TTS Research;", "aff_unique_url": "https://www.google.com;https://www.zcu.cz", "aff_unique_abbr": "Google;UWB", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United Kingdom;Czech Republic" }, { "title": "COMIC: Multi-view Clustering Without Parameter Selection", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3777", "id": "3777", "author_site": "Xi Peng, Zhenyu Huang, Jiancheng Lv, Hongyuan Zhu, Joey Tianyi Zhou", "author": "Xi Peng; Zhenyu Huang; Jiancheng Lv; Hongyuan Zhu; Joey Tianyi Zhou", "abstract": "In this paper, we study two challenges in clustering analysis, namely, how to cluster multi-view data and how to perform clustering without parameter selection on cluster size. To this end, we propose a novel objective function to project raw data into one space in which the projection embraces the geometric consistency (GC) and the cluster assignment consistency (CAC). To be specific, the GC aims to learn a connection graph from a projection space wherein the data points are connected if and only if they belong to the same cluster. The CAC aims to minimize the discrepancy of pairwise connection graphs induced from different views based on the view-consensus assumption,", "bibtex": "@InProceedings{pmlr-v97-peng19a,\n title = \t {{COMIC}: Multi-view Clustering Without Parameter Selection},\n author = {Peng, Xi and Huang, Zhenyu and Lv, Jiancheng and Zhu, Hongyuan and Zhou, Joey Tianyi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5092--5101},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/peng19a/peng19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/peng19a.html},\n abstract = \t {In this paper, we study two challenges in clustering analysis, namely, how to cluster multi-view data and how to perform clustering without parameter selection on cluster size. To this end, we propose a novel objective function to project raw data into one space in which the projection embraces the geometric consistency (GC) and the cluster assignment consistency (CAC). 
To be specific, the GC aims to learn a connection graph from a projection space wherein the data points are connected if and only if they belong to the same cluster. The CAC aims to minimize the discrepancy of pairwise connection graphs induced from different views based on the view-consensus assumption,", "pdf": "http://proceedings.mlr.press/v97/peng19a/peng19a.pdf", "supp": "", "pdf_size": 1047550, "gs_citation": 374, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2409659189727820554&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "College of Computer Science, Sichuan University, Chengdu, China; College of Computer Science, Sichuan University, Chengdu, China; College of Computer Science, Sichuan University, Chengdu, China; Institute for Infocomm Research, A*STAR, Singapore; Institute of Performance Computing, A*STAR, Singapore", "aff_domain": "gmail.com; ; ; ;gmail.com", "email": "gmail.com; ; ; ;gmail.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/peng19a.html", "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Sichuan University;Institute for Infocomm Research;A*STAR", "aff_unique_dep": "College of Computer Science;;Institute of Performance Computing", "aff_unique_url": "https://www.scu.edu.cn;https://www.i2r.a-star.edu.sg;https://www.a-star.edu.sg", "aff_unique_abbr": "SCU;I2R;A*STAR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Chengdu;", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "China;Singapore" }, { "title": "CURIOUS: Intrinsically Motivated Modular Multi-Goal Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3722", "id": "3722", "author_site": "C\u00e9dric Colas, Pierre-Yves Oudeyer, Olivier Sigaud, Pierre Fournier, Mohamed Chetouani", "author": "C\u00e9dric Colas; Pierre Fournier; Mohamed Chetouani; Olivier Sigaud; Pierre-Yves Oudeyer", "abstract": "In open-ended environments, autonomous learning agents must set their own goals and build their own curriculum through an intrinsically motivated exploration. They may consider a large diversity of goals, aiming to discover what is controllable in their environments, and what is not. Because some goals might prove easy and some impossible, agents must actively select which goal to practice at any moment, to maximize their overall mastery on the set of learnable goals. This paper proposes CURIOUS , an algorithm that leverages 1) a modular Universal Value Function Approximator with hindsight learning to achieve a diversity of goals of different kinds within a unique policy and 2) an automated curriculum learning mechanism that biases the attention of the agent towards goals maximizing the absolute learning progress. Agents focus sequentially on goals of increasing complexity, and focus back on goals that are being forgotten. 
Experiments conducted in a new modular-goal robotic environment show the resulting developmental self-organization of a learning curriculum, and demonstrate properties of robustness to distracting goals, forgetting and changes in body properties.", "bibtex": "@InProceedings{pmlr-v97-colas19a,\n title = \t {{CURIOUS}: Intrinsically Motivated Modular Multi-Goal Reinforcement Learning},\n author = {Colas, C{\\'e}dric and Fournier, Pierre and Chetouani, Mohamed and Sigaud, Olivier and Oudeyer, Pierre-Yves},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1331--1340},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/colas19a/colas19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/colas19a.html},\n abstract = \t {In open-ended environments, autonomous learning agents must set their own goals and build their own curriculum through an intrinsically motivated exploration. They may consider a large diversity of goals, aiming to discover what is controllable in their environments, and what is not. Because some goals might prove easy and some impossible, agents must actively select which goal to practice at any moment, to maximize their overall mastery on the set of learnable goals. This paper proposes CURIOUS , an algorithm that leverages 1) a modular Universal Value Function Approximator with hindsight learning to achieve a diversity of goals of different kinds within a unique policy and 2) an automated curriculum learning mechanism that biases the attention of the agent towards goals maximizing the absolute learning progress. Agents focus sequentially on goals of increasing complexity, and focus back on goals that are being forgotten. 
Experiments conducted in a new modular-goal robotic environment show the resulting developmental self-organization of a learning curriculum, and demonstrate properties of robustness to distracting goals, forgetting and changes in body properties.}\n}", "pdf": "http://proceedings.mlr.press/v97/colas19a/colas19a.pdf", "supp": "", "pdf_size": 3118522, "gs_citation": 218, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=329489517258350795&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Flowers Team, Inria and Ensta ParisTech, FR; ISIR, Sorbonne Univ., Paris, FR; ISIR, Sorbonne Univ., Paris, FR; ISIR, Sorbonne Univ., Paris, FR; Flowers Team, Inria and Ensta ParisTech, FR", "aff_domain": "inria.fr; ; ; ; ", "email": "inria.fr; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/colas19a.html", "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "INRIA;Sorbonne University", "aff_unique_dep": "Flowers Team;Institut des Sciences de l'Ing\u00e9nierie de Robotique", "aff_unique_url": "https://www.inria.fr;https://www.sorbonne-universite.fr", "aff_unique_abbr": "Inria;Sorbonne Univ.", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "France" }, { "title": "Calibrated Approximate Bayesian Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4074", "id": "4074", "author_site": "Hanwen Xing, Geoff Nicholls, Jeong Lee", "author": "Hanwen Xing; Geoff Nicholls; Jeong Lee", "abstract": "We give a general purpose computational framework for estimating the bias in coverage resulting from making approximations in Bayesian inference. Coverage is the probability credible sets cover true parameter values. We show how to estimate the actual coverage an approximation scheme achieves when the ideal observation model and the prior can be simulated, but have been replaced, in the Monte Carlo, with approximations as they are intractable. Coverage estimation procedures given in Lee et al. (2018) work well on simple problems, but are biased, and do not scale well, as those authors note. For example, the methods of Lee et al. (2018) fail for calibration of an approximate completely collapsed MCMC algorithm for partition structure in a Dirichlet process for clustering group labels in a hierarchical model. By exploiting the symmetry of the coverage error under permutation of low level group labels and smoothing with Bayesian Additive Regression Trees, we are able to show that the original approximate inference had poor coverage and should not be trusted.", "bibtex": "@InProceedings{pmlr-v97-xing19a,\n title = \t {Calibrated Approximate {B}ayesian Inference},\n author = {Xing, Hanwen and Nicholls, Geoff and Lee, Jeong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6912--6920},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/xing19a/xing19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/xing19a.html},\n abstract = \t {We give a general purpose computational framework for estimating the bias in coverage resulting from making approximations in Bayesian inference. Coverage is the probability credible sets cover true parameter values. 
We show how to estimate the actual coverage an approximation scheme achieves when the ideal observation model and the prior can be simulated, but have been replaced, in the Monte Carlo, with approximations as they are intractable. Coverage estimation procedures given in Lee et al. (2018) work well on simple problems, but are biased, and do not scale well, as those authors note. For example, the methods of Lee et al. (2018) fail for calibration of an approximate completely collapsed MCMC algorithm for partition structure in a Dirichlet process for clustering group labels in a hierarchical model. By exploiting the symmetry of the coverage error under permutation of low level group labels and smoothing with Bayesian Additive Regression Trees, we are able to show that the original approximate inference had poor coverage and should not be trusted.}\n}", "pdf": "http://proceedings.mlr.press/v97/xing19a/xing19a.pdf", "supp": "", "pdf_size": 395665, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14447677340398726198&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Statistics, University of Oxford, UK; Department of Statistics, University of Oxford, UK; Department of Statistics, University of Auckland, New Zealand", "aff_domain": "stx.ox.ac.uk; ; ", "email": "stx.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/xing19a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Oxford;University of Auckland", "aff_unique_dep": "Department of Statistics;Department of Statistics", "aff_unique_url": "https://www.ox.ac.uk;https://www.auckland.ac.nz", "aff_unique_abbr": "Oxford;UoA", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Oxford;Auckland", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United Kingdom;New Zealand" }, { "title": "Calibrated Model-Based Deep Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3997", "id": "3997", "author_site": "Ali Malik, Volodymyr Kuleshov, Jiaming Song, Danny Nemer, Harlan Seymour, Stefano Ermon", "author": "Ali Malik; Volodymyr Kuleshov; Jiaming Song; Danny Nemer; Harlan Seymour; Stefano Ermon", "abstract": "Estimates of predictive uncertainty are important for accurate model-based planning and reinforcement learning. However, predictive uncertainties \u2014 especially ones derived from modern deep learning systems \u2014 can be inaccurate and impose a bottleneck on performance. This paper explores which uncertainties are needed for model-based reinforcement learning and argues that ideal uncertainties should be calibrated, i.e. their probabilities should match empirical frequencies of predicted events. We describe a simple way to augment any model-based reinforcement learning agent with a calibrated model and show that doing so consistently improves planning, sample complexity, and exploration. On the \\textsc{HalfCheetah} MuJoCo task, our system achieves state-of-the-art performance using 50% fewer samples than the current leading approach. 
Our findings suggest that calibration can improve the performance of model-based reinforcement learning with minimal computational and implementation overhead.", "bibtex": "@InProceedings{pmlr-v97-malik19a,\n title = \t {Calibrated Model-Based Deep Reinforcement Learning},\n author = {Malik, Ali and Kuleshov, Volodymyr and Song, Jiaming and Nemer, Danny and Seymour, Harlan and Ermon, Stefano},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4314--4323},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/malik19a/malik19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/malik19a.html},\n abstract = \t {Estimates of predictive uncertainty are important for accurate model-based planning and reinforcement learning. However, predictive uncertainties \u2014 especially ones derived from modern deep learning systems \u2014 can be inaccurate and impose a bottleneck on performance. This paper explores which uncertainties are needed for model-based reinforcement learning and argues that ideal uncertainties should be calibrated, i.e. their probabilities should match empirical frequencies of predicted events. We describe a simple way to augment any model-based reinforcement learning agent with a calibrated model and show that doing so consistently improves planning, sample complexity, and exploration. On the \\textsc{HalfCheetah} MuJoCo task, our system achieves state-of-the-art performance using 50% fewer samples than the current leading approach. Our findings suggest that calibration can improve the performance of model-based reinforcement learning with minimal computational and implementation overhead.}\n}", "pdf": "http://proceedings.mlr.press/v97/malik19a/malik19a.pdf", "supp": "", "pdf_size": 925503, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4253913195601739122&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, Stanford University; Department of Computer Science, Stanford University + Afresh Technologies; Department of Computer Science, Stanford University; Afresh Technologies; Afresh Technologies; Department of Computer Science, Stanford University", "aff_domain": "stanford.edu;cs.stanford.edu; ; ; ;", "email": "stanford.edu;cs.stanford.edu; ; ; ;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/malik19a.html", "aff_unique_index": "0;0+1;0;1;1;0", "aff_unique_norm": "Stanford University;Afresh Technologies", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.stanford.edu;", "aff_unique_abbr": "Stanford;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "title": "CapsAndRuns: An Improved Method for Approximately Optimal Algorithm Configuration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3650", "id": "3650", "author_site": "Gell\u00e9rt Weisz, Andr\u00e1s Gy\u00f6rgy, Csaba Szepesvari", "author": "Gellert Weisz; Andras Gyorgy; Csaba Szepesvari", "abstract": "We consider the problem of configuring general-purpose solvers to run efficiently on problem instances drawn from an unknown distribution, a problem of major interest in solver 
autoconfiguration. Following previous work, we focus on designing algorithms that find a configuration with near-optimal expected capped runtime while doing the least amount of work, with the cap chosen in a configuration-specific way so that most instances are solved. In this paper we present a new algorithm, CapsAndRuns, which finds a near-optimal configuration while using time that scales (in a problem dependent way) with the optimal expected capped runtime, significantly strengthening previous results which could only guarantee a bound that scaled with the potentially much larger optimal expected uncapped runtime. The new algorithm is simpler and more intuitive than the previous methods: first it estimates the optimal runtime cap for each configuration, then it uses a Bernstein race to find a near optimal configuration given the caps. Experiments verify that our method can significantly outperform its competitors.", "bibtex": "@InProceedings{pmlr-v97-weisz19a,\n title = \t {{C}aps{A}nd{R}uns: An Improved Method for Approximately Optimal Algorithm Configuration},\n author = {Weisz, Gellert and Gyorgy, Andras and Szepesvari, Csaba},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6707--6715},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/weisz19a/weisz19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/weisz19a.html},\n abstract = \t {We consider the problem of configuring general-purpose solvers to run efficiently on problem instances drawn from an unknown distribution, a problem of major interest in solver autoconfiguration. Following previous work, we focus on designing algorithms that find a configuration with near-optimal expected capped runtime while doing the least amount of work, with the cap chosen in a configuration-specific way so that most instances are solved. In this paper we present a new algorithm, CapsAndRuns, which finds a near-optimal configuration while using time that scales (in a problem dependent way) with the optimal expected capped runtime, significantly strengthening previous results which could only guarantee a bound that scaled with the potentially much larger optimal expected uncapped runtime. The new algorithm is simpler and more intuitive than the previous methods: first it estimates the optimal runtime cap for each configuration, then it uses a Bernstein race to find a near optimal configuration given the caps. 
Experiments verify that our method can significantly outperform its competitors.}\n}", "pdf": "http://proceedings.mlr.press/v97/weisz19a/weisz19a.pdf", "supp": "", "pdf_size": 492188, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7703392022575034680&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "DeepMind, London, UK + Imperial College London, London, UK; DeepMind, London, UK + University of Alberta, Edmonton, AB, Canada; DeepMind, London, UK + University of Alberta, Edmonton, AB, Canada", "aff_domain": "google.com;google.com;google.com", "email": "google.com;google.com;google.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/weisz19a.html", "aff_unique_index": "0+1;0+2;0+2", "aff_unique_norm": "DeepMind;Imperial College London;University of Alberta", "aff_unique_dep": ";;", "aff_unique_url": "https://deepmind.com;https://www.imperial.ac.uk;https://www.ualberta.ca", "aff_unique_abbr": "DeepMind;ICL;UAlberta", "aff_campus_unique_index": "0+0;0+1;0+1", "aff_campus_unique": "London;Edmonton", "aff_country_unique_index": "0+0;0+1;0+1", "aff_country_unique": "United Kingdom;Canada" }, { "title": "Categorical Feature Compression via Submodular Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4276", "id": "4276", "author_site": "Mohammad Hossein Bateni, Lin Chen, Hossein Esfandiari, Thomas Fu, Vahab Mirrokni, Afshin Rostamizadeh", "author": "Mohammadhossein Bateni; Lin Chen; Hossein Esfandiari; Thomas Fu; Vahab Mirrokni; Afshin Rostamizadeh", "abstract": "In the era of big data, learning from categorical features with very large vocabularies (e.g., 28 million for the Criteo click prediction dataset) has become a practical challenge for machine learning researchers and practitioners. We design a highly-scalable vocabulary compression algorithm that seeks to maximize the mutual information between the compressed categorical feature and the target binary labels and we furthermore show that its solution is guaranteed to be within a $1-1/e \\approx 63%$ factor of the global optimal solution. Although in some settings, entropy-based set functions are known to be submodular, this is not the case for the mutual information objective we consider (mutual information with respect to the target labels). To address this, we introduce a novel re-parametrization of the mutual information objective, which we prove is submodular, and also design a data structure to query the submodular function in amortized $O(\\log n )$ time (where $n$ is the input vocabulary size). Our complete algorithm is shown to operate in $O(n \\log n )$ time. Additionally, we design a distributed implementation in which the query data structure is decomposed across $O(k)$ machines such that each machine only requires $O(\\frac n k)$ space, while still preserving the approximation guarantee and using only logarithmic rounds of computation. We also provide analysis of simple alternative heuristic compression methods to demonstrate they cannot achieve any approximation guarantee. 
Using the large-scale Criteo learning task, we demonstrate better performance in retaining mutual information and also verify competitive learning performance compared to other baseline methods.", "bibtex": "@InProceedings{pmlr-v97-bateni19a,\n title = \t {Categorical Feature Compression via Submodular Optimization},\n author = {Bateni, Mohammadhossein and Chen, Lin and Esfandiari, Hossein and Fu, Thomas and Mirrokni, Vahab and Rostamizadeh, Afshin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {515--523},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bateni19a/bateni19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bateni19a.html},\n abstract = \t {In the era of big data, learning from categorical features with very large vocabularies (e.g., 28 million for the Criteo click prediction dataset) has become a practical challenge for machine learning researchers and practitioners. We design a highly-scalable vocabulary compression algorithm that seeks to maximize the mutual information between the compressed categorical feature and the target binary labels and we furthermore show that its solution is guaranteed to be within a $1-1/e \\approx 63%$ factor of the global optimal solution. Although in some settings, entropy-based set functions are known to be submodular, this is not the case for the mutual information objective we consider (mutual information with respect to the target labels). To address this, we introduce a novel re-parametrization of the mutual information objective, which we prove is submodular, and also design a data structure to query the submodular function in amortized $O(\\log n )$ time (where $n$ is the input vocabulary size). Our complete algorithm is shown to operate in $O(n \\log n )$ time. Additionally, we design a distributed implementation in which the query data structure is decomposed across $O(k)$ machines such that each machine only requires $O(\\frac n k)$ space, while still preserving the approximation guarantee and using only logarithmic rounds of computation. We also provide analysis of simple alternative heuristic compression methods to demonstrate they cannot achieve any approximation guarantee. 
Using the large-scale Criteo learning task, we demonstrate better performance in retaining mutual information and also verify competitive learning performance compared to other baseline methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/bateni19a/bateni19a.pdf", "supp": "", "pdf_size": 360061, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17288693505725090263&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Google, New York, NY, USA+Department of Electrical Engineering, Yale University, New Haven, CT, USA; Google, New York, NY, USA; Google, New York, NY, USA; Google, New York, NY, USA; Google, New York, NY, USA; Google, New York, NY, USA", "aff_domain": "yale.edu; ; ; ; ; ", "email": "yale.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/bateni19a.html", "aff_unique_index": "0+1;0;0;0;0;0", "aff_unique_norm": "Google;Yale University", "aff_unique_dep": "Google;Department of Electrical Engineering", "aff_unique_url": "https://www.google.com;https://www.yale.edu", "aff_unique_abbr": "Google;Yale", "aff_campus_unique_index": "0+1;0;0;0;0;0", "aff_campus_unique": "New York;New Haven", "aff_country_unique_index": "0+0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Causal Discovery and Forecasting in Nonstationary Environments with State-Space Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4045", "id": "4045", "author_site": "Biwei Huang, Kun Zhang, Mingming Gong, Clark Glymour", "author": "Biwei Huang; Kun Zhang; Mingming Gong; Clark Glymour", "abstract": "In many scientific fields, such as economics and neuroscience, we are often faced with nonstationary time series, and concerned with both finding causal relations and forecasting the values of variables of interest, both of which are particularly challenging in such nonstationary environments. In this paper, we study causal discovery and forecasting for nonstationary time series. By exploiting a particular type of state-space model to represent the processes, we show that nonstationarity helps to identify the causal structure, and that forecasting naturally benefits from learned causal knowledge. Specifically, we allow changes in both causal strengths and noise variances in the nonlinear state-space models, which, interestingly, renders both the causal structure and model parameters identifiable. Given the causal model, we treat forecasting as a problem in Bayesian inference in the causal model, which exploits the time-varying property of the data and adapts to new observations in a principled manner. 
Experimental results on synthetic and real-world data sets demonstrate the efficacy of the proposed methods.", "bibtex": "@InProceedings{pmlr-v97-huang19g,\n title = \t {Causal Discovery and Forecasting in Nonstationary Environments with State-Space Models},\n author = {Huang, Biwei and Zhang, Kun and Gong, Mingming and Glymour, Clark},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2901--2910},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/huang19g/huang19g.pdf},\n url = \t {https://proceedings.mlr.press/v97/huang19g.html},\n abstract = \t {In many scientific fields, such as economics and neuroscience, we are often faced with nonstationary time series, and concerned with both finding causal relations and forecasting the values of variables of interest, both of which are particularly challenging in such nonstationary environments. In this paper, we study causal discovery and forecasting for nonstationary time series. By exploiting a particular type of state-space model to represent the processes, we show that nonstationarity helps to identify the causal structure, and that forecasting naturally benefits from learned causal knowledge. Specifically, we allow changes in both causal strengths and noise variances in the nonlinear state-space models, which, interestingly, renders both the causal structure and model parameters identifiable. Given the causal model, we treat forecasting as a problem in Bayesian inference in the causal model, which exploits the time-varying property of the data and adapts to new observations in a principled manner. 
Experimental results on synthetic and real-world data sets demonstrate the efficacy of the proposed methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/huang19g/huang19g.pdf", "supp": "", "pdf_size": 507679, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5202972012982886996&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": "Department of Philosophy, Carnegie Mellon University, Pittsburgh; Department of Philosophy, Carnegie Mellon University, Pittsburgh; Department of Philosophy, Carnegie Mellon University, Pittsburgh + Department of Biomedical Informatics, University of Pittsburgh, Pittsburgh; Department of Philosophy, Carnegie Mellon University, Pittsburgh", "aff_domain": "andrew.cmu.edu; ; ; ", "email": "andrew.cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/huang19g.html", "aff_unique_index": "0;0;0+1;0", "aff_unique_norm": "Carnegie Mellon University;University of Pittsburgh", "aff_unique_dep": "Department of Philosophy;Department of Biomedical Informatics", "aff_unique_url": "https://www.cmu.edu;https://www.pitt.edu", "aff_unique_abbr": "CMU;Pitt", "aff_campus_unique_index": "0;0;0+0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0+0;0", "aff_country_unique": "United States" }, { "title": "Causal Identification under Markov Equivalence: Completeness Results", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3841", "id": "3841", "author_site": "Amin Jaber, Jiji Zhang, Elias Bareinboim", "author": "Amin Jaber; Jiji Zhang; Elias Bareinboim", "abstract": "Causal effect identification is the task of determining whether a causal distribution is computable from the combination of an observational distribution and substantive knowledge about the domain under investigation. One of the most studied versions of this problem assumes that knowledge is articulated in the form of a fully known causal diagram, which is arguably a strong assumption in many settings. In this paper, we relax this requirement and consider that the knowledge is articulated in the form of an equivalence class of causal diagrams, in particular, a partial ancestral graph (PAG). This is attractive because a PAG can be learned directly from data, and the scientist does not need to commit to a particular, unique diagram. There are different sufficient conditions for identification in PAGs, but none is complete. We derive a complete algorithm for identification given a PAG. This implies that whenever the causal effect is identifiable, the algorithm returns a valid identification expression; alternatively, it will throw a failure condition, which means that the effect is provably not identifiable. 
We further provide a graphical characterization of non-identifiability of causal effects in PAGs.", "bibtex": "@InProceedings{pmlr-v97-jaber19a,\n title = \t {Causal Identification under {M}arkov Equivalence: Completeness Results},\n author = {Jaber, Amin and Zhang, Jiji and Bareinboim, Elias},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2981--2989},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jaber19a/jaber19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jaber19a.html},\n abstract = \t {Causal effect identification is the task of determining whether a causal distribution is computable from the combination of an observational distribution and substantive knowledge about the domain under investigation. One of the most studied versions of this problem assumes that knowledge is articulated in the form of a fully known causal diagram, which is arguably a strong assumption in many settings. In this paper, we relax this requirement and consider that the knowledge is articulated in the form of an equivalence class of causal diagrams, in particular, a partial ancestral graph (PAG). This is attractive because a PAG can be learned directly from data, and the scientist does not need to commit to a particular, unique diagram. There are different sufficient conditions for identification in PAGs, but none is complete. We derive a complete algorithm for identification given a PAG. This implies that whenever the causal effect is identifiable, the algorithm returns a valid identification expression; alternatively, it will throw a failure condition, which means that the effect is provably not identifiable. 
We further provide a graphical characterization of non-identifiability of causal effects in PAGs.}\n}", "pdf": "http://proceedings.mlr.press/v97/jaber19a/jaber19a.pdf", "supp": "", "pdf_size": 291246, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7262896135619600146&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 15, "aff": "Department of Computer Science, Purdue University, West Lafayette, USA; Department of Philosophy, Lingnan University, NT, HK; Department of Computer Science, Purdue University, West Lafayette, USA", "aff_domain": "purdue.edu; ;purdue.edu", "email": "purdue.edu; ;purdue.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/jaber19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Purdue University;Lingnan University", "aff_unique_dep": "Department of Computer Science;Department of Philosophy", "aff_unique_url": "https://www.purdue.edu;https://www.ln.edu.hk", "aff_unique_abbr": "Purdue;Lingnan", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "West Lafayette;Hong Kong SAR", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "title": "Cautious Regret Minimization: Online Optimization with Long-Term Budget Constraints", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4210", "id": "4210", "author_site": "Nikolaos Liakopoulos, Apostolos Destounis, Georgios Paschos, Thrasyvoulos Spyropoulos, Panayotis Mertikopoulos", "author": "Nikolaos Liakopoulos; Apostolos Destounis; Georgios Paschos; Thrasyvoulos Spyropoulos; Panayotis Mertikopoulos", "abstract": "We study a class of online convex optimization problems with long-term budget constraints that arise naturally as reliability guarantees or total consumption constraints. In this general setting, prior work by Mannor et al. (2009) has shown that achieving no regret is impossible if the functions defining the agent\u2019s budget are chosen by an adversary. To overcome this obstacle, we refine the agent\u2019s regret metric by introducing the notion of a \"K-benchmark\", i.e., a comparator which meets the problem\u2019s allotted budget over any window of length K. The impossibility analysis of Mannor et al. (2009) is recovered when K=T; however, for K=o(T), we show that it is possible to minimize regret while still meeting the problem\u2019s long-term budget constraints. 
We achieve this via an online learning policy based on Cautious Online Lagrangian Descent (COLD) for which we derive explicit bounds, in terms of both the incurred regret and the residual budget violations.", "bibtex": "@InProceedings{pmlr-v97-liakopoulos19a,\n title = \t {Cautious Regret Minimization: Online Optimization with Long-Term Budget Constraints},\n author = {Liakopoulos, Nikolaos and Destounis, Apostolos and Paschos, Georgios and Spyropoulos, Thrasyvoulos and Mertikopoulos, Panayotis},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3944--3952},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liakopoulos19a/liakopoulos19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/liakopoulos19a.html},\n abstract = \t {We study a class of online convex optimization problems with long-term budget constraints that arise naturally as reliability guarantees or total consumption constraints. In this general setting, prior work by Mannor et al. (2009) has shown that achieving no regret is impossible if the functions defining the agent\u2019s budget are chosen by an adversary. To overcome this obstacle, we refine the agent\u2019s regret metric by introducing the notion of a \"K-benchmark\", i.e., a comparator which meets the problem\u2019s allotted budget over any window of length K. The impossibility analysis of Mannor et al. (2009) is recovered when K=T; however, for K=o(T), we show that it is possible to minimize regret while still meeting the problem\u2019s long-term budget constraints. We achieve this via an online learning policy based on Cautious Online Lagrangian Descent (COLD) for which we derive explicit bounds, in terms of both the incurred regret and the residual budget violations.}\n}", "pdf": "http://proceedings.mlr.press/v97/liakopoulos19a/liakopoulos19a.pdf", "supp": "", "pdf_size": 725045, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6053265533820042967&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "aff": "Paris Research Center, Huawei Technologies, Paris, France+Univ. Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, Grenoble, France; Paris Research Center, Huawei Technologies, Paris, France; Paris Research Center, Huawei Technologies, Paris, France; EURECOM, Sophia-Antipolis, France; Univ. 
Grenoble Alpes, CNRS, Inria, Grenoble INP, LIG, Grenoble, France", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/liakopoulos19a.html", "aff_unique_index": "0+1;0;0;2;1", "aff_unique_norm": "Huawei;Universite Grenoble Alpes;EURECOM", "aff_unique_dep": "Paris Research Center;;", "aff_unique_url": "https://www.huawei.com;https://www.univ-grenoble-alpes.fr;https://www.eurecom.fr", "aff_unique_abbr": "Huawei;UGA;", "aff_campus_unique_index": "0+1;0;0;2;1", "aff_campus_unique": "Paris;Grenoble;Sophia-Antipolis", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "France" }, { "title": "Certified Adversarial Robustness via Randomized Smoothing", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3907", "id": "3907", "author_site": "Jeremy Cohen, Elan Rosenfeld, Zico Kolter", "author": "Jeremy Cohen; Elan Rosenfeld; Zico Kolter", "abstract": "We show how to turn any classifier that classifies well under Gaussian noise into a new classifier that is certifiably robust to adversarial perturbations under the L2 norm. While this \"randomized smoothing\" technique has been proposed before in the literature, we are the first to provide a tight analysis, which establishes a close connection between L2 robustness and Gaussian noise. We use the technique to train an ImageNet classifier with e.g. a certified top-1 accuracy of 49% under adversarial perturbations with L2 norm less than 0.5 (=127/255). Smoothing is the only approach to certifiably robust classification which has been shown feasible on full-resolution ImageNet. On smaller-scale datasets where competing approaches to certified L2 robustness are viable, smoothing delivers higher certified accuracies. The empirical success of the approach suggests that provable methods based on randomization at prediction time are a promising direction for future research into adversarially robust classification.", "bibtex": "@InProceedings{pmlr-v97-cohen19c,\n title = \t {Certified Adversarial Robustness via Randomized Smoothing},\n author = {Cohen, Jeremy and Rosenfeld, Elan and Kolter, Zico},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1310--1320},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cohen19c/cohen19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/cohen19c.html},\n abstract = \t {We show how to turn any classifier that classifies well under Gaussian noise into a new classifier that is certifiably robust to adversarial perturbations under the L2 norm. While this \"randomized smoothing\" technique has been proposed before in the literature, we are the first to provide a tight analysis, which establishes a close connection between L2 robustness and Gaussian noise. We use the technique to train an ImageNet classifier with e.g. a certified top-1 accuracy of 49% under adversarial perturbations with L2 norm less than 0.5 (=127/255). Smoothing is the only approach to certifiably robust classification which has been shown feasible on full-resolution ImageNet. On smaller-scale datasets where competing approaches to certified L2 robustness are viable, smoothing delivers higher certified accuracies. 
The empirical success of the approach suggests that provable methods based on randomization at prediction time are a promising direction for future research into adversarially robust classification.}\n}", "pdf": "http://proceedings.mlr.press/v97/cohen19c/cohen19c.pdf", "supp": "", "pdf_size": 6215188, "gs_citation": 2480, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7039519782328477041&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Carnegie Mellon University; Carnegie Mellon University; Carnegie Mellon University + Bosch Center for AI", "aff_domain": "cmu.edu; ; ", "email": "cmu.edu; ; ", "github": "http://github.com/locuslab/smoothing", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/cohen19c.html", "aff_unique_index": "0;0;0+1", "aff_unique_norm": "Carnegie Mellon University;Bosch Center for AI", "aff_unique_dep": ";Center for AI", "aff_unique_url": "https://www.cmu.edu;https://www.bosch-ai.com", "aff_unique_abbr": "CMU;BCAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1", "aff_country_unique": "United States;Germany" }, { "title": "Challenging Common Assumptions in the Unsupervised Learning of Disentangled Representations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4085", "id": "4085", "author_site": "Francesco Locatello, Stefan Bauer, Mario Lucic, Gunnar Ratsch, Sylvain Gelly, Bernhard Sch\u00f6lkopf, Olivier Bachem", "author": "Francesco Locatello; Stefan Bauer; Mario Lucic; Gunnar Raetsch; Sylvain Gelly; Bernhard Sch\u00f6lkopf; Olivier Bachem", "abstract": "The key idea behind the unsupervised learning of disentangled representations is that real-world data is generated by a few explanatory factors of variation which can be recovered by unsupervised learning algorithms. In this paper, we provide a sober look at recent progress in the field and challenge some common assumptions. We first theoretically show that the unsupervised learning of disentangled representations is fundamentally impossible without inductive biases on both the models and the data. Then, we train more than $12000$ models covering most prominent methods and evaluation metrics in a reproducible large-scale experimental study on seven different data sets. We observe that while the different methods successfully enforce properties \u201cencouraged\u201d by the corresponding losses, well-disentangled models seemingly cannot be identified without supervision. Furthermore, increased disentanglement does not seem to lead to a decreased sample complexity of learning for downstream tasks. 
Our results suggest that future work on disentanglement learning should be explicit about the role of inductive biases and (implicit) supervision, investigate concrete benefits of enforcing disentanglement of the learned representations, and consider a reproducible experimental setup covering several data sets.", "bibtex": "@InProceedings{pmlr-v97-locatello19a,\n title = \t {Challenging Common Assumptions in the Unsupervised Learning of Disentangled Representations},\n author = {Locatello, Francesco and Bauer, Stefan and Lucic, Mario and Raetsch, Gunnar and Gelly, Sylvain and Sch{\\\"o}lkopf, Bernhard and Bachem, Olivier},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4114--4124},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/locatello19a/locatello19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/locatello19a.html},\n abstract = \t {The key idea behind the unsupervised learning of disentangled representations is that real-world data is generated by a few explanatory factors of variation which can be recovered by unsupervised learning algorithms. In this paper, we provide a sober look at recent progress in the field and challenge some common assumptions. We first theoretically show that the unsupervised learning of disentangled representations is fundamentally impossible without inductive biases on both the models and the data. Then, we train more than $12000$ models covering most prominent methods and evaluation metrics in a reproducible large-scale experimental study on seven different data sets. We observe that while the different methods successfully enforce properties \u201cencouraged\u201d by the corresponding losses, well-disentangled models seemingly cannot be identified without supervision. Furthermore, increased disentanglement does not seem to lead to a decreased sample complexity of learning for downstream tasks. Our results suggest that future work on disentanglement learning should be explicit about the role of inductive biases and (implicit) supervision, investigate concrete benefits of enforcing disentanglement of the learned representations, and consider a reproducible experimental setup covering several data sets.}\n}", "pdf": "http://proceedings.mlr.press/v97/locatello19a/locatello19a.pdf", "supp": "", "pdf_size": 1143382, "gs_citation": 1772, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6335279708048204312&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/locatello19a.html" }, { "title": "Characterization of Convex Objective Functions and Optimal Expected Convergence Rates for SGD", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3843", "id": "3843", "author_site": "Marten van Dijk, Lam Nguyen, PHUONG_HA NGUYEN, Dzung Phan", "author": "Marten Van Dijk; Lam Nguyen; Phuong Ha Nguyen; Dzung Phan", "abstract": "We study Stochastic Gradient Descent (SGD) with diminishing step sizes for convex objective functions. We introduce a definitional framework and theory that defines and characterizes a core property, called curvature, of convex objective functions. 
In terms of curvature we can derive a new inequality that can be used to compute an optimal sequence of diminishing step sizes by solving a differential equation. Our exact solutions confirm known results in the literature and allow us to fully characterize a new regularizer with its corresponding expected convergence rates.", "bibtex": "@InProceedings{pmlr-v97-van-dijk19a,\n title = \t {Characterization of Convex Objective Functions and Optimal Expected Convergence Rates for {SGD}},\n author = {Van Dijk, Marten and Nguyen, Lam and Nguyen, Phuong Ha and Phan, Dzung},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6392--6400},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/van-dijk19a/van-dijk19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/van-dijk19a.html},\n abstract = \t {We study Stochastic Gradient Descent (SGD) with diminishing step sizes for convex objective functions. We introduce a definitional framework and theory that defines and characterizes a core property, called curvature, of convex objective functions. In terms of curvature we can derive a new inequality that can be used to compute an optimal sequence of diminishing step sizes by solving a differential equation. Our exact solutions confirm known results in the literature and allow us to fully characterize a new regularizer with its corresponding expected convergence rates.}\n}", "pdf": "http://proceedings.mlr.press/v97/van-dijk19a/van-dijk19a.pdf", "supp": "", "pdf_size": 406383, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3343994868665991028&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Electrical and Computer Engineering, University of Connecticut, CT, USA; IBM Research, Thomas J. Watson Research Center, NY, USA; Department of Electrical and Computer Engineering, University of Connecticut, CT, USA; IBM Research, Thomas J. Watson Research Center, NY, USA", "aff_domain": "uconn.edu;ibm.com;gmail.com;us.ibm.com", "email": "uconn.edu;ibm.com;gmail.com;us.ibm.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/van-dijk19a.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "University of Connecticut;IBM", "aff_unique_dep": "Department of Electrical and Computer Engineering;IBM Research", "aff_unique_url": "https://www.uconn.edu;https://www.ibm.com/research", "aff_unique_abbr": "UConn;IBM", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Connecticut;Thomas J. Watson Research Center", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Characterizing Well-Behaved vs. Pathological Deep Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4205", "id": "4205", "author": "Antoine Labatie", "abstract": "We introduce a novel approach, requiring only mild assumptions, for the characterization of deep neural networks at initialization. Our approach applies both to fully-connected and convolutional networks and easily incorporates batch normalization and skip-connections. 
Our key insight is to consider the evolution with depth of statistical moments of signal and noise, thereby characterizing the presence or absence of pathologies in the hypothesis space encoded by the choice of hyperparameters. We establish: (i) for feedforward networks, with and without batch normalization, the multiplicativity of layer composition inevitably leads to ill-behaved moments and pathologies; (ii) for residual networks with batch normalization, on the other hand, skip-connections induce power-law rather than exponential behaviour, leading to well-behaved moments and no pathology.", "bibtex": "@InProceedings{pmlr-v97-labatie19a,\n title = \t {Characterizing Well-Behaved vs. Pathological Deep Neural Networks},\n author = {Labatie, Antoine},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3611--3621},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/labatie19a/labatie19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/labatie19a.html},\n abstract = \t {We introduce a novel approach, requiring only mild assumptions, for the characterization of deep neural networks at initialization. Our approach applies both to fully-connected and convolutional networks and easily incorporates batch normalization and skip-connections. Our key insight is to consider the evolution with depth of statistical moments of signal and noise, thereby characterizing the presence or absence of pathologies in the hypothesis space encoded by the choice of hyperparameters. We establish: (i) for feedforward networks, with and without batch normalization, the multiplicativity of layer composition inevitably leads to ill-behaved moments and pathologies; (ii) for residual networks with batch normalization, on the other hand, skip-connections induce power-law rather than exponential behaviour, leading to well-behaved moments and no pathology.}\n}", "pdf": "http://proceedings.mlr.press/v97/labatie19a/labatie19a.pdf", "supp": "", "pdf_size": 895927, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3271469999043438586&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Labatie-AI", "aff_domain": "labatie.ai", "email": "labatie.ai", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/labatie19a.html", "aff_unique_index": "0", "aff_unique_norm": "Labatie-AI", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "" }, { "title": "Cheap Orthogonal Constraints in Neural Networks: A Simple Parametrization of the Orthogonal and Unitary Group", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3707", "id": "3707", "author_site": "Mario Lezcano Casado, David Mart\u00ednez-Rubio", "author": "Mario Lezcano-Casado; David Mart\u0131\u0301nez-Rubio", "abstract": "We introduce a novel approach to perform first-order optimization with orthogonal and unitary constraints. This approach is based on a parametrization stemming from Lie group theory through the exponential map. The parametrization transforms the constrained optimization problem into an unconstrained one over a Euclidean space, for which common first-order optimization methods can be used. 
The theoretical results presented are general enough to cover the special orthogonal group, the unitary group and, in general, any connected compact Lie group. We discuss how this and other parametrizations can be computed efficiently through an implementation trick, making numerically complex parametrizations usable at a negligible runtime cost in neural networks. In particular, we apply our results to RNNs with orthogonal recurrent weights, yielding a new architecture called expRNN. We demonstrate how our method constitutes a more robust approach to optimization with orthogonal constraints, showing faster, accurate, and more stable convergence in several tasks designed to test RNNs.", "bibtex": "@InProceedings{pmlr-v97-lezcano-casado19a,\n title = \t {Cheap Orthogonal Constraints in Neural Networks: A Simple Parametrization of the Orthogonal and Unitary Group},\n author = {Lezcano-Casado, Mario and Mart\\'{\\i}nez-Rubio, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3794--3803},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lezcano-casado19a/lezcano-casado19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lezcano-casado19a.html},\n abstract = \t {We introduce a novel approach to perform first-order optimization with orthogonal and unitary constraints. This approach is based on a parametrization stemming from Lie group theory through the exponential map. The parametrization transforms the constrained optimization problem into an unconstrained one over a Euclidean space, for which common first-order optimization methods can be used. The theoretical results presented are general enough to cover the special orthogonal group, the unitary group and, in general, any connected compact Lie group. We discuss how this and other parametrizations can be computed efficiently through an implementation trick, making numerically complex parametrizations usable at a negligible runtime cost in neural networks. In particular, we apply our results to RNNs with orthogonal recurrent weights, yielding a new architecture called expRNN. 
We demonstrate how our method constitutes a more robust approach to optimization with orthogonal constraints, showing faster, accurate, and more stable convergence in several tasks designed to test RNNs.}\n}", "pdf": "http://proceedings.mlr.press/v97/lezcano-casado19a/lezcano-casado19a.pdf", "supp": "", "pdf_size": 486406, "gs_citation": 256, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17536814525953471769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Mathematical Institute, University of Oxford, Oxford, United Kingdom; Department of Computer Science, University of Oxford, Oxford, United Kingdom", "aff_domain": "maths.ox.ac.uk; ", "email": "maths.ox.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/lezcano-casado19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Mathematical Institute", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Circuit-GNN: Graph Neural Networks for Distributed Circuit Design", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4281", "id": "4281", "author_site": "GUO ZHANG, Hao He, Dina Katabi", "author": "Guo Zhang; Hao He; Dina Katabi", "abstract": "We present Circuit-GNN, a graph neural network (GNN) model for designing distributed circuits. Today, designing distributed circuits is a slow process that can take months from an expert engineer. Our model both automates and speeds up the process. The model learns to simulate the electromagnetic (EM) properties of distributed circuits. Hence, it can be used to replace traditional EM simulators, which typically take tens of minutes for each design iteration. Further, by leveraging neural networks\u2019 differentiability, we can use our model to solve the inverse problem \u2013 i.e., given desirable EM specifications, we propagate the gradient to optimize the circuit parameters and topology to satisfy the specifications. We exploit the flexibility of GNN to create one model that works for different circuit topologies. We compare our model with a commercial simulator showing that it reduces simulation time by four orders of magnitude. We also demonstrate the value of our model by using it to design a Terahertz channelizer, a difficult task that requires a specialized expert. The results show that our model produces a channelizer whose performance is as good as a manually optimized design, and can save the expert several weeks of topology and parameter optimization. 
Most interestingly, our model comes up with new designs that differ from the limited templates commonly used by engineers in the field, hence significantly expanding the design space.", "bibtex": "@InProceedings{pmlr-v97-zhang19e,\n title = \t {Circuit-{GNN}: Graph Neural Networks for Distributed Circuit Design},\n author = {Zhang, Guo and He, Hao and Katabi, Dina},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7364--7373},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19e/zhang19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19e.html},\n abstract = \t {We present Circuit-GNN, a graph neural network (GNN) model for designing distributed circuits. Today, designing distributed circuits is a slow process that can take months from an expert engineer. Our model both automates and speeds up the process. The model learns to simulate the electromagnetic (EM) properties of distributed circuits. Hence, it can be used to replace traditional EM simulators, which typically take tens of minutes for each design iteration. Further, by leveraging neural networks\u2019 differentiability, we can use our model to solve the inverse problem \u2013 i.e., given desirable EM specifications, we propagate the gradient to optimize the circuit parameters and topology to satisfy the specifications. We exploit the flexibility of GNN to create one model that works for different circuit topologies. We compare our model with a commercial simulator showing that it reduces simulation time by four orders of magnitude. We also demonstrate the value of our model by using it to design a Terahertz channelizer, a difficult task that requires a specialized expert. The results show that our model produces a channelizer whose performance is as good as a manually optimized design, and can save the expert several weeks of topology and parameter optimization. 
Most interestingly, our model comes up with new designs that differ from the limited templates commonly used by engineers in the field, hence significantly expanding the design space.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19e/zhang19e.pdf", "supp": "", "pdf_size": 1543957, "gs_citation": 162, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14954427418588027035&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "EECS, Massachusetts Institute of Technology, Cambridge, MA, USA; EECS, Massachusetts Institute of Technology, Cambridge, MA, USA; EECS, Massachusetts Institute of Technology, Cambridge, MA, USA", "aff_domain": "mit.edu; ; ", "email": "mit.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zhang19e.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "EECS", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Classification from Positive, Unlabeled and Biased Negative Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3625", "id": "3625", "author_site": "Yu-Guan Hsieh, Gang Niu, Masashi Sugiyama", "author": "Yu-Guan Hsieh; Gang Niu; Masashi Sugiyama", "abstract": "In binary classification, there are situations where negative (N) data are too diverse to be fully labeled and we often resort to positive-unlabeled (PU) learning in these scenarios. However, collecting a non-representative N set that contains only a small portion of all possible N data can often be much easier in practice. This paper studies a novel classification framework which incorporates such biased N (bN) data in PU learning. We provide a method based on empirical risk minimization to address this PUbN classification problem. Our approach can be regarded as a novel example-weighting algorithm, with the weight of each example computed through a preliminary step that draws inspiration from PU learning. We also derive an estimation error bound for the proposed method. Experimental results demonstrate the effectiveness of our algorithm in not only PUbN learning scenarios but also ordinary PU learning scenarios on several benchmark datasets.", "bibtex": "@InProceedings{pmlr-v97-hsieh19c,\n title = \t {Classification from Positive, Unlabeled and Biased Negative Data},\n author = {Hsieh, Yu-Guan and Niu, Gang and Sugiyama, Masashi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2820--2829},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hsieh19c/hsieh19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/hsieh19c.html},\n abstract = \t {In binary classification, there are situations where negative (N) data are too diverse to be fully labeled and we often resort to positive-unlabeled (PU) learning in these scenarios. However, collecting a non-representative N set that contains only a small portion of all possible N data can often be much easier in practice. This paper studies a novel classification framework which incorporates such biased N (bN) data in PU learning. 
We provide a method based on empirical risk minimization to address this PUbN classification problem. Our approach can be regarded as a novel example-weighting algorithm, with the weight of each example computed through a preliminary step that draws inspiration from PU learning. We also derive an estimation error bound for the proposed method. Experimental results demonstrate the effectiveness of our algorithm in not only PUbN learning scenarios but also ordinary PU learning scenarios on several benchmark datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/hsieh19c/hsieh19c.pdf", "supp": "", "pdf_size": 1287413, "gs_citation": 102, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14812922163751509802&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "\u00b4Ecole Normale Sup \u00b4erieure, Paris, France+RIKEN, Tokyo, Japan; RIKEN, Tokyo, Japan; The University of Tokyo, Tokyo, Japan", "aff_domain": "ens.fr; ; ", "email": "ens.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/hsieh19c.html", "aff_unique_index": "0+1;1;2", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure;RIKEN;University of Tokyo", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ens.fr;https://www.riken.jp;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "ENS;RIKEN;UTokyo", "aff_campus_unique_index": "0+1;1;1", "aff_campus_unique": "Paris;Tokyo", "aff_country_unique_index": "0+1;1;1", "aff_country_unique": "France;Japan" }, { "title": "Classifying Treatment Responders Under Causal Effect Monotonicity", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4296", "id": "4296", "author": "Nathan Kallus", "abstract": "In the context of individual-level causal inference, we study the problem of predicting whether someone will respond or not to a treatment based on their features and past examples of features, treatment indicator (e.g., drug/no drug), and a binary outcome (e.g., recovery from disease). As a classification task, the problem is made difficult by not knowing the example outcomes under the opposite treatment indicators. We assume the effect is monotonic, as in advertising\u2019s effect on a purchase or bail-setting\u2019s effect on reappearance in court: either it would have happened regardless of treatment, not happened regardless, or happened only depending on exposure to treatment. Predicting whether the latter is latently the case is our focus. While previous work focuses on conditional average treatment effect estimation, formulating the problem as a classification task allows us to develop new tools more suited to this problem. By leveraging monotonicity, we develop new discriminative and generative algorithms for the responder-classification problem. We explore and discuss connections to corrupted data and policy learning. 
We provide an empirical study with both synthetic and real datasets to compare these specialized algorithms to standard benchmarks.", "bibtex": "@InProceedings{pmlr-v97-kallus19a,\n title = \t {Classifying Treatment Responders Under Causal Effect Monotonicity},\n author = {Kallus, Nathan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3201--3210},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kallus19a/kallus19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kallus19a.html},\n abstract = \t {In the context of individual-level causal inference, we study the problem of predicting whether someone will respond or not to a treatment based on their features and past examples of features, treatment indicator (e.g., drug/no drug), and a binary outcome (e.g., recovery from disease). As a classification task, the problem is made difficult by not knowing the example outcomes under the opposite treatment indicators. We assume the effect is monotonic, as in advertising\u2019s effect on a purchase or bail-setting\u2019s effect on reappearance in court: either it would have happened regardless of treatment, not happened regardless, or happened only depending on exposure to treatment. Predicting whether the latter is latently the case is our focus. While previous work focuses on conditional average treatment effect estimation, formulating the problem as a classification task allows us to develop new tools more suited to this problem. By leveraging monotonicity, we develop new discriminative and generative algorithms for the responder-classification problem. We explore and discuss connections to corrupted data and policy learning. We provide an empirical study with both synthetic and real datasets to compare these specialized algorithms to standard benchmarks.}\n}", "pdf": "http://proceedings.mlr.press/v97/kallus19a/kallus19a.pdf", "supp": "", "pdf_size": 5171430, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10851717276916793970&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "School of Operations Research and Information Engineering and Cornell Tech, Cornell University", "aff_domain": "cornell.edu", "email": "cornell.edu", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/kallus19a.html", "aff_unique_index": "0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "School of Operations Research and Information Engineering", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "0", "aff_campus_unique": "Ithaca", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Co-Representation Network for Generalized Zero-Shot Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3811", "id": "3811", "author_site": "Fei Zhang, Guangming Shi", "author": "Fei Zhang; Guangming Shi", "abstract": "Generalized zero-shot learning is a significant topic but is faced with the bias problem, which leads to unseen classes being easily misclassified into seen classes. Hence we propose an embedding model called co-representation network to learn a more uniform visual embedding space that effectively alleviates the bias problem and helps with classification. 
We mathematically analyze our model and find it learns a projection with high local linearity, which is proved to alleviate the bias problem. The network consists of a cooperation module for representation and a relation module for classification; it is simple in structure and can be easily trained in an end-to-end manner. Experiments show that our method outperforms existing generalized zero-shot learning methods on several benchmark datasets.", "bibtex": "@InProceedings{pmlr-v97-zhang19l,\n title = \t {Co-Representation Network for Generalized Zero-Shot Learning},\n author = {Zhang, Fei and Shi, Guangming},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7434--7443},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19l/zhang19l.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19l.html},\n abstract = \t {Generalized zero-shot learning is a significant topic but is faced with the bias problem, which leads to unseen classes being easily misclassified into seen classes. Hence we propose an embedding model called co-representation network to learn a more uniform visual embedding space that effectively alleviates the bias problem and helps with classification. We mathematically analyze our model and find it learns a projection with high local linearity, which is proved to alleviate the bias problem. The network consists of a cooperation module for representation and a relation module for classification; it is simple in structure and can be easily trained in an end-to-end manner. Experiments show that our method outperforms existing generalized zero-shot learning methods on several benchmark datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19l/zhang19l.pdf", "supp": "", "pdf_size": 1191106, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7158886248467537965&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Artificial Intelligence, Xidian University, China; School of Artificial Intelligence, Xidian University, China", "aff_domain": "xidian.edu.cn;xidian.edu.cn", "email": "xidian.edu.cn;xidian.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/zhang19l.html", "aff_unique_index": "0;0", "aff_unique_norm": "Xidian University", "aff_unique_dep": "School of Artificial Intelligence", "aff_unique_url": "http://www.xidian.edu.cn/", "aff_unique_abbr": "Xidian", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Co-manifold learning with missing data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3946", "id": "3946", "author_site": "Gal Mishne, Eric Chi, Ronald Coifman", "author": "Gal Mishne; Eric Chi; Ronald Coifman", "abstract": "Representation learning is typically applied to only one mode of a data matrix, either its rows or columns. Yet in many applications, there is an underlying geometry to both the rows and the columns. We propose utilizing this coupled structure to perform co-manifold learning: uncovering the underlying geometry of both the rows and the columns of a given matrix, where we focus on a missing data setting. Our unsupervised approach consists of three components. 
We first solve a family of optimization problems to estimate a complete matrix at multiple scales of smoothness. We then use this collection of smooth matrix estimates to compute pairwise distances on the rows and columns based on a new multi-scale metric that implicitly introduces a coupling between the rows and the columns. Finally, we construct row and column representations from these multi-scale metrics. We demonstrate that our approach outperforms competing methods in both data visualization and clustering.", "bibtex": "@InProceedings{pmlr-v97-mishne19a,\n title = \t {Co-manifold learning with missing data},\n author = {Mishne, Gal and Chi, Eric and Coifman, Ronald},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4605--4614},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mishne19a/mishne19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mishne19a.html},\n abstract = \t {Representation learning is typically applied to only one mode of a data matrix, either its rows or columns. Yet in many applications, there is an underlying geometry to both the rows and the columns. We propose utilizing this coupled structure to perform co-manifold learning: uncovering the underlying geometry of both the rows and the columns of a given matrix, where we focus on a missing data setting. Our unsupervised approach consists of three components. We first solve a family of optimization problems to estimate a complete matrix at multiple scales of smoothness. We then use this collection of smooth matrix estimates to compute pairwise distances on the rows and columns based on a new multi-scale metric that implicitly introduces a coupling between the rows and the columns. Finally, we construct row and column representations from these multi-scale metrics. 
We demonstrate that our approach outperforms competing methods in both data visualization and clustering.}\n}", "pdf": "http://proceedings.mlr.press/v97/mishne19a/mishne19a.pdf", "supp": "", "pdf_size": 3763359, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17186109268561397667&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Mathematics, Yale University, New Haven, CT, USA; Department of Statistics, North Carolina State University, Raleigh, NC, USA; Department of Mathematics, Yale University, New Haven, CT, USA", "aff_domain": "yale.edu; ; ", "email": "yale.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mishne19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Yale University;North Carolina State University", "aff_unique_dep": "Department of Mathematics;Department of Statistics", "aff_unique_url": "https://www.yale.edu;https://www.ncsu.edu", "aff_unique_abbr": "Yale;NCSU", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "New Haven;Raleigh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "CoT: Cooperative Training for Generative Modeling of Discrete Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3885", "id": "3885", "author_site": "Sidi Lu, Lantao Yu, Siyuan Feng, Yaoming Zhu, Weinan Zhang", "author": "Sidi Lu; Lantao Yu; Siyuan Feng; Yaoming Zhu; Weinan Zhang", "abstract": "In this paper, we study the generative models of sequential discrete data. To tackle the exposure bias problem inherent in maximum likelihood estimation (MLE), generative adversarial networks (GANs) are introduced to penalize the unrealistic generated samples. To exploit the supervision signal from the discriminator, most previous models leverage REINFORCE to address the non-differentiable problem of sequential discrete data. However, because of the unstable property of the training signal during the dynamic process of adversarial training, the effectiveness of REINFORCE, in this case, is hardly guaranteed. To deal with such a problem, we propose a novel approach called Cooperative Training (CoT) to improve the training of sequence generative models. CoT transforms the min-max game of GANs into a joint maximization framework and manages to explicitly estimate and optimize Jensen-Shannon divergence. Moreover, CoT works without the necessity of pre-training via MLE, which is crucial to the success of previous methods. In the experiments, compared to existing state-of-the-art methods, CoT shows superior or at least competitive performance on sample quality, diversity, as well as training stability.", "bibtex": "@InProceedings{pmlr-v97-lu19d,\n title = \t {{C}o{T}: Cooperative Training for Generative Modeling of Discrete Data},\n author = {Lu, Sidi and Yu, Lantao and Feng, Siyuan and Zhu, Yaoming and Zhang, Weinan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4164--4172},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lu19d/lu19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/lu19d.html},\n abstract = \t {In this paper, we study the generative models of sequential discrete data. 
To tackle the exposure bias problem inherent in maximum likelihood estimation (MLE), generative adversarial networks (GANs) are introduced to penalize the unrealistic generated samples. To exploit the supervision signal from the discriminator, most previous models leverage REINFORCE to address the non-differentiable problem of sequential discrete data. However, because of the unstable property of the training signal during the dynamic process of adversarial training, the effectiveness of REINFORCE, in this case, is hardly guaranteed. To deal with such a problem, we propose a novel approach called Cooperative Training (CoT) to improve the training of sequence generative models. CoT transforms the min-max game of GANs into a joint maximization framework and manages to explicitly estimate and optimize Jensen-Shannon divergence. Moreover, CoT works without the necessity of pre-training via MLE, which is crucial to the success of previous methods. In the experiments, compared to existing state-of-the-art methods, CoT shows superior or at least competitive performance on sample quality, diversity, as well as training stability.}\n}", "pdf": "http://proceedings.mlr.press/v97/lu19d/lu19d.pdf", "supp": "", "pdf_size": 1582448, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4231322493080735140&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/lu19d.html" }, { "title": "Cognitive model priors for predicting human decisions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3762", "id": "3762", "author_site": "Joshua C Peterson, David D Bourgin, Daniel Reichman, Thomas Griffiths, Stuart Russell", "author": "David D. Bourgin; Joshua C. Peterson; Daniel Reichman; Stuart J. Russell; Thomas L. Griffiths", "abstract": "Human decision-making underlies all economic behavior. For the past four decades, human decision-making under uncertainty has continued to be explained by theoretical models based on prospect theory, a framework that was awarded the Nobel Prize in Economic Sciences. However, theoretical models of this kind have developed slowly, and robust, high-precision predictive models of human decisions remain a challenge. While machine learning is a natural candidate for solving these problems, it is currently unclear to what extent it can improve predictions obtained by current theories. We argue that this is mainly due to data scarcity, since noisy human behavior requires massive sample sizes to be accurately captured by off-the-shelf machine learning methods. To solve this problem, what is needed are machine learning models with appropriate inductive biases for capturing human behavior, and larger datasets. We offer two contributions towards this end: first, we construct \u201ccognitive model priors\u201d by pretraining neural networks with synthetic data generated by cognitive models (i.e., theoretical models developed by cognitive psychologists). We find that fine-tuning these networks on small datasets of real human decisions results in unprecedented state-of-the-art improvements on two benchmark datasets. Second, we present the first large-scale dataset for human decision-making, containing over 240,000 human judgments across over 13,000 decision problems. 
This dataset reveals the circumstances where cognitive model priors are useful, and provides a new standard for benchmarking prediction of human decisions under uncertainty.", "bibtex": "@InProceedings{pmlr-v97-peterson19a,\n title = \t {Cognitive model priors for predicting human decisions},\n author = {Bourgin, David D. and Peterson, Joshua C. and Reichman, Daniel and Russell, Stuart J. and Griffiths, Thomas L.},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5133--5141},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/peterson19a/peterson19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/peterson19a.html},\n abstract = \t {Human decision-making underlies all economic behavior. For the past four decades, human decision-making under uncertainty has continued to be explained by theoretical models based on prospect theory, a framework that was awarded the Nobel Prize in Economic Sciences. However, theoretical models of this kind have developed slowly, and robust, high-precision predictive models of human decisions remain a challenge. While machine learning is a natural candidate for solving these problems, it is currently unclear to what extent it can improve predictions obtained by current theories. We argue that this is mainly due to data scarcity, since noisy human behavior requires massive sample sizes to be accurately captured by off-the-shelf machine learning methods. To solve this problem, what is needed are machine learning models with appropriate inductive biases for capturing human behavior, and larger datasets. We offer two contributions towards this end: first, we construct \u201ccognitive model priors\u201d by pretraining neural networks with synthetic data generated by cognitive models (i.e., theoretical models developed by cognitive psychologists). We find that fine-tuning these networks on small datasets of real human decisions results in unprecedented state-of-the-art improvements on two benchmark datasets. Second, we present the first large-scale dataset for human decision-making, containing over 240,000 human judgments across over 13,000 decision problems. 
This dataset reveals the circumstances where cognitive model priors are useful, and provides a new standard for benchmarking prediction of human decisions under uncertainty.}\n}", "pdf": "http://proceedings.mlr.press/v97/peterson19a/peterson19a.pdf", "supp": "", "pdf_size": 617654, "gs_citation": 127, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12083739906070424027&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "University of California, Berkeley; Princeton University; Princeton University; University of California, Berkeley; Princeton University", "aff_domain": "gmail.com;gmail.com; ; ; ", "email": "gmail.com;gmail.com; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/peterson19a.html", "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "University of California, Berkeley;Princeton University", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.princeton.edu", "aff_unique_abbr": "UC Berkeley;Princeton", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Collaborative Channel Pruning for Deep Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3988", "id": "3988", "author_site": "Hanyu Peng, Jiaxiang Wu, Shifeng Chen, Junzhou Huang", "author": "Hanyu Peng; Jiaxiang Wu; Shifeng Chen; Junzhou Huang", "abstract": "Deep networks have achieved impressive performance in various domains, but their applications are largely limited by the prohibitive computational overhead. In this paper, we propose a novel algorithm, namely collaborative channel pruning (CCP), to reduce the computational overhead with negligible performance degradation. The joint impact of pruned/preserved channels on the loss function is quantitatively analyzed, and such interchannel dependency is exploited to determine which channels to prune. The channel selection problem is then reformulated as a constrained 0-1 quadratic optimization problem, and the Hessian matrix, which is essential in constructing the above optimization, can be efficiently approximated. Empirical evaluation on two benchmark data sets indicates that our proposed CCP algorithm achieves higher classification accuracy with similar computational complexity compared to other state-of-the-art channel pruning algorithms.", "bibtex": "@InProceedings{pmlr-v97-peng19c,\n title = \t {Collaborative Channel Pruning for Deep Networks},\n author = {Peng, Hanyu and Wu, Jiaxiang and Chen, Shifeng and Huang, Junzhou},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5113--5122},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/peng19c/peng19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/peng19c.html},\n abstract = \t {Deep networks have achieved impressive performance in various domains, but their applications are largely limited by the prohibitive computational overhead. In this paper, we propose a novel algorithm, namely collaborative channel pruning (CCP), to reduce the computational overhead with negligible performance degradation. 
The joint impact of pruned/preserved channels on the loss function is quantitatively analyzed, and such interchannel dependency is exploited to determine which channels to prune. The channel selection problem is then reformulated as a constrained 0-1 quadratic optimization problem, and the Hessian matrix, which is essential in constructing the above optimization, can be efficiently approximated. Empirical evaluation on two benchmark data sets indicates that our proposed CCP algorithm achieves higher classification accuracy with similar computational complexity compared to other state-of-the-art channel pruning algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/peng19c/peng19c.pdf", "supp": "", "pdf_size": 708819, "gs_citation": 198, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7940639983444673727&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Multimedia Laboratory, Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences + Tencent AI Lab; Tencent AI Lab; Multimedia Laboratory, Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences; Department of CSE, The University of Texas at Arlington", "aff_domain": "siat.ac.cn;tencent.com;siat.ac.cn;uta.edu", "email": "siat.ac.cn;tencent.com;siat.ac.cn;uta.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/peng19c.html", "aff_unique_index": "0+1;1;0;2", "aff_unique_norm": "Chinese Academy of Sciences;Tencent;University of Texas at Arlington", "aff_unique_dep": "Multimedia Laboratory;Tencent AI Lab;Department of CSE", "aff_unique_url": "http://www.cas.cn/;https://ai.tencent.com;https://www.uta.edu", "aff_unique_abbr": "CAS;Tencent AI Lab;UTA", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Shenzhen;;Arlington", "aff_country_unique_index": "0+0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Collaborative Evolutionary Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3992", "id": "3992", "author_site": "Shauharda Khadka, Somdeb Majumdar, Tarek Nassar, Zach Dwiel, Evren Tumer, Santiago Miret, Yinyin Liu, Kagan Tumer", "author": "Shauharda Khadka; Somdeb Majumdar; Tarek Nassar; Zach Dwiel; Evren Tumer; Santiago Miret; Yinyin Liu; Kagan Tumer", "abstract": "Deep reinforcement learning algorithms have been successfully applied to a range of challenging control tasks. However, these methods typically struggle with achieving effective exploration and are extremely sensitive to the choice of hyperparameters. One reason is that most approaches use a noisy version of their operating policy to explore - thereby limiting the range of exploration. In this paper, we introduce Collaborative Evolutionary Reinforcement Learning (CERL), a scalable framework that comprises a portfolio of policies that simultaneously explore and exploit diverse regions of the solution space. A collection of learners - typically proven algorithms like TD3 - optimize over varying time-horizons leading to this diverse portfolio. All learners contribute to and use a shared replay buffer to achieve greater sample efficiency. Computational resources are dynamically distributed to favor the best learners as a form of online algorithm selection. Neuroevolution binds this entire process to generate a single emergent learner that exceeds the capabilities of any individual learner. 
Experiments in a range of continuous control benchmarks demonstrate that the emergent learner significantly outperforms its composite learners while remaining overall more sample-efficient - notably solving the Mujoco Humanoid benchmark where all of its composite learners (TD3) fail entirely in isolation.", "bibtex": "@InProceedings{pmlr-v97-khadka19a,\n title = \t {Collaborative Evolutionary Reinforcement Learning},\n author = {Khadka, Shauharda and Majumdar, Somdeb and Nassar, Tarek and Dwiel, Zach and Tumer, Evren and Miret, Santiago and Liu, Yinyin and Tumer, Kagan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3341--3350},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/khadka19a/khadka19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/khadka19a.html},\n abstract = \t {Deep reinforcement learning algorithms have been successfully applied to a range of challenging control tasks. However, these methods typically struggle with achieving effective exploration and are extremely sensitive to the choice of hyperparameters. One reason is that most approaches use a noisy version of their operating policy to explore - thereby limiting the range of exploration. In this paper, we introduce Collaborative Evolutionary Reinforcement Learning (CERL), a scalable framework that comprises a portfolio of policies that simultaneously explore and exploit diverse regions of the solution space. A collection of learners - typically proven algorithms like TD3 - optimize over varying time-horizons leading to this diverse portfolio. All learners contribute to and use a shared replay buffer to achieve greater sample efficiency. Computational resources are dynamically distributed to favor the best learners as a form of online algorithm selection. Neuroevolution binds this entire process to generate a single emergent learner that exceeds the capabilities of any individual learner. 
Experiments in a range of continuous control benchmarks demonstrate that the emergent learner significantly outperforms its composite learners while remaining overall more sample-efficient - notably solving the Mujoco Humanoid benchmark where all of its composite learners (TD3) fail entirely in isolation.}\n}", "pdf": "http://proceedings.mlr.press/v97/khadka19a/khadka19a.pdf", "supp": "", "pdf_size": 887817, "gs_citation": 157, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17431562445096471732&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Intel AI Lab + Collaborative Robotics and Intelligent Systems Institute, Oregon State University; Intel AI Lab; Intel AI Lab; Intel AI Lab; Intel AI Lab; Intel AI Lab; Intel AI Lab; Collaborative Robotics and Intelligent Systems Institute, Oregon State University", "aff_domain": "intel.com;intel.com; ; ; ; ; ; ", "email": "intel.com;intel.com; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/khadka19a.html", "aff_unique_index": "0+1;0;0;0;0;0;0;1", "aff_unique_norm": "Intel;Oregon State University", "aff_unique_dep": "Intel AI Lab;Collaborative Robotics and Intelligent Systems Institute", "aff_unique_url": "https://www.intel.com;https://oregonstate.edu", "aff_unique_abbr": "Intel;OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Collective Model Fusion for Multiple Black-Box Experts", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3952", "id": "3952", "author_site": "Minh Hoang, Nghia Hoang, Bryan Kian Hsiang Low, Carleton Kingsford", "author": "Minh Hoang; Nghia Hoang; Bryan Kian Hsiang Low; Carleton Kingsford", "abstract": "Model fusion is a fundamental problem in collective machine learning (ML) where independent experts with heterogeneous learning architectures are required to combine expertise to improve predictive performance. This is particularly challenging in information-sensitive domains where experts do not have access to each other\u2019s internal architecture and local data. This paper presents the first collective model fusion framework for multiple experts with heterogeneous black-box architectures. The proposed method will enable this by addressing the key issues of how black-box experts interact to understand the predictive behaviors of one another; how these understandings can be represented and shared efficiently among themselves; and how the shared understandings can be combined to generate high-quality consensus prediction. 
The performance of the resulting framework is analyzed theoretically and demonstrated empirically on several datasets.", "bibtex": "@InProceedings{pmlr-v97-hoang19a,\n title = \t {Collective Model Fusion for Multiple Black-Box Experts},\n author = {Hoang, Minh and Hoang, Nghia and Low, Bryan Kian Hsiang and Kingsford, Carleton},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2742--2750},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hoang19a/hoang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hoang19a.html},\n abstract = \t {Model fusion is a fundamental problem in collective machine learning (ML) where independent experts with heterogeneous learning architectures are required to combine expertise to improve predictive performance. This is particularly challenging in information-sensitive domains where experts do not have access to each other\u2019s internal architecture and local data. This paper presents the first collective model fusion framework for multiple experts with heterogeneous black-box architectures. The proposed method will enable this by addressing the key issues of how black-box experts interact to understand the predictive behaviors of one another; how these understandings can be represented and shared efficiently among themselves; and how the shared understandings can be combined to generate high-quality consensus prediction. The performance of the resulting framework is analyzed theoretically and demonstrated empirically on several datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/hoang19a/hoang19a.pdf", "supp": "", "pdf_size": 499445, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13434682310214292157&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Carnegie Mellon University; MIT-IBM Watson AI Lab, IBM Research Cambridge; National University of Singapore; Carnegie Mellon University", "aff_domain": "andrew.cmu.edu; ; ; ", "email": "andrew.cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/hoang19a.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Carnegie Mellon University;IBM;National University of Singapore", "aff_unique_dep": ";AI Lab;", "aff_unique_url": "https://www.cmu.edu;https://www.ibmwatsonai.org/;https://www.nus.edu.sg", "aff_unique_abbr": "CMU;MIT-IBM AI Lab;NUS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Singapore" }, { "title": "Combating Label Noise in Deep Learning using Abstention", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4141", "id": "4141", "author_site": "Sunil Thulasidasan, Tanmoy Bhattacharya, Jeff Bilmes, Gopinath Chennupati, Jamal Mohd-Yusof", "author": "Sunil Thulasidasan; Tanmoy Bhattacharya; Jeff Bilmes; Gopinath Chennupati; Jamal Mohd-Yusof", "abstract": "We introduce a novel method to combat label noise when training deep neural networks for classification. We propose a loss function that permits abstention during training thereby allowing the DNN to abstain on confusing samples while continuing to learn and improve classification performance on the non-abstained samples. 
We show how such a deep abstaining classifier (DAC) can be used for robust learning in the presence of different types of label noise. In the case of structured or systematic label noise \u2013 where noisy training labels or confusing examples are correlated with underlying features of the data \u2013 training with abstention enables representation learning for features that are associated with unreliable labels. In the case of unstructured (arbitrary) label noise, abstention during training enables the DAC to be used as an effective data cleaner by identifying samples that are likely to have label noise. We provide analytical results on the loss function behavior that enable dynamic adaption of abstention rates based on learning progress during training. We demonstrate the utility of the deep abstaining classifier for various image classification tasks under different types of label noise; in the case of arbitrary label noise, we show significant improvements over previously published results on multiple image benchmarks.", "bibtex": "@InProceedings{pmlr-v97-thulasidasan19a,\n title = \t {Combating Label Noise in Deep Learning using Abstention},\n author = {Thulasidasan, Sunil and Bhattacharya, Tanmoy and Bilmes, Jeff and Chennupati, Gopinath and Mohd-Yusof, Jamal},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6234--6243},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/thulasidasan19a/thulasidasan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/thulasidasan19a.html},\n abstract = \t {We introduce a novel method to combat label noise when training deep neural networks for classification. We propose a loss function that permits abstention during training thereby allowing the DNN to abstain on confusing samples while continuing to learn and improve classification performance on the non-abstained samples. We show how such a deep abstaining classifier (DAC) can be used for robust learning in the presence of different types of label noise. In the case of structured or systematic label noise \u2013 where noisy training labels or confusing examples are correlated with underlying features of the data \u2013 training with abstention enables representation learning for features that are associated with unreliable labels. In the case of unstructured (arbitrary) label noise, abstention during training enables the DAC to be used as an effective data cleaner by identifying samples that are likely to have label noise. We provide analytical results on the loss function behavior that enable dynamic adaption of abstention rates based on learning progress during training. 
We demonstrate the utility of the deep abstaining classifier for various image classification tasks under different types of label noise; in the case of arbitrary label noise, we show significant improvements over previously published results on multiple image benchmarks.}\n}", "pdf": "http://proceedings.mlr.press/v97/thulasidasan19a/thulasidasan19a.pdf", "supp": "", "pdf_size": 7178367, "gs_citation": 228, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13352196764325122860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Los Alamos National Laboratory; Los Alamos National Laboratory; Department of Electrical & Computer Engineering, University of Washington; Los Alamos National Laboratory; Los Alamos National Laboratory", "aff_domain": "lanl.gov; ; ; ; ", "email": "lanl.gov; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/thulasidasan19a.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Los Alamos National Laboratory;University of Washington", "aff_unique_dep": ";Department of Electrical & Computer Engineering", "aff_unique_url": "https://www.lanl.gov;https://www.washington.edu", "aff_unique_abbr": "LANL;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Combining parametric and nonparametric models for off-policy evaluation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4093", "id": "4093", "author_site": "Omer Gottesman, Yao Liu, Scott Sussex, Emma Brunskill, Finale Doshi-Velez", "author": "Omer Gottesman; Yao Liu; Scott Sussex; Emma Brunskill; Finale Doshi-Velez", "abstract": "We consider a model-based approach to perform batch off-policy evaluation in reinforcement learning. Our method takes a mixture-of-experts approach to combine parametric and non-parametric models of the environment such that the final value estimate has the least expected error. 
We do so by first estimating the local accuracy of each model and then using a planner to select which model to use at every time step as to minimize the return error estimate along entire trajectories. Across a variety of domains, our mixture-based approach outperforms the individual models alone as well as state-of-the-art importance sampling-based estimators.}\n}", "pdf": "http://proceedings.mlr.press/v97/gottesman19a/gottesman19a.pdf", "supp": "", "pdf_size": 824311, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5066391292071299163&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "Harvard University; Stanford University; Harvard University; Stanford University; Harvard University", "aff_domain": "fas.harvard.edu; ; ; ; ", "email": "fas.harvard.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/gottesman19a.html", "aff_unique_index": "0;1;0;1;0", "aff_unique_norm": "Harvard University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.harvard.edu;https://www.stanford.edu", "aff_unique_abbr": "Harvard;Stanford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Communication Complexity in Locally Private Distribution Estimation and Heavy Hitters", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3570", "id": "3570", "author_site": "Jayadev Acharya, Ziteng Sun", "author": "Jayadev Acharya; Ziteng Sun", "abstract": "We consider the problems of distribution estimation, and heavy hitter (frequency) estimation under privacy, and communication constraints. While the constraints have been studied separately, optimal schemes for one are sub-optimal for the other. We propose a sample-optimal $\\eps$-locally differentially private (LDP) scheme for distribution estimation, where each user communicates one bit, and requires", "bibtex": "@InProceedings{pmlr-v97-acharya19c,\n title = \t {Communication Complexity in Locally Private Distribution Estimation and Heavy Hitters},\n author = {Acharya, Jayadev and Sun, Ziteng},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {51--60},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/acharya19c/acharya19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/acharya19c.html},\n abstract = \t {We consider the problems of distribution estimation, and heavy hitter (frequency) estimation under privacy, and communication constraints. While the constraints have been studied separately, optimal schemes for one are sub-optimal for the other. 
We propose a sample-optimal $\\eps$-locally differentially private (LDP) scheme for distribution estimation, where each user communicates one bit, and requires", "pdf": "http://proceedings.mlr.press/v97/acharya19c/acharya19c.pdf", "supp": "", "pdf_size": 1263624, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10911375228806552321&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Electrical and Computer Engineering, Cornell University; School of Electrical and Computer Engineering, Cornell University", "aff_domain": "cornell.edu; ", "email": "cornell.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/acharya19c.html", "aff_unique_index": "0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "School of Electrical and Computer Engineering", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ithaca", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Communication-Constrained Inference and the Role of Shared Randomness", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3569", "id": "3569", "author_site": "Jayadev Acharya, Cl\u00e9ment Canonne, Himanshu Tyagi", "author": "Jayadev Acharya; Clement Canonne; Himanshu Tyagi", "abstract": "A central server needs to perform statistical inference based on samples that are distributed over multiple users who can each send a message of limited length to the center. We study problems of distribution learning and identity testing in this distributed inference setting and examine the role of shared randomness as a resource. We propose a general purpose", "bibtex": "@InProceedings{pmlr-v97-acharya19a,\n title = \t {Communication-Constrained Inference and the Role of Shared Randomness},\n author = {Acharya, Jayadev and Canonne, Clement and Tyagi, Himanshu},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {30--39},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/acharya19a/acharya19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/acharya19a.html},\n abstract = \t {A central server needs to perform statistical inference based on samples that are distributed over multiple users who can each send a message of limited length to the center. We study problems of distribution learning and identity testing in this distributed inference setting and examine the role of shared randomness as a resource. 
We propose a general purpose", "pdf": "http://proceedings.mlr.press/v97/acharya19a/acharya19a.pdf", "supp": "", "pdf_size": 421927, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12350198857620197679&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Cornell University; Stanford University; Indian Institute of Science Institute of Technology", "aff_domain": "cs.stanford.edu; ; ", "email": "cs.stanford.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/acharya19a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Cornell University;Stanford University;Indian Institute of Science", "aff_unique_dep": ";;Institute of Technology", "aff_unique_url": "https://www.cornell.edu;https://www.stanford.edu;https://www.iisc.ac.in", "aff_unique_abbr": "Cornell;Stanford;IISc", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;India" }, { "title": "CompILE: Compositional Imitation Learning and Execution", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4035", "id": "4035", "author_site": "Thomas Kipf, Yujia Li, Hanjun Dai, Vinicius Zambaldi, Alvaro Sanchez-Gonzalez, Edward Grefenstette, Pushmeet Kohli, Peter Battaglia", "author": "Thomas Kipf; Yujia Li; Hanjun Dai; Vinicius Zambaldi; Alvaro Sanchez-Gonzalez; Edward Grefenstette; Pushmeet Kohli; Peter Battaglia", "abstract": "We introduce Compositional Imitation Learning and Execution (CompILE): a framework for learning reusable, variable-length segments of hierarchically-structured behavior from demonstration data. CompILE uses a novel unsupervised, fully-differentiable sequence segmentation module to learn latent encodings of sequential data that can be re-composed and executed to perform new tasks. Once trained, our model generalizes to sequences of longer length and from environment instances not seen during training. We evaluate CompILE in a challenging 2D multi-task environment and a continuous control task, and show that it can find correct task boundaries and event encodings in an unsupervised manner. Latent codes and associated behavior policies discovered by CompILE can be used by a hierarchical agent, where the high-level policy selects actions in the latent code space, and the low-level, task-specific policies are simply the learned decoders. We found that our CompILE-based agent could learn given only sparse rewards, where agents without task-specific policies struggle.", "bibtex": "@InProceedings{pmlr-v97-kipf19a,\n title = \t {{C}omp{ILE}: Compositional Imitation Learning and Execution},\n author = {Kipf, Thomas and Li, Yujia and Dai, Hanjun and Zambaldi, Vinicius and Sanchez-Gonzalez, Alvaro and Grefenstette, Edward and Kohli, Pushmeet and Battaglia, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3418--3428},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kipf19a/kipf19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kipf19a.html},\n abstract = \t {We introduce Compositional Imitation Learning and Execution (CompILE): a framework for learning reusable, variable-length segments of hierarchically-structured behavior from demonstration data. 
CompILE uses a novel unsupervised, fully-differentiable sequence segmentation module to learn latent encodings of sequential data that can be re-composed and executed to perform new tasks. Once trained, our model generalizes to sequences of longer length and from environment instances not seen during training. We evaluate CompILE in a challenging 2D multi-task environment and a continuous control task, and show that it can find correct task boundaries and event encodings in an unsupervised manner. Latent codes and associated behavior policies discovered by CompILE can be used by a hierarchical agent, where the high-level policy selects actions in the latent code space, and the low-level, task-specific policies are simply the learned decoders. We found that our CompILE-based agent could learn given only sparse rewards, where agents without task-specific policies struggle.}\n}", "pdf": "http://proceedings.mlr.press/v97/kipf19a/kipf19a.pdf", "supp": "", "pdf_size": 2437507, "gs_citation": 163, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12302759254570528216&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "email": ";;;;;;;", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/kipf19a.html" }, { "title": "Competing Against Nash Equilibria in Adversarially Changing Zero-Sum Games", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3866", "id": "3866", "author_site": "Adrian Rivera Cardoso, Jacob Abernethy, He Wang, Huan Xu", "author": "Adrian Rivera Cardoso; Jacob Abernethy; He Wang; Huan Xu", "abstract": "We study the problem of repeated play in a zero-sum game in which the payoff matrix may change, in a possibly adversarial fashion, on each round; we call these Online Matrix Games. Finding the Nash Equilibrium (NE) of a two player zero-sum game is core to many problems in statistics, optimization, and economics, and for a fixed game matrix this can be easily reduced to solving a linear program. But when the payoff matrix evolves over time our goal is to find a sequential algorithm that can compete with, in a certain sense, the NE of the long-term-averaged payoff matrix. We design an algorithm with small NE regret\u2013that is, we ensure that the long-term payoff of both players is close to minimax optimum in hindsight. Our algorithm achieves near-optimal dependence with respect to the number of rounds and depends poly-logarithmically on the number of available actions of the players. Additionally, we show that the naive reduction, where each player simply minimizes its own regret, fails to achieve the stated objective regardless of which algorithm is used. 
Lastly, we consider the so-called bandit setting, where the feedback is significantly limited, and we provide an algorithm with small NE regret using one-point estimates of each payoff matrix.", "bibtex": "@InProceedings{pmlr-v97-cardoso19a,\n title = \t {Competing Against {N}ash Equilibria in Adversarially Changing Zero-Sum Games},\n author = {Cardoso, Adrian Rivera and Abernethy, Jacob and Wang, He and Xu, Huan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {921--930},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cardoso19a/cardoso19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cardoso19a.html},\n abstract = \t {We study the problem of repeated play in a zero-sum game in which the payoff matrix may change, in a possibly adversarial fashion, on each round; we call these Online Matrix Games. Finding the Nash Equilibrium (NE) of a two player zero-sum game is core to many problems in statistics, optimization, and economics, and for a fixed game matrix this can be easily reduced to solving a linear program. But when the payoff matrix evolves over time our goal is to find a sequential algorithm that can compete with, in a certain sense, the NE of the long-term-averaged payoff matrix. We design an algorithm with small NE regret\u2013that is, we ensure that the long-term payoff of both players is close to minimax optimum in hindsight. Our algorithm achieves near-optimal dependence with respect to the number of rounds and depends poly-logarithmically on the number of available actions of the players. Additionally, we show that the naive reduction, where each player simply minimizes its own regret, fails to achieve the stated objective regardless of which algorithm is used. 
Lastly, we consider the so-called bandit setting, where the feedback is significantly limited, and we provide an algorithm with small NE regret using one-point estimates of each payoff matrix.}\n}", "pdf": "http://proceedings.mlr.press/v97/cardoso19a/cardoso19a.pdf", "supp": "", "pdf_size": 283933, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13686052968515324302&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Industrial and Systems Engineering, Georgia Institute of Technology, GA, USA; Department of Computer Science, Georgia Institute of Technology, GA, USA; Department of Industrial and Systems Engineering, Georgia Institute of Technology, GA, USA; Department of Industrial and Systems Engineering, Georgia Institute of Technology, GA, USA", "aff_domain": "gatech.edu; ; ; ", "email": "gatech.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/cardoso19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "Department of Industrial and Systems Engineering", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Georgia", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Complementary-Label Learning for Arbitrary Losses and Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4046", "id": "4046", "author_site": "Takashi Ishida, Gang Niu, Aditya Menon, Masashi Sugiyama", "author": "Takashi Ishida; Gang Niu; Aditya Menon; Masashi Sugiyama", "abstract": "In contrast to the standard classification paradigm where the true class is given to each training pattern, complementary-label learning only uses training patterns each equipped with a complementary label, which only specifies one of the classes that the pattern does not belong to. The goal of this paper is to derive a novel framework of complementary-label learning with an unbiased estimator of the classification risk, for arbitrary losses and models\u2014all existing methods have failed to achieve this goal. Not only is this beneficial for the learning stage, it also makes model/hyper-parameter selection (through cross-validation) possible without the need of any ordinarily labeled validation data, while using any linear/non-linear models or convex/non-convex loss functions. 
We further improve the risk estimator by a non-negative correction and gradient ascent trick, and demonstrate its superiority through experiments.", "bibtex": "@InProceedings{pmlr-v97-ishida19a,\n title = \t {Complementary-Label Learning for Arbitrary Losses and Models},\n author = {Ishida, Takashi and Niu, Gang and Menon, Aditya and Sugiyama, Masashi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2971--2980},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ishida19a/ishida19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ishida19a.html},\n abstract = \t {In contrast to the standard classification paradigm where the true class is given to each training pattern, complementary-label learning only uses training patterns each equipped with a complementary label, which only specifies one of the classes that the pattern does not belong to. The goal of this paper is to derive a novel framework of complementary-label learning with an unbiased estimator of the classification risk, for arbitrary losses and models\u2014all existing methods have failed to achieve this goal. Not only is this beneficial for the learning stage, it also makes model/hyper-parameter selection (through cross-validation) possible without the need of any ordinarily labeled validation data, while using any linear/non-linear models or convex/non-convex loss functions. We further improve the risk estimator by a non-negative correction and gradient ascent trick, and demonstrate its superiority through experiments.}\n}", "pdf": "http://proceedings.mlr.press/v97/ishida19a/ishida19a.pdf", "supp": "", "pdf_size": 1762974, "gs_citation": 129, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4663196775584030091&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "The University of Tokyo+RIKEN; RIKEN; Google Research; The University of Tokyo+RIKEN", "aff_domain": "ms.k.u-tokyo.ac.jp; ; ; ", "email": "ms.k.u-tokyo.ac.jp; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/ishida19a.html", "aff_unique_index": "0+1;1;2;0+1", "aff_unique_norm": "University of Tokyo;RIKEN;Google", "aff_unique_dep": ";;Google Research", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.riken.jp;https://research.google", "aff_unique_abbr": "UTokyo;RIKEN;Google Research", "aff_campus_unique_index": ";1;", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0;1;0+0", "aff_country_unique": "Japan;United States" }, { "title": "Complexity of Linear Regions in Deep Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3567", "id": "3567", "author_site": "Boris Hanin, David Rolnick", "author": "Boris Hanin; David Rolnick", "abstract": "It is well-known that the expressivity of a neural network depends on its architecture, with deeper networks expressing more complex functions. In the case of networks that compute piecewise linear functions, such as those with ReLU activation, the number of distinct linear regions is a natural measure of expressivity. 
It is possible to construct networks with merely a single region, or for which the number of linear regions grows exponentially with depth; it is not clear where within this range most networks fall in practice, either before or after training. In this paper, we provide a mathematical framework to count the number of linear regions of a piecewise linear network and measure the volume of the boundaries between these regions. In particular, we prove that for networks at initialization, the average number of regions along any one-dimensional subspace grows linearly in the total number of neurons, far below the exponential upper bound. We also find that the average distance to the nearest region boundary at initialization scales like the inverse of the number of neurons. Our theory suggests that, even after training, the number of linear regions is far below exponential, an intuition that matches our empirical observations. We conclude that the practical expressivity of neural networks is likely far below that of the theoretical maximum, and that this gap can be quantified.", "bibtex": "@InProceedings{pmlr-v97-hanin19a,\n title = \t {Complexity of Linear Regions in Deep Networks},\n author = {Hanin, Boris and Rolnick, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2596--2604},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hanin19a/hanin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hanin19a.html},\n abstract = \t {It is well-known that the expressivity of a neural network depends on its architecture, with deeper networks expressing more complex functions. In the case of networks that compute piecewise linear functions, such as those with ReLU activation, the number of distinct linear regions is a natural measure of expressivity. It is possible to construct networks with merely a single region, or for which the number of linear regions grows exponentially with depth; it is not clear where within this range most networks fall in practice, either before or after training. In this paper, we provide a mathematical framework to count the number of linear regions of a piecewise linear network and measure the volume of the boundaries between these regions. In particular, we prove that for networks at initialization, the average number of regions along any one-dimensional subspace grows linearly in the total number of neurons, far below the exponential upper bound. We also find that the average distance to the nearest region boundary at initialization scales like the inverse of the number of neurons. Our theory suggests that, even after training, the number of linear regions is far below exponential, an intuition that matches our empirical observations. 
We conclude that the practical expressivity of neural networks is likely far below that of the theoretical maximum, and that this gap can be quantified.}\n}", "pdf": "http://proceedings.mlr.press/v97/hanin19a/hanin19a.pdf", "supp": "", "pdf_size": 2168225, "gs_citation": 292, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6572900177737298614&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Mathematics, Texas A&M University + Facebook AI Research, New York; University of Pennsylvania", "aff_domain": "tamu.edu;seas.upenn.edu", "email": "tamu.edu;seas.upenn.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/hanin19a.html", "aff_unique_index": "0+1;2", "aff_unique_norm": "Texas A&M University;Meta;University of Pennsylvania", "aff_unique_dep": "Department of Mathematics;Facebook AI Research;", "aff_unique_url": "https://www.tamu.edu;https://research.facebook.com;https://www.upenn.edu", "aff_unique_abbr": "TAMU;FAIR;UPenn", "aff_campus_unique_index": "1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United States" }, { "title": "Composable Core-sets for Determinant Maximization: A Simple Near-Optimal Algorithm", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3957", "id": "3957", "author_site": "Sepideh Mahabadi, Piotr Indyk, Shayan Oveis Gharan, Alireza Rezaei", "author": "Sepideh Mahabadi; Piotr Indyk; Shayan Oveis Gharan; Alireza Rezaei", "abstract": "\u201cComposable core-sets\u201d are an efficient framework for solving optimization problems in massive data models. In this work, we consider efficient construction of composable core-sets for the determinant maximization problem. This can also be cast as the MAP inference task for \u201cdeterminantal point processes\", that have recently gained a lot of interest for modeling diversity and fairness. The problem was recently studied in \\cite{indyk2018composable}, where they designed composable core-sets with the optimal approximation bound of $O(k)^k$. On the other hand, the more practical \u201cGreedy\" algorithm has been previously used in similar contexts. In this work, first we provide a theoretical approximation guarantee of $C^{k^2}$ for the Greedy algorithm in the context of composable core-sets; Further, we propose to use a \u201cLocal Search\" based algorithm that while being still practical, achieves a nearly optimal approximation bound of $O(k)^{2k}$; Finally, we implement all three algorithms and show the effectiveness of our proposed algorithm on standard data sets.", "bibtex": "@InProceedings{pmlr-v97-mahabadi19a,\n title = \t {Composable Core-sets for Determinant Maximization: A Simple Near-Optimal Algorithm},\n author = {Mahabadi, Sepideh and Indyk, Piotr and Gharan, Shayan Oveis and Rezaei, Alireza},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4254--4263},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mahabadi19a/mahabadi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mahabadi19a.html},\n abstract = \t {\u201cComposable core-sets\u201d are an efficient framework for solving optimization problems in massive data models. 
In this work, we consider efficient construction of composable core-sets for the determinant maximization problem. This can also be cast as the MAP inference task for \u201cdeterminantal point processes\", that have recently gained a lot of interest for modeling diversity and fairness. The problem was recently studied in \\cite{indyk2018composable}, where they designed composable core-sets with the optimal approximation bound of $O(k)^k$. On the other hand, the more practical \u201cGreedy\" algorithm has been previously used in similar contexts. In this work, first we provide a theoretical approximation guarantee of $C^{k^2}$ for the Greedy algorithm in the context of composable core-sets; Further, we propose to use a \u201cLocal Search\" based algorithm that while being still practical, achieves a nearly optimal approximation bound of $O(k)^{2k}$; Finally, we implement all three algorithms and show the effectiveness of our proposed algorithm on standard data sets.}\n}", "pdf": "http://proceedings.mlr.press/v97/mahabadi19a/mahabadi19a.pdf", "supp": "", "pdf_size": 534787, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1626275717015575304&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, Cambridge, Massachusetts, USA; Toyota Technological Institute at Chicago, Chicago, Illinois, USA; Department of Computer Science, University of Washington, Seattle, USA; Department of Computer Science, University of Washington, Seattle, USA", "aff_domain": "mit.edu;ttic.edu;cs.washington.edu;cs.washington.edu", "email": "mit.edu;ttic.edu;cs.washington.edu;cs.washington.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/mahabadi19a.html", "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Massachusetts Institute of Technology;Toyota Technological Institute at Chicago;University of Washington", "aff_unique_dep": "Department of Electrical Engineering and Computer Science;;Department of Computer Science", "aff_unique_url": "https://web.mit.edu;https://www.tti-chicago.org;https://www.washington.edu", "aff_unique_abbr": "MIT;TTI Chicago;UW", "aff_campus_unique_index": "0;1;2;2", "aff_campus_unique": "Cambridge;Chicago;Seattle", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Composing Entropic Policies using Divergence Correction", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3678", "id": "3678", "author_site": "Jonathan Hunt, Andre Barreto, Timothy Lillicrap, Nicolas Heess", "author": "Jonathan Hunt; Andre Barreto; Timothy Lillicrap; Nicolas Heess", "abstract": "Composing skills mastered in one task to solve novel tasks promises dramatic improvements in the data efficiency of reinforcement learning. Here, we analyze two recent works composing behaviors represented in the form of action-value functions and show that they perform poorly in some situations. As part of this analysis, we extend an important generalization of policy improvement to the maximum entropy framework and introduce an algorithm for the practical implementation of successor features in continuous action spaces. Then we propose a novel approach which addresses the failure cases of prior work and, in principle, recovers the optimal policy during transfer. This method works by explicitly learning the (discounted, future) divergence between base policies. 
We study this approach in the tabular case and on non-trivial continuous control problems with compositional structure and show that it outperforms or matches existing methods across all tasks considered.", "bibtex": "@InProceedings{pmlr-v97-hunt19a,\n title = \t {Composing Entropic Policies using Divergence Correction},\n author = {Hunt, Jonathan and Barreto, Andre and Lillicrap, Timothy and Heess, Nicolas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2911--2920},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hunt19a/hunt19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hunt19a.html},\n abstract = \t {Composing skills mastered in one task to solve novel tasks promises dramatic improvements in the data efficiency of reinforcement learning. Here, we analyze two recent works composing behaviors represented in the form of action-value functions and show that they perform poorly in some situations. As part of this analysis, we extend an important generalization of policy improvement to the maximum entropy framework and introduce an algorithm for the practical implementation of successor features in continuous action spaces. Then we propose a novel approach which addresses the failure cases of prior work and, in principle, recovers the optimal policy during transfer. This method works by explicitly learning the (discounted, future) divergence between base policies. We study this approach in the tabular case and on non-trivial continuous control problems with compositional structure and show that it outperforms or matches existing methods across all tasks considered.}\n}", "pdf": "http://proceedings.mlr.press/v97/hunt19a/hunt19a.pdf", "supp": "", "pdf_size": 2040823, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15842209432999245153&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "DeepMind; DeepMind; DeepMind; DeepMind", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/hunt19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Composing Value Functions in Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3993", "id": "3993", "author_site": "Benjamin van Niekerk, Steven James, Adam Earle, Benjamin Rosman", "author": "Benjamin Van Niekerk; Steven James; Adam Earle; Benjamin Rosman", "abstract": "An important property for lifelong-learning agents is the ability to combine existing skills to solve new unseen tasks. In general, however, it is unclear how to compose existing skills in a principled manner. Under the assumption of deterministic dynamics, we prove that optimal value function composition can be achieved in entropy-regularised reinforcement learning (RL), and extend this result to the standard RL setting. 
Composition is demonstrated in a high-dimensional video game, where an agent with an existing library of skills is immediately able to solve new tasks without the need for further learning.", "bibtex": "@InProceedings{pmlr-v97-van-niekerk19a,\n title = \t {Composing Value Functions in Reinforcement Learning},\n author = {Van Niekerk, Benjamin and James, Steven and Earle, Adam and Rosman, Benjamin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6401--6409},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/van-niekerk19a/van-niekerk19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/van-niekerk19a.html},\n abstract = \t {An important property for lifelong-learning agents is the ability to combine existing skills to solve new unseen tasks. In general, however, it is unclear how to compose existing skills in a principled manner. Under the assumption of deterministic dynamics, we prove that optimal value function composition can be achieved in entropy-regularised reinforcement learning (RL), and extend this result to the standard RL setting. Composition is demonstrated in a high-dimensional video game, where an agent with an existing library of skills is immediately able to solve new tasks without the need for further learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/van-niekerk19a/van-niekerk19a.pdf", "supp": "", "pdf_size": 2483374, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17114310425187268734&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "School of Computer Science and Applied Mathematics, University of the Witwatersrand, Johannesburg, South Africa+Council for Scientific and Industrial Research, Pretoria, South Africa; School of Computer Science and Applied Mathematics, University of the Witwatersrand, Johannesburg, South Africa+Council for Scientific and Industrial Research, Pretoria, South Africa; School of Computer Science and Applied Mathematics, University of the Witwatersrand, Johannesburg, South Africa; School of Computer Science and Applied Mathematics, University of the Witwatersrand, Johannesburg, South Africa+Council for Scientific and Industrial Research, Pretoria, South Africa", "aff_domain": "students.wits.ac.za;wits.ac.za; ; ", "email": "students.wits.ac.za;wits.ac.za; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/van-niekerk19a.html", "aff_unique_index": "0+1;0+1;0;0+1", "aff_unique_norm": "University of the Witwatersrand;Council for Scientific and Industrial Research", "aff_unique_dep": "School of Computer Science and Applied Mathematics;", "aff_unique_url": "https://www.wits.ac.za;https://www.csir.co.za", "aff_unique_abbr": "Wits;CSIR", "aff_campus_unique_index": "0+1;0+1;0;0+1", "aff_campus_unique": "Johannesburg;Pretoria", "aff_country_unique_index": "0+0;0+0;0;0+0", "aff_country_unique": "South Africa" }, { "title": "Compositional Fairness Constraints for Graph Embeddings", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3936", "id": "3936", "author_site": "Avishek Bose, William Hamilton", "author": "Avishek Bose; William Hamilton", "abstract": "Learning high-quality node embeddings is a key building block for machine learning models that operate on graph data, such as 
social networks and recommender systems. However, existing graph embedding techniques are unable to cope with fairness constraints, e.g., ensuring that the learned representations do not correlate with certain attributes, such as age or gender. Here, we introduce an adversarial framework to enforce fairness constraints on graph embeddings. Our approach is", "bibtex": "@InProceedings{pmlr-v97-bose19a,\n title = \t {Compositional Fairness Constraints for Graph Embeddings},\n author = {Bose, Avishek and Hamilton, William},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {715--724},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bose19a/bose19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bose19a.html},\n abstract = \t {Learning high-quality node embeddings is a key building block for machine learning models that operate on graph data, such as social networks and recommender systems. However, existing graph embedding techniques are unable to cope with fairness constraints, e.g., ensuring that the learned representations do not correlate with certain attributes, such as age or gender. Here, we introduce an adversarial framework to enforce fairness constraints on graph embeddings. Our approach is", "pdf": "http://proceedings.mlr.press/v97/bose19a/bose19a.pdf", "supp": "", "pdf_size": 390486, "gs_citation": 328, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2983154672519525426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "McGill University+Mila+Facebook AI Research; McGill University+Mila+Facebook AI Research", "aff_domain": "mail.mcgill.ca; ", "email": "mail.mcgill.ca; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/bose19a.html", "aff_unique_index": "0+1+2;0+1+2", "aff_unique_norm": "McGill University;Mila;Meta", "aff_unique_dep": ";Quebec Artificial Intelligence Institute;Facebook AI Research", "aff_unique_url": "https://www.mcgill.ca;https://mila.quebec;https://research.facebook.com", "aff_unique_abbr": "McGill;Mila;FAIR", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+1;0+0+1", "aff_country_unique": "Canada;United States" }, { "title": "Compressed Factorization: Fast and Accurate Low-Rank Factorization of Compressively-Sensed Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3747", "id": "3747", "author_site": "Vatsal Sharan, Kai Sheng Tai, Peter Bailis, Gregory Valiant", "author": "Vatsal Sharan; Kai Sheng Tai; Peter Bailis; Gregory Valiant", "abstract": "What learning algorithms can be run directly on compressively-sensed data? In this work, we consider the question of accurately and efficiently computing low-rank matrix or tensor factorizations given data compressed via random projections. We examine the approach of first performing factorization in the compressed domain, and then reconstructing the original high-dimensional factors from the recovered (compressed) factors. In both the matrix and tensor settings, we establish conditions under which this natural approach will provably recover the original factors. 
While it is well-known that random projections preserve a number of geometric properties of a dataset, our work can be viewed as showing that they can also preserve certain solutions of non-convex, NP-Hard problems like non-negative matrix factorization. We support these theoretical results with experiments on synthetic data and demonstrate the practical applicability of compressed factorization on real-world gene expression and EEG time series datasets.", "bibtex": "@InProceedings{pmlr-v97-sharan19a,\n title = \t {Compressed Factorization: Fast and Accurate Low-Rank Factorization of Compressively-Sensed Data},\n author = {Sharan, Vatsal and Tai, Kai Sheng and Bailis, Peter and Valiant, Gregory},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5690--5700},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/sharan19a/sharan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/sharan19a.html},\n abstract = \t {What learning algorithms can be run directly on compressively-sensed data? In this work, we consider the question of accurately and efficiently computing low-rank matrix or tensor factorizations given data compressed via random projections. We examine the approach of first performing factorization in the compressed domain, and then reconstructing the original high-dimensional factors from the recovered (compressed) factors. In both the matrix and tensor settings, we establish conditions under which this natural approach will provably recover the original factors. While it is well-known that random projections preserve a number of geometric properties of a dataset, our work can be viewed as showing that they can also preserve certain solutions of non-convex, NP-Hard problems like non-negative matrix factorization. We support these theoretical results with experiments on synthetic data and demonstrate the practical applicability of compressed factorization on real-world gene expression and EEG time series datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/sharan19a/sharan19a.pdf", "supp": "", "pdf_size": 609720, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15610085484638163367&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Stanford University; Stanford University; Stanford University; Stanford University", "aff_domain": "stanford.edu;cs.stanford.edu; ; ", "email": "stanford.edu;cs.stanford.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/sharan19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Compressing Gradient Optimizers via Count-Sketches", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3666", "id": "3666", "author_site": "Ryan Spring, Anastasios Kyrillidis, Vijai Mohan, Anshumali Shrivastava", "author": "Ryan Spring; Anastasios Kyrillidis; Vijai Mohan; Anshumali Shrivastava", "abstract": "Many popular first-order optimization methods accelerate the convergence rate of deep learning models. 
However, these algorithms require auxiliary variables, which cost additional memory proportional to the number of parameters in the model. The problem is becoming more severe as models grow larger to learn from complex, large-scale datasets. Our proposed solution is to maintain a linear sketch to compress the auxiliary variables. Our approach has the same performance as the full-sized baseline, while using less space for the auxiliary variables. Theoretically, we prove that count-sketch optimization maintains the SGD convergence rate, while gracefully reducing memory usage for large models. We show a rigorous evaluation on popular architectures such as ResNet-18 and Transformer-XL. On the 1-Billion Word dataset, we save 25% of the memory used during training (7.7 GB instead of 10.8 GB) with minimal accuracy and performance loss. For an Amazon extreme classification task with over 49.5 million classes, we also reduce the training time by 38%, by increasing the mini-batch size 3.5x using our count-sketch optimizer.", "bibtex": "@InProceedings{pmlr-v97-spring19a,\n title = \t {Compressing Gradient Optimizers via Count-Sketches},\n author = {Spring, Ryan and Kyrillidis, Anastasios and Mohan, Vijai and Shrivastava, Anshumali},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5946--5955},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/spring19a/spring19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/spring19a.html},\n abstract = \t {Many popular first-order optimization methods accelerate the convergence rate of deep learning models. However, these algorithms require auxiliary variables, which cost additional memory proportional to the number of parameters in the model. The problem is becoming more severe as models grow larger to learn from complex, large-scale datasets. Our proposed solution is to maintain a linear sketch to compress the auxiliary variables. Our approach has the same performance as the full-sized baseline, while using less space for the auxiliary variables. Theoretically, we prove that count-sketch optimization maintains the SGD convergence rate, while gracefully reducing memory usage for large models. We show a rigorous evaluation on popular architectures such as ResNet-18 and Transformer-XL. On the 1-Billion Word dataset, we save 25% of the memory used during training (7.7 GB instead of 10.8 GB) with minimal accuracy and performance loss. 
For an Amazon extreme classification task with over 49.5 million classes, we also reduce the training time by 38%, by increasing the mini-batch size 3.5x using our count-sketch optimizer.}\n}", "pdf": "http://proceedings.mlr.press/v97/spring19a/spring19a.pdf", "supp": "", "pdf_size": 632612, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1104222702149426557&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, Rice University, Houston, TX, USA+Amazon Search, Palo Alto, CA, USA; Department of Computer Science, Rice University, Houston, TX, USA; Amazon Search, Palo Alto, CA, USA; Department of Computer Science, Rice University, Houston, TX, USA+Amazon Search, Palo Alto, CA, USA", "aff_domain": "rice.edu; ; ; ", "email": "rice.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/spring19a.html", "aff_unique_index": "0+1;0;1;0+1", "aff_unique_norm": "Rice University;Amazon", "aff_unique_dep": "Department of Computer Science;Amazon Search", "aff_unique_url": "https://www.rice.edu;https://www.amazon.com", "aff_unique_abbr": "Rice;Amazon", "aff_campus_unique_index": "0+1;0;1;0+1", "aff_campus_unique": "Houston;Palo Alto", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Concentration Inequalities for Conditional Value at Risk", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4265", "id": "4265", "author_site": "Philip Thomas, Erik Learned-Miller", "author": "Philip Thomas; Erik Learned-Miller", "abstract": "In this paper we derive new concentration inequalities for the conditional value at risk (CVaR) of a random variable, and compare them to the previous state of the art (Brown, 2007). We show analytically that our lower bound is strictly tighter than Brown\u2019s, and empirically that this difference is significant. While our upper bound may be looser than Brown\u2019s in some cases, we show empirically that in most cases our bound is significantly tighter. After discussing when each upper bound is superior, we conclude with empirical results which suggest that both of our bounds will often be significantly tighter than Brown\u2019s.", "bibtex": "@InProceedings{pmlr-v97-thomas19a,\n title = \t {Concentration Inequalities for Conditional Value at Risk},\n author = {Thomas, Philip and Learned-Miller, Erik},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6225--6233},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/thomas19a/thomas19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/thomas19a.html},\n abstract = \t {In this paper we derive new concentration inequalities for the conditional value at risk (CVaR) of a random variable, and compare them to the previous state of the art (Brown, 2007). We show analytically that our lower bound is strictly tighter than Brown\u2019s, and empirically that this difference is significant. While our upper bound may be looser than Brown\u2019s in some cases, we show empirically that in most cases our bound is significantly tighter. 
After discussing when each upper bound is superior, we conclude with empirical results which suggest that both of our bounds will often be significantly tighter than Brown\u2019s.}\n}", "pdf": "http://proceedings.mlr.press/v97/thomas19a/thomas19a.pdf", "supp": "", "pdf_size": 4004664, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10022679265658656887&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "College of Information and Computer Sciences, University of Massachusetts Amherst; College of Information and Computer Sciences, University of Massachusetts Amherst", "aff_domain": "cs.umass.com; ", "email": "cs.umass.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/thomas19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Massachusetts Amherst", "aff_unique_dep": "College of Information and Computer Sciences", "aff_unique_url": "https://www.umass.edu", "aff_unique_abbr": "UMass Amherst", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Concrete Autoencoders: Differentiable Feature Selection and Reconstruction", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4217", "id": "4217", "author_site": "Muhammed Fatih Bal\u0131n, Abubakar Abid, James Zou", "author": "Muhammed Fatih Bal\u0131n; Abubakar Abid; James Zou", "abstract": "We introduce the concrete autoencoder, an end-to-end differentiable method for global feature selection, which efficiently identifies a subset of the most informative features and simultaneously learns a neural network to reconstruct the input data from the selected features. Our method is unsupervised, and is based on using a concrete selector layer as the encoder and using a standard neural network as the decoder. During the training phase, the temperature of the concrete selector layer is gradually decreased, which encourages a user-specified number of discrete features to be learned; during test time, the selected features can be used with the decoder network to reconstruct the remaining input features. We evaluate concrete autoencoders on a variety of datasets, where they significantly outperform state-of-the-art methods for feature selection and data reconstruction. In particular, on a large-scale gene expression dataset, the concrete autoencoder selects a small subset of genes whose expression levels can be used to impute the expression levels of the remaining genes; in doing so, it improves on the current widely-used expert-curated L1000 landmark genes, potentially reducing measurement costs by 20%. 
The concrete autoencoder can be implemented by adding just a few lines of code to a standard autoencoder, and the code for the algorithm and experiments is publicly available.", "bibtex": "@InProceedings{pmlr-v97-balin19a,\n title = \t {Concrete Autoencoders: Differentiable Feature Selection and Reconstruction},\n author = {Bal{\\i}n, Muhammed Fatih and Abid, Abubakar and Zou, James},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {444--453},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/balin19a/balin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/balin19a.html},\n abstract = \t {We introduce the concrete autoencoder, an end-to-end differentiable method for global feature selection, which efficiently identifies a subset of the most informative features and simultaneously learns a neural network to reconstruct the input data from the selected features. Our method is unsupervised, and is based on using a concrete selector layer as the encoder and using a standard neural network as the decoder. During the training phase, the temperature of the concrete selector layer is gradually decreased, which encourages a user-specified number of discrete features to be learned; during test time, the selected features can be used with the decoder network to reconstruct the remaining input features. We evaluate concrete autoencoders on a variety of datasets, where they significantly outperform state-of-the-art methods for feature selection and data reconstruction. In particular, on a large-scale gene expression dataset, the concrete autoencoder selects a small subset of genes whose expression levels can be used to impute the expression levels of the remaining genes; in doing so, it improves on the current widely-used expert-curated L1000 landmark genes, potentially reducing measurement costs by 20%. 
The concrete autoencoder can be implemented by adding just a few lines of code to a standard autoencoder, and the code for the algorithm and experiments is publicly available.}\n}", "pdf": "http://proceedings.mlr.press/v97/balin19a/balin19a.pdf", "supp": "", "pdf_size": 951383, "gs_citation": 193, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7315289169215618460&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Electrical Engineering, Stanford University, Stanford, United States + Department of Biomedical Data Sciences, Stanford University, Stanford, United States + Chan-Zuckerberg Biohub, San Francisco, United States; Department of Computer Engineering, Bogazici University, Istanbul, Turkey; Department of Electrical Engineering, Stanford University, Stanford, United States + Department of Biomedical Data Sciences, Stanford University, Stanford, United States + Chan-Zuckerberg Biohub, San Francisco, United States", "aff_domain": "stanford.edu; ;stanford.edu", "email": "stanford.edu; ;stanford.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/balin19a.html", "aff_unique_index": "0+0+1;2;0+0+1", "aff_unique_norm": "Stanford University;Chan-Zuckerberg Biohub;Bogazici University", "aff_unique_dep": "Department of Electrical Engineering;;Department of Computer Engineering", "aff_unique_url": "https://www.stanford.edu;https://www.chanzuckerberg.com/biohub;https://www.boun.edu.tr", "aff_unique_abbr": "Stanford;;BU", "aff_campus_unique_index": "0+0+1;2;0+0+1", "aff_campus_unique": "Stanford;San Francisco;Istanbul", "aff_country_unique_index": "0+0+0;1;0+0+0", "aff_country_unique": "United States;T\u00fcrkiye" }, { "title": "Conditional Gradient Methods via Stochastic Path-Integrated Differential Estimator", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4115", "id": "4115", "author_site": "Alp Yurtsever, Suvrit Sra, Volkan Cevher", "author": "Alp Yurtsever; Suvrit Sra; Volkan Cevher", "abstract": "We propose a class of variance-reduced stochastic conditional gradient methods. By adopting the recent stochastic path-integrated differential estimator technique (SPIDER) of Fang et. al. (2018) for the classical Frank-Wolfe (FW) method, we introduce SPIDER-FW for finite-sum minimization as well as the more general expectation minimization problems. SPIDER-FW enjoys superior complexity guarantees in the non-convex setting, while matching the best known FW variants in the convex case. We also extend our framework a la conditional gradient sliding (CGS) of Lan & Zhou. (2016), and propose SPIDER-CGS.", "bibtex": "@InProceedings{pmlr-v97-yurtsever19b,\n title = \t {Conditional Gradient Methods via Stochastic Path-Integrated Differential Estimator},\n author = {Yurtsever, Alp and Sra, Suvrit and Cevher, Volkan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7282--7291},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yurtsever19b/yurtsever19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/yurtsever19b.html},\n abstract = \t {We propose a class of variance-reduced stochastic conditional gradient methods. By adopting the recent stochastic path-integrated differential estimator technique (SPIDER) of Fang et. al. 
(2018) for the classical Frank-Wolfe (FW) method, we introduce SPIDER-FW for finite-sum minimization as well as the more general expectation minimization problems. SPIDER-FW enjoys superior complexity guarantees in the non-convex setting, while matching the best known FW variants in the convex case. We also extend our framework a la conditional gradient sliding (CGS) of Lan & Zhou. (2016), and propose SPIDER-CGS.}\n}", "pdf": "http://proceedings.mlr.press/v97/yurtsever19b/yurtsever19b.pdf", "supp": "", "pdf_size": 278384, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5872662151081598882&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Ecole Polytechnique F\u00e9d\u00e9rale de Lausanne, Switzerland; Massachusetts Institute of Technology, USA; Ecole Polytechnique F\u00e9d\u00e9rale de Lausanne, Switzerland", "aff_domain": "epfl.ch; ; ", "email": "epfl.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/yurtsever19b.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "EPFL;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://web.mit.edu", "aff_unique_abbr": "EPFL;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;United States" }, { "title": "Conditional Independence in Testing Bayesian Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4324", "id": "4324", "author_site": "Yujia Shen, Haiying Huang, Arthur Choi, Adnan Darwiche", "author": "Yujia Shen; Haiying Huang; Arthur Choi; Adnan Darwiche", "abstract": "Testing Bayesian Networks (TBNs) were introduced recently to represent a set of distributions, one of which is selected based on the given evidence and used for reasoning. TBNs are more expressive than classical Bayesian Networks (BNs): Marginal queries correspond to multi-linear functions in BNs and to piecewise multi-linear functions in TBNs. Moreover, TBN queries are universal approximators, like neural networks. In this paper, we study conditional independence in TBNs, showing that it can be inferred from d-separation as in BNs. We also study the role of TBN expressiveness and independence in dealing with the problem of learning with incomplete models (i.e., ones that miss nodes or edges from the data-generating model). Finally, we illustrate our results on a number of concrete examples, including a case study on Hidden Markov Models.", "bibtex": "@InProceedings{pmlr-v97-shen19a,\n title = \t {Conditional Independence in Testing {B}ayesian Networks},\n author = {Shen, Yujia and Huang, Haiying and Choi, Arthur and Darwiche, Adnan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5701--5709},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shen19a/shen19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/shen19a.html},\n abstract = \t {Testing Bayesian Networks (TBNs) were introduced recently to represent a set of distributions, one of which is selected based on the given evidence and used for reasoning. 
TBNs are more expressive than classical Bayesian Networks (BNs): Marginal queries correspond to multi-linear functions in BNs and to piecewise multi-linear functions in TBNs. Moreover, TBN queries are universal approximators, like neural networks. In this paper, we study conditional independence in TBNs, showing that it can be inferred from d-separation as in BNs. We also study the role of TBN expressiveness and independence in dealing with the problem of learning with incomplete models (i.e., ones that miss nodes or edges from the data-generating model). Finally, we illustrate our results on a number of concrete examples, including a case study on Hidden Markov Models.}\n}", "pdf": "http://proceedings.mlr.press/v97/shen19a/shen19a.pdf", "supp": "", "pdf_size": 1153608, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11992169934050187622&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Computer Science Department, University of California, Los Angeles, California, USA; Computer Science Department, University of California, Los Angeles, California, USA; Computer Science Department, University of California, Los Angeles, California, USA; Computer Science Department, University of California, Los Angeles, California, USA", "aff_domain": "cs.ucla.edu; ; ; ", "email": "cs.ucla.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/shen19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Conditioning by adaptive sampling for robust design", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4261", "id": "4261", "author_site": "David Brookes, Jennifer Listgarten", "author": "David Brookes; Hahnbeom Park; Jennifer Listgarten", "abstract": "We present a method for design problems wherein the goal is to maximize or specify the value of one or more properties of interest (e.g. maximizing the fluorescence of a protein). We assume access to black box, stochastic \u201coracle\" predictive functions, each of which maps from design space to a distribution over properties of interest. Because many state-of-the-art predictive models are known to suffer from pathologies, especially for data far from the training distribution, the problem becomes different from directly optimizing the oracles. 
Herein, we propose a method to solve this problem that uses model-based adaptive sampling to estimate a distribution over the design space, conditioned on the desired properties.", "bibtex": "@InProceedings{pmlr-v97-brookes19a,\n title = \t {Conditioning by adaptive sampling for robust design},\n author = {Brookes, David and Park, Hahnbeom and Listgarten, Jennifer},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {773--782},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/brookes19a/brookes19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/brookes19a.html},\n abstract = \t {We present a method for design problems wherein the goal is to maximize or specify the value of one or more properties of interest (e.g. maximizing the fluorescence of a protein). We assume access to black box, stochastic \u201coracle\" predictive functions, each of which maps from design space to a distribution over properties of interest. Because many state-of-the-art predictive models are known to suffer from pathologies, especially for data far from the training distribution, the problem becomes different from directly optimizing the oracles. Herein, we propose a method to solve this problem that uses model-based adaptive sampling to estimate a distribution over the design space, conditioned on the desired properties.}\n}", "pdf": "http://proceedings.mlr.press/v97/brookes19a/brookes19a.pdf", "supp": "", "pdf_size": 2596445, "gs_citation": 260, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=481579979328113877&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Biophysics Graduate Group, UC Berkeley, CA+EECS Department, UC Berkeley, CA; Department of Biochemistry, University of Washington, Seattle, WA+Institute for Protein Design, University of Washington, Seattle, WA; EECS Department, UC Berkeley, CA", "aff_domain": "berkeley.edu;washington.edu;berkeley.edu", "email": "berkeley.edu;washington.edu;berkeley.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/brookes19a.html", "aff_unique_index": "0+0;1+1;0", "aff_unique_norm": "University of California, Berkeley;University of Washington", "aff_unique_dep": "Biophysics Graduate Group;Department of Biochemistry", "aff_unique_url": "https://www.berkeley.edu;https://www.washington.edu", "aff_unique_abbr": "UC Berkeley;UW", "aff_campus_unique_index": "0+0;1+1;0", "aff_campus_unique": "Berkeley;Seattle", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Connectivity-Optimized Representation Learning via Persistent Homology", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3818", "id": "3818", "author_site": "Christoph Hofer, Roland Kwitt, Marc Niethammer, Mandar Dixit", "author": "Christoph Hofer; Roland Kwitt; Marc Niethammer; Mandar Dixit", "abstract": "We study the problem of learning representations with controllable connectivity properties. This is beneficial in situations when the imposed structure can be leveraged upstream. In particular, we control the connectivity of an autoencoder\u2019s latent space via a novel type of loss, operating on information from persistent homology. 
Under mild conditions, this loss is differentiable and we present a theoretical analysis of the properties induced by the loss. We choose one-class learning as our upstream task and demonstrate that the imposed structure enables informed parameter selection for modeling the in-class distribution via kernel density estimators. Evaluated on computer vision data, these one-class models exhibit competitive performance and, in a low sample size regime, outperform other methods by a large margin. Notably, our results indicate that a single autoencoder, trained on auxiliary (unlabeled) data, yields a mapping into latent space that can be reused across datasets for one-class learning.", "bibtex": "@InProceedings{pmlr-v97-hofer19a,\n title = \t {Connectivity-Optimized Representation Learning via Persistent Homology},\n author = {Hofer, Christoph and Kwitt, Roland and Niethammer, Marc and Dixit, Mandar},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2751--2760},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hofer19a/hofer19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hofer19a.html},\n abstract = \t {We study the problem of learning representations with controllable connectivity properties. This is beneficial in situations when the imposed structure can be leveraged upstream. In particular, we control the connectivity of an autoencoder\u2019s latent space via a novel type of loss, operating on information from persistent homology. Under mild conditions, this loss is differentiable and we present a theoretical analysis of the properties induced by the loss. We choose one-class learning as our upstream task and demonstrate that the imposed structure enables informed parameter selection for modeling the in-class distribution via kernel density estimators. Evaluated on computer vision data, these one-class models exhibit competitive performance and, in a low sample size regime, outperform other methods by a large margin. 
Notably, our results indicate that a single autoencoder, trained on auxiliary (unlabeled) data, yields a mapping into latent space that can be reused across datasets for one-class learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/hofer19a/hofer19a.pdf", "supp": "", "pdf_size": 3596541, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6723358631694302455&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, University of Salzburg, Austria; Department of Computer Science, University of Salzburg, Austria; Microsoft; UNC Chapel Hill", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/hofer19a.html", "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Salzburg;Microsoft;University of North Carolina at Chapel Hill", "aff_unique_dep": "Department of Computer Science;Microsoft Corporation;", "aff_unique_url": "https://www.uni-salzburg.at;https://www.microsoft.com;https://www.unc.edu", "aff_unique_abbr": ";Microsoft;UNC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Chapel Hill", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "Austria;United States" }, { "title": "Context-Aware Zero-Shot Learning for Object Recognition", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4039", "id": "4039", "author_site": "Eloi Zablocki, Patrick Bordes, Laure Soulier, Benjamin Piwowarski, Patrick Gallinari", "author": "Eloi Zablocki; Patrick Bordes; Laure Soulier; Benjamin Piwowarski; Patrick Gallinari", "abstract": "Zero-Shot Learning (ZSL) aims at classifying unlabeled objects by leveraging auxiliary knowledge, such as semantic representations. A limitation of previous approaches is that only intrinsic properties of objects, e.g. their visual appearance, are taken into account while their context, e.g. the surrounding objects in the image, is ignored. Following the intuitive principle that objects tend to be found in certain contexts but not others, we propose a new and challenging approach, context-aware ZSL, that leverages semantic representations in a new way to model the conditional likelihood of an object to appear in a given context. Finally, through extensive experiments conducted on Visual Genome, we show that contextual information can substantially improve the standard ZSL approach and is robust to unbalanced classes.", "bibtex": "@InProceedings{pmlr-v97-zablocki19a,\n title = \t {Context-Aware Zero-Shot Learning for Object Recognition},\n author = {Zablocki, Eloi and Bordes, Patrick and Soulier, Laure and Piwowarski, Benjamin and Gallinari, Patrick},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7292--7303},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zablocki19a/zablocki19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zablocki19a.html},\n abstract = \t {Zero-Shot Learning (ZSL) aims at classifying unlabeled objects by leveraging auxiliary knowledge, such as semantic representations. A limitation of previous approaches is that only intrinsic properties of objects, e.g. their visual appearance, are taken into account while their context, e.g. the surrounding objects in the image, is ignored. 
Following the intuitive principle that objects tend to be found in certain contexts but not others, we propose a new and challenging approach, context-aware ZSL, that leverages semantic representations in a new way to model the conditional likelihood of an object to appear in a given context. Finally, through extensive experiments conducted on Visual Genome, we show that contextual information can substantially improve the standard ZSL approach and is robust to unbalanced classes.}\n}", "pdf": "http://proceedings.mlr.press/v97/zablocki19a/zablocki19a.pdf", "supp": "", "pdf_size": 1941291, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6406523453815676924&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Sorbonne Universit\u00e9, CNRS, Laboratoire d\u2019Informatique de Paris 6, LIP6, F-75005 Paris, France; Sorbonne Universit\u00e9, CNRS, Laboratoire d\u2019Informatique de Paris 6, LIP6, F-75005 Paris, France; Sorbonne Universit\u00e9, CNRS, Laboratoire d\u2019Informatique de Paris 6, LIP6, F-75005 Paris, France; Sorbonne Universit\u00e9, CNRS, Laboratoire d\u2019Informatique de Paris 6, LIP6, F-75005 Paris, France; Sorbonne Universit\u00e9, CNRS, Laboratoire d\u2019Informatique de Paris 6, LIP6, F-75005 Paris, France + Criteo AI Lab, Paris", "aff_domain": "lip6.fr; ; ; ; ", "email": "lip6.fr; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/zablocki19a.html", "aff_unique_index": "0;0;0;0;0+1", "aff_unique_norm": "Sorbonne Universit\u00e9;Criteo", "aff_unique_dep": "Laboratoire d\u2019Informatique de Paris 6;Criteo AI Lab", "aff_unique_url": "https://www.sorbonne-universite.fr;https://www.criteo.com", "aff_unique_abbr": "Sorbonne U;Criteo", "aff_campus_unique_index": "0;0;0;0;0+0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0;0;0;0;0+0", "aff_country_unique": "France" }, { "title": "Contextual Memory Trees", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4174", "id": "4174", "author_site": "Wen Sun, Alina Beygelzimer, Hal Daum\u00e9 III, John Langford, Paul Mineiro", "author": "Wen Sun; Alina Beygelzimer; Hal Daum\u00e9 Iii; John Langford; Paul Mineiro", "abstract": "We design and study a Contextual Memory Tree (CMT), a learning memory controller that inserts new memories into an experience store of unbounded size. It operates online and is designed to efficiently query for memories from that store, supporting logarithmic time insertion and retrieval operations. Hence CMT can be integrated into existing statistical learning algorithms as an augmented memory unit without substantially increasing training and inference computation. Furthermore CMT operates as a reduction to classification, allowing it to benefit from advances in representation or architecture. We demonstrate the efficacy of CMT by augmenting existing multi-class and multi-label classification algorithms with CMT and observe statistical improvement. 
We also test CMT learning on several image-captioning tasks to demonstrate that it performs computationally better than a simple nearest neighbors memory system while benefitting from reward learning.", "bibtex": "@InProceedings{pmlr-v97-sun19a,\n title = \t {Contextual Memory Trees},\n author = {Sun, Wen and Beygelzimer, Alina and Iii, Hal Daum{\\'e} and Langford, John and Mineiro, Paul},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6026--6035},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/sun19a/sun19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/sun19a.html},\n abstract = \t {We design and study a Contextual Memory Tree (CMT), a learning memory controller that inserts new memories into an experience store of unbounded size. It operates online and is designed to efficiently query for memories from that store, supporting logarithmic time insertion and retrieval operations. Hence CMT can be integrated into existing statistical learning algorithms as an augmented memory unit without substantially increasing training and inference computation. Furthermore CMT operates as a reduction to classification, allowing it to benefit from advances in representation or architecture. We demonstrate the efficacy of CMT by augmenting existing multi-class and multi-label classification algorithms with CMT and observe statistical improvement. We also test CMT learning on several image-captioning tasks to demonstrate that it performs computationally better than a simple nearest neighbors memory system while benefitting from reward learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/sun19a/sun19a.pdf", "supp": "", "pdf_size": 842729, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15142240583529015949&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Robotics Institute, Carnegie Mellon University, USA; Yahoo! Research, New York, NY, USA; Microsoft Research, New York, NY, USA; Microsoft Research, New York, NY, USA; Microsoft, USA", "aff_domain": "cs.cmu.edu; ; ; ; ", "email": "cs.cmu.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/sun19a.html", "aff_unique_index": "0;1;2;2;2", "aff_unique_norm": "Carnegie Mellon University;Yahoo! Research;Microsoft", "aff_unique_dep": "Robotics Institute;;Microsoft Research", "aff_unique_url": "https://www.cmu.edu;https://research.yahoo.com;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "CMU;Yahoo! Res;MSR", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";New York", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Contextual Multi-armed Bandit Algorithm for Semiparametric Reward Model", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3696", "id": "3696", "author_site": "Gi-Soo Kim, Myunghee Cho Paik", "author": "Gi-Soo Kim; Myunghee Cho Paik", "abstract": "Contextual multi-armed bandit (MAB) algorithms have been shown promising for maximizing cumulative rewards in sequential decision tasks such as news article recommendation systems, web page ad placement algorithms, and mobile health. 
However, most of the proposed contextual MAB algorithms assume linear relationships between the reward and the context of the action. This paper proposes a new contextual MAB algorithm for a relaxed, semiparametric reward model that supports nonstationarity. The proposed method is less restrictive, easier to implement and faster than two alternative algorithms that consider the same model, while achieving a tight regret upper bound. We prove that the high-probability upper bound of the regret incurred by the proposed algorithm has the same order as the Thompson sampling algorithm for linear reward models. The proposed and existing algorithms are evaluated via simulation and also applied to Yahoo! news article recommendation log data.", "bibtex": "@InProceedings{pmlr-v97-kim19d,\n title = \t {Contextual Multi-armed Bandit Algorithm for Semiparametric Reward Model},\n author = {Kim, Gi-Soo and Paik, Myunghee Cho},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3389--3397},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kim19d/kim19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/kim19d.html},\n abstract = \t {Contextual multi-armed bandit (MAB) algorithms have been shown promising for maximizing cumulative rewards in sequential decision tasks such as news article recommendation systems, web page ad placement algorithms, and mobile health. However, most of the proposed contextual MAB algorithms assume linear relationships between the reward and the context of the action. This paper proposes a new contextual MAB algorithm for a relaxed, semiparametric reward model that supports nonstationarity. The proposed method is less restrictive, easier to implement and faster than two alternative algorithms that consider the same model, while achieving a tight regret upper bound. We prove that the high-probability upper bound of the regret incurred by the proposed algorithm has the same order as the Thompson sampling algorithm for linear reward models. The proposed and existing algorithms are evaluated via simulation and also applied to Yahoo! 
news article recommendation log data.}\n}", "pdf": "http://proceedings.mlr.press/v97/kim19d/kim19d.pdf", "supp": "", "pdf_size": 412720, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15798031005034659587&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Statistics, Seoul National University, Seoul, Korea; Department of Statistics, Seoul National University, Seoul, Korea", "aff_domain": "snu.ac.kr;snu.ac.kr", "email": "snu.ac.kr;snu.ac.kr", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/kim19d.html", "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Control Regularization for Reduced Variance Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3640", "id": "3640", "author_site": "Richard Cheng, Abhinav Verma, Gabor Orosz, Swarat Chaudhuri, Yisong Yue, Joel Burdick", "author": "Richard Cheng; Abhinav Verma; Gabor Orosz; Swarat Chaudhuri; Yisong Yue; Joel Burdick", "abstract": "Dealing with high variance is a significant challenge in model-free reinforcement learning (RL). Existing methods are unreliable, exhibiting high variance in performance from run to run using different initializations/seeds. Focusing on problems arising in continuous control, we propose a functional regularization approach to augmenting model-free RL. In particular, we regularize the behavior of the deep policy to be similar to a policy prior, i.e., we regularize in function space. We show that functional regularization yields a bias-variance trade-off, and propose an adaptive tuning strategy to optimize this trade-off. When the policy prior has control-theoretic stability guarantees, we further show that this regularization approximately preserves those stability guarantees throughout learning. We validate our approach empirically on a range of settings, and demonstrate significantly reduced variance, guaranteed dynamic stability, and more efficient learning than deep RL alone.", "bibtex": "@InProceedings{pmlr-v97-cheng19a,\n title = \t {Control Regularization for Reduced Variance Reinforcement Learning},\n author = {Cheng, Richard and Verma, Abhinav and Orosz, Gabor and Chaudhuri, Swarat and Yue, Yisong and Burdick, Joel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1141--1150},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cheng19a/cheng19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cheng19a.html},\n abstract = \t {Dealing with high variance is a significant challenge in model-free reinforcement learning (RL). Existing methods are unreliable, exhibiting high variance in performance from run to run using different initializations/seeds. Focusing on problems arising in continuous control, we propose a functional regularization approach to augmenting model-free RL. In particular, we regularize the behavior of the deep policy to be similar to a policy prior, i.e., we regularize in function space. 
We show that functional regularization yields a bias-variance trade-off, and propose an adaptive tuning strategy to optimize this trade-off. When the policy prior has control-theoretic stability guarantees, we further show that this regularization approximately preserves those stability guarantees throughout learning. We validate our approach empirically on a range of settings, and demonstrate significantly reduced variance, guaranteed dynamic stability, and more efficient learning than deep RL alone.}\n}", "pdf": "http://proceedings.mlr.press/v97/cheng19a/cheng19a.pdf", "supp": "", "pdf_size": 1211702, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4210711157444974813&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 19, "aff": "California Institute of Technology; Rice University; University of Michigan; Rice University; California Institute of Technology; California Institute of Technology", "aff_domain": "caltech.edu; ; ; ;caltech.edu;caltech.edu", "email": "caltech.edu; ; ; ;caltech.edu;caltech.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/cheng19a.html", "aff_unique_index": "0;1;2;1;0;0", "aff_unique_norm": "California Institute of Technology;Rice University;University of Michigan", "aff_unique_dep": ";;", "aff_unique_url": "https://www.caltech.edu;https://www.rice.edu;https://www.umich.edu", "aff_unique_abbr": "Caltech;Rice;UM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Pasadena;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Convolutional Poisson Gamma Belief Network", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3613", "id": "3613", "author_site": "CHAOJIE WANG, Bo Chen, SUCHENG XIAO, Mingyuan Zhou", "author": "Chaojie Wang; Bo Chen; Sucheng Xiao; Mingyuan Zhou", "abstract": "For text analysis, one often resorts to a lossy representation that either completely ignores word order or embeds each word as a low-dimensional dense feature vector. In this paper, we propose convolutional Poisson factor analysis (CPFA) that directly operates on a lossless representation that processes the words in each document as a sequence of high-dimensional one-hot vectors. To boost its performance, we further propose the convolutional Poisson gamma belief network (CPGBN) that couples CPFA with the gamma belief network via a novel probabilistic pooling layer. CPFA forms words into phrases and captures very specific phrase-level topics, and CPGBN further builds a hierarchy of increasingly more general phrase-level topics. For efficient inference, we develop both a Gibbs sampler and a Weibull distribution based convolutional variational auto-encoder. 
Experimental results demonstrate that CPGBN can extract high-quality text latent representations that capture the word order information, and hence can be leveraged as a building block to enrich a wide variety of existing latent variable models that ignore word order.", "bibtex": "@InProceedings{pmlr-v97-wang19b,\n title = \t {Convolutional Poisson Gamma Belief Network},\n author = {Wang, Chaojie and Chen, Bo and Xiao, Sucheng and Zhou, Mingyuan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6515--6525},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19b/wang19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19b.html},\n abstract = \t {For text analysis, one often resorts to a lossy representation that either completely ignores word order or embeds each word as a low-dimensional dense feature vector. In this paper, we propose convolutional Poisson factor analysis (CPFA) that directly operates on a lossless representation that processes the words in each document as a sequence of high-dimensional one-hot vectors. To boost its performance, we further propose the convolutional Poisson gamma belief network (CPGBN) that couples CPFA with the gamma belief network via a novel probabilistic pooling layer. CPFA forms words into phrases and captures very specific phrase-level topics, and CPGBN further builds a hierarchy of increasingly more general phrase-level topics. For efficient inference, we develop both a Gibbs sampler and a Weibull distribution based convolutional variational auto-encoder. 
Experimental results demonstrate that CPGBN can extract high-quality text latent representations that capture the word order information, and hence can be leveraged as a building block to enrich a wide variety of existing latent variable models that ignore word order.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19b/wang19b.pdf", "supp": "", "pdf_size": 623361, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15155863973996783420&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "National Laboratory of Radar Signal Processing, Collaborative Innovation Center of Information Sensing and Understanding, Xidian University, Xi'an, Shaanxi, China; National Laboratory of Radar Signal Processing, Collaborative Innovation Center of Information Sensing and Understanding, Xidian University, Xi'an, Shaanxi, China; National Laboratory of Radar Signal Processing, Collaborative Innovation Center of Information Sensing and Understanding, Xidian University, Xi'an, Shaanxi, China; McCombs School of Business, The University of Texas at Austin, Austin, Texas 78712, USA", "aff_domain": "mail.xidian.edu.cn; ; ; ", "email": "mail.xidian.edu.cn; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/wang19b.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Xidian University;University of Texas at Austin", "aff_unique_dep": "National Laboratory of Radar Signal Processing;McCombs School of Business", "aff_unique_url": "http://www.xidian.edu.cn/;https://www.mccombs.utexas.edu", "aff_unique_abbr": "Xidian;UT Austin", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Xi'an;Austin", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Coresets for Ordered Weighted Clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3750", "id": "3750", "author_site": "Vladimir Braverman, Shaofeng Jiang, Robert Krauthgamer, Xuan Wu", "author": "Vladimir Braverman; Shaofeng H.-C. Jiang; Robert Krauthgamer; Xuan Wu", "abstract": "We design coresets for Ordered k-Median, a generalization of classical clustering problems such as k-Median and k-Center. Its objective function is defined via the Ordered Weighted Averaging (OWA) paradigm of Yager (1988), where data points are weighted according to a predefined weight vector, but in order of their contribution to the objective (distance from the centers). A powerful data-reduction technique, called a coreset, is to summarize a point set $X$ in $\\mathbb{R}^d$ into a small (weighted) point set $X\u2019$, such that for every set of $k$ potential centers, the objective value of the coreset $X\u2019$ approximates that of $X$ within factor $1\\pm \\epsilon$. When there are multiple objectives (weights), the above standard coreset might have limited usefulness, whereas in a", "bibtex": "@InProceedings{pmlr-v97-braverman19a,\n title = \t {Coresets for Ordered Weighted Clustering},\n author = {Braverman, Vladimir and Jiang, Shaofeng H.-C. 
and Krauthgamer, Robert and Wu, Xuan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {744--753},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/braverman19a/braverman19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/braverman19a.html},\n abstract = \t {We design coresets for Ordered k-Median, a generalization of classical clustering problems such as k-Median and k-Center. Its objective function is defined via the Ordered Weighted Averaging (OWA) paradigm of Yager (1988), where data points are weighted according to a predefined weight vector, but in order of their contribution to the objective (distance from the centers). A powerful data-reduction technique, called a coreset, is to summarize a point set $X$ in $\\mathbb{R}^d$ into a small (weighted) point set $X\u2019$, such that for every set of $k$ potential centers, the objective value of the coreset $X\u2019$ approximates that of $X$ within factor $1\\pm \\epsilon$. When there are multiple objectives (weights), the above standard coreset might have limited usefulness, whereas in a", "pdf": "http://proceedings.mlr.press/v97/braverman19a/braverman19a.pdf", "supp": "", "pdf_size": 1351360, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7122855710729422427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Johns Hopkins University, USA; Weizmann Institute of Science, Israel; Weizmann Institute of Science, Israel; Johns Hopkins University, USA", "aff_domain": "cs.jhu.edu;weizmann.ac.il;weizmann.ac.il;jhu.edu", "email": "cs.jhu.edu;weizmann.ac.il;weizmann.ac.il;jhu.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/braverman19a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Johns Hopkins University;Weizmann Institute of Science", "aff_unique_dep": ";", "aff_unique_url": "https://www.jhu.edu;https://www.weizmann.org.il", "aff_unique_abbr": "JHU;Weizmann", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;Israel" }, { "title": "Correlated Variational Auto-Encoders", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3953", "id": "3953", "author_site": "Da Tang, Dawen Liang, Tony Jebara, Nicholas Ruozzi", "author": "Da Tang; Dawen Liang; Tony Jebara; Nicholas Ruozzi", "abstract": "Variational Auto-Encoders (VAEs) are capable of learning latent representations for high dimensional data. However, due to the i.i.d. assumption, VAEs only optimize the singleton variational distributions and fail to account for the correlations between data points, which might be crucial for learning latent representations from dataset where a priori we know correlations exist. We propose Correlated Variational Auto-Encoders (CVAEs) that can take the correlation structure into consideration when learning latent representations with VAEs. CVAEs apply a prior based on the correlation structure. To address the intractability introduced by the correlated prior, we develop an approximation by average of a set of tractable lower bounds over all maximal acyclic subgraphs of the undirected correlation graph. 
Experimental results on matching and link prediction on public benchmark rating datasets and spectral clustering on a synthetic dataset show the effectiveness of the proposed method over baseline algorithms.", "bibtex": "@InProceedings{pmlr-v97-tang19b,\n title = \t {Correlated Variational Auto-Encoders},\n author = {Tang, Da and Liang, Dawen and Jebara, Tony and Ruozzi, Nicholas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6135--6144},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tang19b/tang19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/tang19b.html},\n abstract = \t {Variational Auto-Encoders (VAEs) are capable of learning latent representations for high dimensional data. However, due to the i.i.d. assumption, VAEs only optimize the singleton variational distributions and fail to account for the correlations between data points, which might be crucial for learning latent representations from dataset where a priori we know correlations exist. We propose Correlated Variational Auto-Encoders (CVAEs) that can take the correlation structure into consideration when learning latent representations with VAEs. CVAEs apply a prior based on the correlation structure. To address the intractability introduced by the correlated prior, we develop an approximation by average of a set of tractable lower bounds over all maximal acyclic subgraphs of the undirected correlation graph. Experimental results on matching and link prediction on public benchmark rating datasets and spectral clustering on a synthetic dataset show the effectiveness of the proposed method over baseline algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/tang19b/tang19b.pdf", "supp": "", "pdf_size": 489113, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14520356175099829641&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/tang19b.html" }, { "title": "Correlated bandits or: How to minimize mean-squared error online", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3870", "id": "3870", "author_site": "Vinay Praneeth Boda, Prashanth L.A.", "author": "Vinay Praneeth Boda; Prashanth L.A.", "abstract": "While the objective in traditional multi-armed bandit problems is to find the arm with the highest mean, in many settings, finding an arm that best captures information about other arms is of interest. This objective, however, requires learning the underlying correlation structure and not just the means. Sensors placement for industrial surveillance and cellular network monitoring are a few applications, where the underlying correlation structure plays an important role. Motivated by such applications, we formulate the correlated bandit problem, where the objective is to find the arm with the lowest mean-squared error (MSE) in estimating all the arms. To this end, we derive first an MSE estimator based on sample variances/covariances and show that our estimator exponentially concentrates around the true MSE. 
Under a best-arm identification framework, we propose a successive rejects type algorithm and provide bounds on the probability of error in identifying the best arm. Using minimax theory, we also derive fundamental performance limits for the correlated bandit problem.", "bibtex": "@InProceedings{pmlr-v97-boda19a,\n title = \t {Correlated bandits or: How to minimize mean-squared error online},\n author = {Boda, Vinay Praneeth and L.A., Prashanth},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {686--694},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/boda19a/boda19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/boda19a.html},\n abstract = \t {While the objective in traditional multi-armed bandit problems is to find the arm with the highest mean, in many settings, finding an arm that best captures information about other arms is of interest. This objective, however, requires learning the underlying correlation structure and not just the means. Sensors placement for industrial surveillance and cellular network monitoring are a few applications, where the underlying correlation structure plays an important role. Motivated by such applications, we formulate the correlated bandit problem, where the objective is to find the arm with the lowest mean-squared error (MSE) in estimating all the arms. To this end, we derive first an MSE estimator based on sample variances/covariances and show that our estimator exponentially concentrates around the true MSE. Under a best-arm identification framework, we propose a successive rejects type algorithm and provide bounds on the probability of error in identifying the best arm. Using minimax theory, we also derive fundamental performance limits for the correlated bandit problem.}\n}", "pdf": "http://proceedings.mlr.press/v97/boda19a/boda19a.pdf", "supp": "", "pdf_size": 361087, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1594458966073933425&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "LinkedIn Corp. 
+ University of Maryland College Park; Department of Computer Science and Engineering, Indian Institute of Technology Madras", "aff_domain": "gmail.com;cse.iitm.ac.in", "email": "gmail.com;cse.iitm.ac.in", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/boda19a.html", "aff_unique_index": "0+1;2", "aff_unique_norm": "LinkedIn Corporation;University of Maryland;Indian Institute of Technology Madras", "aff_unique_dep": ";;Department of Computer Science and Engineering", "aff_unique_url": "https://www.linkedin.com;https://www.umd.edu;https://www.iitm.ac.in", "aff_unique_abbr": "LinkedIn;UMD;IIT Madras", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";College Park;Madras", "aff_country_unique_index": "0+0;1", "aff_country_unique": "United States;India" }, { "title": "Counterfactual Off-Policy Evaluation with Gumbel-Max Structural Causal Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4272", "id": "4272", "author_site": "Michael Oberst, David Sontag", "author": "Michael Oberst; David Sontag", "abstract": "We introduce an off-policy evaluation procedure for highlighting episodes where applying a reinforcement learned (RL) policy is likely to have produced a substantially different outcome than the observed policy. In particular, we introduce a class of structural causal models (SCMs) for generating counterfactual trajectories in finite partially observable Markov Decision Processes (POMDPs). We see this as a useful procedure for off-policy \u201cdebugging\u201d in high-risk settings (e.g., healthcare); by decomposing the expected difference in reward between the RL and observed policy into specific episodes, we can identify episodes where the counterfactual difference in reward is most dramatic. This in turn can be used to facilitate review of specific episodes by domain experts. 
We demonstrate the utility of this procedure with a synthetic environment of sepsis management.}\n}", "pdf": "http://proceedings.mlr.press/v97/oberst19a/oberst19a.pdf", "supp": "", "pdf_size": 1486830, "gs_citation": 187, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3302653893277553179&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA; CSAIL, Massachusetts Institute of Technology, Cambridge, MA, USA", "aff_domain": "mit.edu; ", "email": "mit.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/oberst19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory (CSAIL)", "aff_unique_url": "https://www.csail.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Counterfactual Visual Explanations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4068", "id": "4068", "author_site": "Yash Goyal, Ziyan Wu, Jan Ernst, Dhruv Batra, Devi Parikh, Stefan Lee", "author": "Yash Goyal; Ziyan Wu; Jan Ernst; Dhruv Batra; Devi Parikh; Stefan Lee", "abstract": "In this work, we develop a technique to produce counterfactual visual explanations. Given a \u2018query\u2019 image $I$ for which a vision system predicts class $c$, a counterfactual visual explanation identifies how $I$ could change such that the system would output a different specified class $c\u2019$. To do this, we select a \u2018distractor\u2019 image $I\u2019$ that the system predicts as class $c\u2019$ and identify spatial regions in $I$ and $I\u2019$ such that replacing the identified region in $I$ with the identified region in $I\u2019$ would push the system towards classifying $I$ as $c\u2019$. We apply our approach to multiple image classification datasets generating qualitative results showcasing the interpretability and discriminativeness of our counterfactual explanations. To explore the effectiveness of our explanations in teaching humans, we present machine teaching experiments for the task of fine-grained bird classification. We find that users trained to distinguish bird species fare better when given access to counterfactual explanations in addition to training examples.", "bibtex": "@InProceedings{pmlr-v97-goyal19a,\n title = \t {Counterfactual Visual Explanations},\n author = {Goyal, Yash and Wu, Ziyan and Ernst, Jan and Batra, Dhruv and Parikh, Devi and Lee, Stefan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2376--2384},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/goyal19a/goyal19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/goyal19a.html},\n abstract = \t {In this work, we develop a technique to produce counterfactual visual explanations. Given a \u2018query\u2019 image $I$ for which a vision system predicts class $c$, a counterfactual visual explanation identifies how $I$ could change such that the system would output a different specified class $c\u2019$. 
To do this, we select a \u2018distractor\u2019 image $I\u2019$ that the system predicts as class $c\u2019$ and identify spatial regions in $I$ and $I\u2019$ such that replacing the identified region in $I$ with the identified region in $I\u2019$ would push the system towards classifying $I$ as $c\u2019$. We apply our approach to multiple image classification datasets generating qualitative results showcasing the interpretability and discriminativeness of our counterfactual explanations. To explore the effectiveness of our explanations in teaching humans, we present machine teaching experiments for the task of fine-grained bird classification. We find that users trained to distinguish bird species fare better when given access to counterfactual explanations in addition to training examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/goyal19a/goyal19a.pdf", "supp": "", "pdf_size": 6650499, "gs_citation": 672, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10015728464424437212&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Georgia Institute of Technology; Siemens Corporation; Siemens Corporation; Georgia Institute of Technology; Georgia Institute of Technology; Georgia Institute of Technology", "aff_domain": "gatech.edu; ; ; ; ; ", "email": "gatech.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/goyal19a.html", "aff_unique_index": "0;1;1;0;0;0", "aff_unique_norm": "Georgia Institute of Technology;Siemens AG", "aff_unique_dep": ";", "aff_unique_url": "https://www.gatech.edu;https://www.siemens.com", "aff_unique_abbr": "Georgia Tech;Siemens", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;0;0", "aff_country_unique": "United States;Germany" }, { "title": "Cross-Domain 3D Equivariant Image Embeddings", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3729", "id": "3729", "author_site": "Carlos Esteves, Avneesh Sud, Zhengyi Luo, Kostas Daniilidis, Ameesh Makadia", "author": "Carlos Esteves; Avneesh Sud; Zhengyi Luo; Kostas Daniilidis; Ameesh Makadia", "abstract": "Spherical convolutional networks have been introduced recently as tools to learn powerful feature representations of 3D shapes. Spherical CNNs are equivariant to 3D rotations making them ideally suited to applications where 3D data may be observed in arbitrary orientations. In this paper we learn 2D image embeddings with a similar equivariant structure: embedding the image of a 3D object should commute with rotations of the object. We introduce a cross-domain embedding from 2D images into a spherical CNN latent space. This embedding encodes images with 3D shape properties and is equivariant to 3D rotations of the observed object. The model is supervised only by target embeddings obtained from a spherical CNN pretrained for 3D shape classification. 
We show that learning a rich embedding for images with appropriate geometric structure is sufficient for tackling varied applications, such as relative pose estimation and novel view synthesis, without requiring additional task-specific supervision.", "bibtex": "@InProceedings{pmlr-v97-esteves19a,\n title = \t {Cross-Domain 3{D} Equivariant Image Embeddings},\n author = {Esteves, Carlos and Sud, Avneesh and Luo, Zhengyi and Daniilidis, Kostas and Makadia, Ameesh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1812--1822},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/esteves19a/esteves19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/esteves19a.html},\n abstract = \t {Spherical convolutional networks have been introduced recently as tools to learn powerful feature representations of 3D shapes. Spherical CNNs are equivariant to 3D rotations making them ideally suited to applications where 3D data may be observed in arbitrary orientations. In this paper we learn 2D image embeddings with a similar equivariant structure: embedding the image of a 3D object should commute with rotations of the object. We introduce a cross-domain embedding from 2D images into a spherical CNN latent space. This embedding encodes images with 3D shape properties and is equivariant to 3D rotations of the observed object. The model is supervised only by target embeddings obtained from a spherical CNN pretrained for 3D shape classification. We show that learning a rich embedding for images with appropriate geometric structure is sufficient for tackling varied applications, such as relative pose estimation and novel view synthesis, without requiring additional task-specific supervision.}\n}", "pdf": "http://proceedings.mlr.press/v97/esteves19a/esteves19a.pdf", "supp": "", "pdf_size": 2014585, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9110203752347650368&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "GRASP Laboratory, University of Pennsylvania+Google; Google Research; GRASP Laboratory, University of Pennsylvania; GRASP Laboratory, University of Pennsylvania; Google Research", "aff_domain": "seas.upenn.edu; ; ; ; ", "email": "seas.upenn.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/esteves19a.html", "aff_unique_index": "0+1;1;0;0;1", "aff_unique_norm": "University of Pennsylvania;Google", "aff_unique_dep": "GRASP Laboratory;Google", "aff_unique_url": "https://www.upenn.edu;https://www.google.com", "aff_unique_abbr": "UPenn;Google", "aff_campus_unique_index": "0+1;1;0;0;1", "aff_campus_unique": "Philadelphia;Mountain View", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Curiosity-Bottleneck: Exploration By Distilling Task-Specific Novelty", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3829", "id": "3829", "author_site": "Youngjin Kim, Daniel Nam, Hyunwoo Kim, Ji-Hoon Kim, Gunhee Kim", "author": "Youngjin Kim; Wontae Nam; Hyunwoo Kim; Ji-Hoon Kim; Gunhee Kim", "abstract": "Exploration based on state novelty has brought great success in challenging reinforcement learning problems with sparse rewards. 
However, existing novelty-based strategies become inefficient in real-world problems where observation contains not only task-dependent state novelty of our interest but also task-irrelevant information that should be ignored. We introduce an information-theoretic exploration strategy named Curiosity-Bottleneck that distills task-relevant information from observation. Based on the information bottleneck principle, our exploration bonus is quantified as the compressiveness of observation with respect to the learned representation of a compressive value network. With extensive experiments on static image classification, grid-world and three hard-exploration Atari games, we show that Curiosity-Bottleneck learns an effective exploration strategy by robustly measuring the state novelty in distractive environments where state-of-the-art exploration methods often degenerate.", "bibtex": "@InProceedings{pmlr-v97-kim19c,\n title = \t {Curiosity-Bottleneck: Exploration By Distilling Task-Specific Novelty},\n author = {Kim, Youngjin and Nam, Wontae and Kim, Hyunwoo and Kim, Ji-Hoon and Kim, Gunhee},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3379--3388},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kim19c/kim19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/kim19c.html},\n abstract = \t {Exploration based on state novelty has brought great success in challenging reinforcement learning problems with sparse rewards. However, existing novelty-based strategies become inefficient in real-world problems where observation contains not only task-dependent state novelty of our interest but also task-irrelevant information that should be ignored. We introduce an information-theoretic exploration strategy named Curiosity-Bottleneck that distills task-relevant information from observation. Based on the information bottleneck principle, our exploration bonus is quantified as the compressiveness of observation with respect to the learned representation of a compressive value network. With extensive experiments on static image classification, grid-world and three hard-exploration Atari games, we show that Curiosity-Bottleneck learns an effective exploration strategy by robustly measuring the state novelty in distractive environments where state-of-the-art exploration methods often degenerate.}\n}", "pdf": "http://proceedings.mlr.press/v97/kim19c/kim19c.pdf", "supp": "", "pdf_size": 2593489, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9413002291649085757&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/kim19c.html" }, { "title": "Curvature-Exploiting Acceleration of Elastic Net Computations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4157", "id": "4157", "author_site": "Vien Mai, Mikael Johansson", "author": "Vien Mai; Mikael Johansson", "abstract": "This paper introduces an efficient second-order method for solving the elastic net problem. Its key innovation is a computationally efficient technique for injecting curvature information in the optimization process which admits a strong theoretical performance guarantee. 
In particular, we show improved run time over popular first-order methods and quantify the speed-up in terms of statistical measures of the data matrix. The improved time complexity is the result of an extensive exploitation of the problem structure and a careful combination of second-order information, variance reduction techniques, and momentum acceleration. Beside theoretical speed-up, experimental results demonstrate great practical performance benefits of curvature information, especially for ill-conditioned data sets.", "bibtex": "@InProceedings{pmlr-v97-mai19a,\n title = \t {Curvature-Exploiting Acceleration of Elastic Net Computations},\n author = {Mai, Vien and Johansson, Mikael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4294--4303},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mai19a/mai19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mai19a.html},\n abstract = \t {This paper introduces an efficient second-order method for solving the elastic net problem. Its key innovation is a computationally efficient technique for injecting curvature information in the optimization process which admits a strong theoretical performance guarantee. In particular, we show improved run time over popular first-order methods and quantify the speed-up in terms of statistical measures of the data matrix. The improved time complexity is the result of an extensive exploitation of the problem structure and a careful combination of second-order information, variance reduction techniques, and momentum acceleration. 
Beside theoretical speed-up, experimental results demonstrate great practical performance benefits of curvature information, especially for ill-conditioned data sets.}\n}", "pdf": "http://proceedings.mlr.press/v97/mai19a/mai19a.pdf", "supp": "", "pdf_size": 397892, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fKpsk0QVGE0J:scholar.google.com/&scioq=Curvature-Exploiting+Acceleration+of+Elastic+Net+Computations&hl=en&as_sdt=0,5", "gs_version_total": 8, "aff": "Department of Automatic Control, School of Electrical Engineering and Computer Science, Royal Institute of Technology (KTH), Stockholm, Sweden; Department of Automatic Control, School of Electrical Engineering and Computer Science, Royal Institute of Technology (KTH), Stockholm, Sweden", "aff_domain": "kth.se;kth.se", "email": "kth.se;kth.se", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/mai19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Royal Institute of Technology (KTH)", "aff_unique_dep": "Department of Automatic Control", "aff_unique_url": "https://www.kth.se", "aff_unique_abbr": "KTH", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stockholm", "aff_country_unique_index": "0;0", "aff_country_unique": "Sweden" }, { "title": "DAG-GNN: DAG Structure Learning with Graph Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4301", "id": "4301", "author_site": "Yue Yu, Jie Chen, Tian Gao, Mo Yu", "author": "Yue Yu; Jie Chen; Tian Gao; Mo Yu", "abstract": "Learning a faithful directed acyclic graph (DAG) from samples of a joint distribution is a challenging combinatorial problem, owing to the intractable search space superexponential in the number of graph nodes. A recent breakthrough formulates the problem as a continuous optimization with a structural constraint that ensures acyclicity (Zheng et al., 2018). The authors apply the approach to the linear structural equation model (SEM) and the least-squares loss function that are statistically well justified but nevertheless limited. Motivated by the widespread success of deep learning that is capable of capturing complex nonlinear mappings, in this work we propose a deep generative model and apply a variant of the structural constraint to learn the DAG. At the heart of the generative model is a variational autoencoder parameterized by a novel graph neural network architecture, which we coin DAG-GNN. In addition to the richer capacity, an advantage of the proposed model is that it naturally handles discrete variables as well as vector-valued ones. We demonstrate that on synthetic data sets, the proposed method learns more accurate graphs for nonlinearly generated samples; and on benchmark data sets with discrete variables, the learned graphs are reasonably close to the global optima. 
The code is available at \\url{https://github.com/fishmoon1234/DAG-GNN}.", "bibtex": "@InProceedings{pmlr-v97-yu19a,\n title = \t {{DAG}-{GNN}: {DAG} Structure Learning with Graph Neural Networks},\n author = {Yu, Yue and Chen, Jie and Gao, Tian and Yu, Mo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7154--7163},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yu19a/yu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yu19a.html},\n abstract = \t {Learning a faithful directed acyclic graph (DAG) from samples of a joint distribution is a challenging combinatorial problem, owing to the intractable search space superexponential in the number of graph nodes. A recent breakthrough formulates the problem as a continuous optimization with a structural constraint that ensures acyclicity (Zheng et al., 2018). The authors apply the approach to the linear structural equation model (SEM) and the least-squares loss function that are statistically well justified but nevertheless limited. Motivated by the widespread success of deep learning that is capable of capturing complex nonlinear mappings, in this work we propose a deep generative model and apply a variant of the structural constraint to learn the DAG. At the heart of the generative model is a variational autoencoder parameterized by a novel graph neural network architecture, which we coin DAG-GNN. In addition to the richer capacity, an advantage of the proposed model is that it naturally handles discrete variables as well as vector-valued ones. We demonstrate that on synthetic data sets, the proposed method learns more accurate graphs for nonlinearly generated samples; and on benchmark data sets with discrete variables, the learned graphs are reasonably close to the global optima. The code is available at \\url{https://github.com/fishmoon1234/DAG-GNN}.}\n}", "pdf": "http://proceedings.mlr.press/v97/yu19a/yu19a.pdf", "supp": "", "pdf_size": 614649, "gs_citation": 622, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12962909633357312064&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "aff": "Lehigh University; MIT-IBM Watson AI Lab + IBM Research; IBM Research; IBM Research", "aff_domain": "lehigh.edu;us.ibm.com; ; ", "email": "lehigh.edu;us.ibm.com; ; ", "github": "https://github.com/fishmoon1234/DAG-GNN", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/yu19a.html", "aff_unique_index": "0;1+2;2;2", "aff_unique_norm": "Lehigh University;Massachusetts Institute of Technology;IBM", "aff_unique_dep": ";IBM Watson AI Lab;IBM Research", "aff_unique_url": "https://www.lehigh.edu;https://www.mitibmwatsonailab.org;https://www.ibm.com/research", "aff_unique_abbr": "Lehigh;MIT-IBM AI Lab;IBM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "DBSCAN++: Towards fast and scalable density clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3978", "id": "3978", "author_site": "Jennifer Jang, Heinrich Jiang", "author": "Jennifer Jang; Heinrich Jiang", "abstract": "DBSCAN is a classical density-based clustering procedure with tremendous practical relevance. 
However, DBSCAN implicitly needs to compute the empirical density for each sample point, leading to a quadratic worst-case time complexity, which is too slow on large datasets. We propose DBSCAN++, a simple modification of DBSCAN which only requires computing the densities for a chosen subset of points. We show empirically that, compared to traditional DBSCAN, DBSCAN++ can provide not only competitive performance but also added robustness in the bandwidth hyperparameter while taking a fraction of the runtime. We also present statistical consistency guarantees showing the trade-off between computational cost and estimation rates. Surprisingly, up to a certain point, we can enjoy the same estimation rates while lowering computational cost, showing that DBSCAN++ is a sub-quadratic algorithm that attains minimax optimal rates for level-set estimation, a quality that may be of independent interest.", "bibtex": "@InProceedings{pmlr-v97-jang19a,\n title = \t {{DBSCAN}++: Towards fast and scalable density clustering},\n author = {Jang, Jennifer and Jiang, Heinrich},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3019--3029},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jang19a/jang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jang19a.html},\n abstract = \t {DBSCAN is a classical density-based clustering procedure with tremendous practical relevance. However, DBSCAN implicitly needs to compute the empirical density for each sample point, leading to a quadratic worst-case time complexity, which is too slow on large datasets. We propose DBSCAN++, a simple modification of DBSCAN which only requires computing the densities for a chosen subset of points. We show empirically that, compared to traditional DBSCAN, DBSCAN++ can provide not only competitive performance but also added robustness in the bandwidth hyperparameter while taking a fraction of the runtime. We also present statistical consistency guarantees showing the trade-off between computational cost and estimation rates. 
Surprisingly, up to a certain point, we can enjoy the same estimation rates while lowering computational cost, showing that DBSCAN++ is a sub-quadratic algorithm that attains minimax optimal rates for level-set estimation, a quality that may be of independent interest.}\n}", "pdf": "http://proceedings.mlr.press/v97/jang19a/jang19a.pdf", "supp": "", "pdf_size": 2683105, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=852483317070308792&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Uber; Google Research", "aff_domain": "gmail.com; ", "email": "gmail.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/jang19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Uber Technologies Inc.;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.uber.com;https://research.google", "aff_unique_abbr": "Uber;Google Research", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "DL2: Training and Querying Neural Networks with Logic", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4331", "id": "4331", "author_site": "Marc Fischer, Mislav Balunovic, Dana Drachsler-Cohen, Timon Gehr, Ce Zhang, Martin Vechev", "author": "Marc Fischer; Mislav Balunovic; Dana Drachsler-Cohen; Timon Gehr; Ce Zhang; Martin Vechev", "abstract": "We present DL2, a system for training and querying neural networks with logical constraints. Using DL2, one can declaratively specify domain knowledge constraints to be enforced during training, as well as pose queries on the model to find inputs that satisfy a set of constraints. DL2 works by translating logical constraints into a loss function with desirable mathematical properties. The loss is then minimized with standard gradient-based methods. We evaluate DL2 by training networks with interesting constraints in unsupervised, semi-supervised and supervised settings. Our experimental evaluation demonstrates that DL2 is more expressive than prior approaches combining logic and neural networks, and its loss functions are better suited for optimization. Further, we show that for a number of queries, DL2 can find the desired inputs in seconds (even for large models such as ResNet-50 on ImageNet).", "bibtex": "@InProceedings{pmlr-v97-fischer19a,\n title = \t {{DL}2: Training and Querying Neural Networks with Logic},\n author = {Fischer, Marc and Balunovic, Mislav and Drachsler-Cohen, Dana and Gehr, Timon and Zhang, Ce and Vechev, Martin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1931--1941},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/fischer19a/fischer19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/fischer19a.html},\n abstract = \t {We present DL2, a system for training and querying neural networks with logical constraints. Using DL2, one can declaratively specify domain knowledge constraints to be enforced during training, as well as pose queries on the model to find inputs that satisfy a set of constraints. DL2 works by translating logical constraints into a loss function with desirable mathematical properties. 
The loss is then minimized with standard gradient-based methods. We evaluate DL2 by training networks with interesting constraints in unsupervised, semi-supervised and supervised settings. Our experimental evaluation demonstrates that DL2 is more expressive than prior approaches combining logic and neural networks, and its loss functions are better suited for optimization. Further, we show that for a number of queries, DL2 can find the desired inputs in seconds (even for large models such as ResNet-50 on ImageNet).}\n}", "pdf": "http://proceedings.mlr.press/v97/fischer19a/fischer19a.pdf", "supp": "", "pdf_size": 478200, "gs_citation": 231, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14132347304439794371&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, ETH Zurich, Switzerland; Department of Computer Science, ETH Zurich, Switzerland; Department of Computer Science, ETH Zurich, Switzerland; Department of Computer Science, ETH Zurich, Switzerland; Department of Computer Science, ETH Zurich, Switzerland; Department of Computer Science, ETH Zurich, Switzerland", "aff_domain": "inf.ethz.ch; ; ; ; ; ", "email": "inf.ethz.ch; ; ; ; ; ", "github": "https://github.com/eth-sri/dl2", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/fischer19a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "DP-GP-LVM: A Bayesian Non-Parametric Model for Learning Multivariate Dependency Structures", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3566", "id": "3566", "author_site": "Andrew R Lawrence, Carl Henrik Ek, Neill Campbell", "author": "Andrew Lawrence; Carl Henrik Ek; Neill Campbell", "abstract": "We present a non-parametric Bayesian latent variable model capable of learning dependency structures across dimensions in a multivariate setting. Our approach is based on flexible Gaussian process priors for the generative mappings and interchangeable Dirichlet process priors to learn the structure. The introduction of the Dirichlet process as a specific structural prior allows our model to circumvent issues associated with previous Gaussian process latent variable models. Inference is performed by deriving an efficient variational bound on the marginal log-likelihood of the model. 
We demonstrate the efficacy of our approach via analysis of discovered structure and superior quantitative performance on missing data imputation.", "bibtex": "@InProceedings{pmlr-v97-lawrence19a,\n title = \t {{DP}-{GP}-{LVM}: A {B}ayesian Non-Parametric Model for Learning Multivariate Dependency Structures},\n author = {Lawrence, Andrew and Ek, Carl Henrik and Campbell, Neill},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3682--3691},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lawrence19a/lawrence19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lawrence19a.html},\n abstract = \t {We present a non-parametric Bayesian latent variable model capable of learning dependency structures across dimensions in a multivariate setting. Our approach is based on flexible Gaussian process priors for the generative mappings and interchangeable Dirichlet process priors to learn the structure. The introduction of the Dirichlet process as a specific structural prior allows our model to circumvent issues associated with previous Gaussian process latent variable models. Inference is performed by deriving an efficient variational bound on the marginal log-likelihood of the model. We demonstrate the efficacy of our approach via analysis of discovered structure and superior quantitative performance on missing data imputation.}\n}", "pdf": "http://proceedings.mlr.press/v97/lawrence19a/lawrence19a.pdf", "supp": "", "pdf_size": 1758959, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18201970691685461865&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Dept. of Computer Science, University of Bath, UK; Dept. of Computer Science, University of Bristol, UK; Dept. of Computer Science, University of Bath, UK", "aff_domain": "bath.ac.uk; ; ", "email": "bath.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/lawrence19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Bath;University of Bristol", "aff_unique_dep": "Dept. of Computer Science;Dept. of Computer Science", "aff_unique_url": "https://www.bath.ac.uk;https://www.bristol.ac.uk", "aff_unique_abbr": "Bath;Bristol", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Data Poisoning Attacks in Multi-Party Learning", "author": "Saeed Mahloujifar, Mohammad Mahmoody, Ameer Mohammed", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3817", "id": "3817" }, { "title": "Data Poisoning Attacks on Stochastic Bandits", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3961", "id": "3961", "author_site": "Fang Liu, Ness Shroff", "author": "Fang Liu; Ness Shroff", "abstract": "Stochastic multi-armed bandits form a class of online learning problems that have important applications in online recommendation systems, adaptive medical treatment, and many others. Even though potential attacks against these learning algorithms may hijack their behavior, causing catastrophic loss in real-world applications, little is known about adversarial attacks on bandit algorithms. 
In this paper, we propose a framework of offline attacks on bandit algorithms and study convex optimization based attacks on several popular bandit algorithms. We show that the attacker can force the bandit algorithm to pull a target arm with high probability by a slight manipulation of the rewards in the data. Then we study a form of online attacks on bandit algorithms and propose an adaptive attack strategy against any bandit algorithm without the knowledge of the bandit algorithm. Our adaptive attack strategy can hijack the behavior of the bandit algorithm to suffer a linear regret with only a logarithmic cost to the attacker. Our results demonstrate a significant security threat to stochastic bandits.", "bibtex": "@InProceedings{pmlr-v97-liu19e,\n title = \t {Data Poisoning Attacks on Stochastic Bandits},\n author = {Liu, Fang and Shroff, Ness},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4042--4050},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liu19e/liu19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/liu19e.html},\n abstract = \t {Stochastic multi-armed bandits form a class of online learning problems that have important applications in online recommendation systems, adaptive medical treatment, and many others. Even though potential attacks against these learning algorithms may hijack their behavior, causing catastrophic loss in real-world applications, little is known about adversarial attacks on bandit algorithms. In this paper, we propose a framework of offline attacks on bandit algorithms and study convex optimization based attacks on several popular bandit algorithms. We show that the attacker can force the bandit algorithm to pull a target arm with high probability by a slight manipulation of the rewards in the data. Then we study a form of online attacks on bandit algorithms and propose an adaptive attack strategy against any bandit algorithm without the knowledge of the bandit algorithm. Our adaptive attack strategy can hijack the behavior of the bandit algorithm to suffer a linear regret with only a logarithmic cost to the attacker. 
Our results demonstrate a significant security threat to stochastic bandits.}\n}", "pdf": "http://proceedings.mlr.press/v97/liu19e/liu19e.pdf", "supp": "", "pdf_size": 594349, "gs_citation": 132, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13493179141751095636&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Electrical and Computer Engineering; Department of Electrical and Computer Engineering + Department of Computer Science and Engineering, The Ohio State University", "aff_domain": "osu.edu;osu.edu", "email": "osu.edu;osu.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/liu19e.html", "aff_unique_index": "0;0+1", "aff_unique_norm": "Unknown Institution;Ohio State University", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Computer Science and Engineering", "aff_unique_url": ";https://www.osu.edu", "aff_unique_abbr": ";OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1", "aff_country_unique": ";United States" }, { "title": "Data Shapley: Equitable Valuation of Data for Machine Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4290", "id": "4290", "author_site": "Amirata Ghorbani, James Zou", "author": "Amirata Ghorbani; James Zou", "abstract": "As data becomes the fuel driving technological and economic growth, a fundamental challenge is how to quantify the value of data in algorithmic predictions and decisions. For example, in healthcare and consumer markets, it has been suggested that individuals should be compensated for the data that they generate, but it is not clear what is an equitable valuation for individual data. In this work, we develop a principled framework to address data valuation in the context of supervised machine learning. Given a learning algorithm trained on $n$ data points to produce a predictor, we propose data Shapley as a metric to quantify the value of each training datum to the predictor performance. Data Shapley uniquely satisfies several natural properties of equitable data valuation. We develop Monte Carlo and gradient-based methods to efficiently estimate data Shapley values in practical settings where complex learning algorithms, including neural networks, are trained on large datasets. 
In addition to being equitable, extensive experiments across biomedical, image and synthetic data demonstrate that data Shapley has several other benefits: 1) it is more powerful than the popular leave-one-out or leverage score in providing insight on what data is more valuable for a given learning task; 2) low Shapley value data effectively capture outliers and corruptions; 3) high Shapley value data inform what type of new data to acquire to improve the predictor.", "bibtex": "@InProceedings{pmlr-v97-ghorbani19c,\n title = \t {Data Shapley: Equitable Valuation of Data for Machine Learning},\n author = {Ghorbani, Amirata and Zou, James},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2242--2251},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ghorbani19c/ghorbani19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/ghorbani19c.html},\n abstract = \t {As data becomes the fuel driving technological and economic growth, a fundamental challenge is how to quantify the value of data in algorithmic predictions and decisions. For example, in healthcare and consumer markets, it has been suggested that individuals should be compensated for the data that they generate, but it is not clear what is an equitable valuation for individual data. In this work, we develop a principled framework to address data valuation in the context of supervised machine learning. Given a learning algorithm trained on $n$ data points to produce a predictor, we propose data Shapley as a metric to quantify the value of each training datum to the predictor performance. Data Shapley uniquely satisfies several natural properties of equitable data valuation. We develop Monte Carlo and gradient-based methods to efficiently estimate data Shapley values in practical settings where complex learning algorithms, including neural networks, are trained on large datasets. 
In addition to being equitable, extensive experiments across biomedical, image and synthetic data demonstrate that data Shapley has several other benefits: 1) it is more powerful than the popular leave-one-out or leverage score in providing insight on what data is more valuable for a given learning task; 2) low Shapley value data effectively capture outliers and corruptions; 3) high Shapley value data inform what type of new data to acquire to improve the predictor.}\n}", "pdf": "http://proceedings.mlr.press/v97/ghorbani19c/ghorbani19c.pdf", "supp": "", "pdf_size": 1545184, "gs_citation": 1067, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7645060584356925514&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Electrical Engineering, Stanford University, Stanford, CA, USA; Department of Biomedical Data Science, Stanford University, Stanford, CA, USA", "aff_domain": "stanford.edu;stanford.edu", "email": "stanford.edu;stanford.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/ghorbani19c.html", "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Dead-ends and Secure Exploration in Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3651", "id": "3651", "author_site": "Mehdi Fatemi, Shikhar Sharma, Harm van Seijen, Samira Ebrahimi Kahou", "author": "Mehdi Fatemi; Shikhar Sharma; Harm Van Seijen; Samira Ebrahimi Kahou", "abstract": "Many interesting applications of reinforcement learning (RL) involve MDPs that include numerous \u201cdead-end\" states. Upon reaching a dead-end state, the agent continues to interact with the environment in a dead-end trajectory before reaching an undesired terminal state, regardless of whatever actions are chosen. The situation is even worse when existence of many dead-end states is coupled with distant positive rewards from any initial state (we term this as Bridge Effect). Hence, conventional exploration techniques often incur prohibitively many training steps before convergence. To deal with the bridge effect, we propose a condition for exploration, called security. We next establish formal results that translate the security condition into the learning problem of an auxiliary value function. This new value function is used to cap \u201cany\" given exploration policy and is guaranteed to make it secure. As a special case, we use this theory and introduce secure random-walk. We next extend our results to the deep RL settings by identifying and addressing two main challenges that arise. 
Finally, we empirically compare secure random-walk with standard benchmarks in two sets of experiments including the Atari game of Montezuma\u2019s Revenge.", "bibtex": "@InProceedings{pmlr-v97-fatemi19a,\n title = \t {Dead-ends and Secure Exploration in Reinforcement Learning},\n author = {Fatemi, Mehdi and Sharma, Shikhar and Van Seijen, Harm and Kahou, Samira Ebrahimi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1873--1881},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/fatemi19a/fatemi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/fatemi19a.html},\n abstract = \t {Many interesting applications of reinforcement learning (RL) involve MDPs that include numerous \u201cdead-end\" states. Upon reaching a dead-end state, the agent continues to interact with the environment in a dead-end trajectory before reaching an undesired terminal state, regardless of whatever actions are chosen. The situation is even worse when existence of many dead-end states is coupled with distant positive rewards from any initial state (we term this as Bridge Effect). Hence, conventional exploration techniques often incur prohibitively many training steps before convergence. To deal with the bridge effect, we propose a condition for exploration, called security. We next establish formal results that translate the security condition into the learning problem of an auxiliary value function. This new value function is used to cap \u201cany\" given exploration policy and is guaranteed to make it secure. As a special case, we use this theory and introduce secure random-walk. We next extend our results to the deep RL settings by identifying and addressing two main challenges that arise. 
Finally, we empirically compare secure random-walk with standard benchmarks in two sets of experiments including the Atari game of Montezuma\u2019s Revenge.}\n}", "pdf": "http://proceedings.mlr.press/v97/fatemi19a/fatemi19a.pdf", "supp": "", "pdf_size": 1893816, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2424288316964890518&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Microsoft Research, 2000 McGill College Avenue, Suite 550, Montr\u00e9al, QC H3A 3H3, Canada; Microsoft Research, 2000 McGill College Avenue, Suite 550, Montr\u00e9al, QC H3A 3H3, Canada; Microsoft Research, 2000 McGill College Avenue, Suite 550, Montr\u00e9al, QC H3A 3H3, Canada; McGill University, 845 Sherbrooke Street West, Montr\u00e9al, QC H3A 0G4, Canada", "aff_domain": "microsoft.com; ; ; ", "email": "microsoft.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/fatemi19a.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Microsoft;McGill University", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.mcgill.ca", "aff_unique_abbr": "MSR;McGill", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Montr\u00e9al", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Decentralized Exploration in Multi-Armed Bandits", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3563", "id": "3563", "author_site": "Rapha\u00ebl F\u00e9raud, REDA ALAMI, Romain Laroche", "author": "Raphael Feraud; Reda Alami; Romain Laroche", "abstract": "We consider the decentralized exploration problem: a set of players collaborate to identify the best arm by asynchronously interacting with the same stochastic environment. The objective is to insure privacy in the best arm identification problem between asynchronous, collaborative, and thrifty players. In the context of a digital service, we advocate that this decentralized approach allows a good balance between conflicting interests: the providers optimize their services, while protecting privacy of users and saving resources. We define the privacy level as the amount of information an adversary could infer by intercepting all the messages concerning a single user. We provide a generic algorithm DECENTRALIZED ELIMINATION, which uses any best arm identification algorithm as a subroutine. We prove that this algorithm insures privacy, with a low communication cost, and that in comparison to the lower bound of the best arm identification problem, its sample complexity suffers from a penalty depending on the inverse of the probability of the most frequent players. Then, thanks to the genericity of the approach, we extend the proposed algorithm to the non-stationary bandits. 
Finally, experiments illustrate and complete the analysis.", "bibtex": "@InProceedings{pmlr-v97-feraud19a,\n title = \t {Decentralized Exploration in Multi-Armed Bandits},\n author = {Feraud, Raphael and Alami, Reda and Laroche, Romain},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1901--1909},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/feraud19a/feraud19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/feraud19a.html},\n abstract = \t {We consider the decentralized exploration problem: a set of players collaborate to identify the best arm by asynchronously interacting with the same stochastic environment. The objective is to insure privacy in the best arm identification problem between asynchronous, collaborative, and thrifty players. In the context of a digital service, we advocate that this decentralized approach allows a good balance between conflicting interests: the providers optimize their services, while protecting privacy of users and saving resources. We define the privacy level as the amount of information an adversary could infer by intercepting all the messages concerning a single user. We provide a generic algorithm DECENTRALIZED ELIMINATION, which uses any best arm identification algorithm as a subroutine. We prove that this algorithm insures privacy, with a low communication cost, and that in comparison to the lower bound of the best arm identification problem, its sample complexity suffers from a penalty depending on the inverse of the probability of the most frequent players. Then, thanks to the genericity of the approach, we extend the proposed algorithm to the non-stationary bandits. Finally, experiments illustrate and complete the analysis.}\n}", "pdf": "http://proceedings.mlr.press/v97/feraud19a/feraud19a.pdf", "supp": "", "pdf_size": 382951, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3273240319661688462&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Orange Labs; Orange Labs; Microsoft Research", "aff_domain": "orange.com; ; ", "email": "orange.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/feraud19a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Orange;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.orange.com;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Orange;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "France;United States" }, { "title": "Decentralized Stochastic Optimization and Gossip Algorithms with Compressed Communication", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4005", "id": "4005", "author_site": "Anastasiia Koloskova, Sebastian Stich, Martin Jaggi", "author": "Anastasia Koloskova; Sebastian Stich; Martin Jaggi", "abstract": "We consider decentralized stochastic optimization with the objective function (e.g. data samples for machine learning tasks) being distributed over n machines that can only communicate to their neighbors on a fixed communication graph. To address the communication bottleneck, the nodes compress (e.g. quantize or sparsify) their model updates. 
We cover both unbiased and biased compression operators with quality denoted by \\delta <= 1 (\\delta=1 meaning no compression). We (i) propose a novel gossip-based stochastic gradient descent algorithm, CHOCO-SGD, that converges at rate O(1/(nT) + 1/(T \\rho^2 \\delta)^2) for strongly convex objectives, where T denotes the number of iterations and \\rho the eigengap of the connectivity matrix. We (ii) present a novel gossip algorithm, CHOCO-GOSSIP, for the average consensus problem that converges in time O(1/(\\rho^2\\delta) \\log (1/\\epsilon)) for accuracy \\epsilon > 0. This is (up to our knowledge) the first gossip algorithm that supports arbitrary compressed messages for \\delta > 0 and still exhibits linear convergence. We (iii) show in experiments that both of our algorithms do outperform the respective state-of-the-art baselines and CHOCO-SGD can reduce communication by at least two orders of magnitudes.", "bibtex": "@InProceedings{pmlr-v97-koloskova19a,\n title = \t {Decentralized Stochastic Optimization and Gossip Algorithms with Compressed Communication},\n author = {Koloskova, Anastasia and Stich, Sebastian and Jaggi, Martin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3478--3487},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/koloskova19a/koloskova19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/koloskova19a.html},\n abstract = \t {We consider decentralized stochastic optimization with the objective function (e.g. data samples for machine learning tasks) being distributed over n machines that can only communicate to their neighbors on a fixed communication graph. To address the communication bottleneck, the nodes compress (e.g. quantize or sparsify) their model updates. We cover both unbiased and biased compression operators with quality denoted by \\delta <= 1 (\\delta=1 meaning no compression). We (i) propose a novel gossip-based stochastic gradient descent algorithm, CHOCO-SGD, that converges at rate O(1/(nT) + 1/(T \\rho^2 \\delta)^2) for strongly convex objectives, where T denotes the number of iterations and \\rho the eigengap of the connectivity matrix. We (ii) present a novel gossip algorithm, CHOCO-GOSSIP, for the average consensus problem that converges in time O(1/(\\rho^2\\delta) \\log (1/\\epsilon)) for accuracy \\epsilon > 0. This is (up to our knowledge) the first gossip algorithm that supports arbitrary compressed messages for \\delta > 0 and still exhibits linear convergence. 
We (iii) show in experiments that both of our algorithms do outperform the respective state-of-the-art baselines and CHOCO-SGD can reduce communication by at least two orders of magnitudes.}\n}", "pdf": "http://proceedings.mlr.press/v97/koloskova19a/koloskova19a.pdf", "supp": "", "pdf_size": 3311131, "gs_citation": 609, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8799480302950548238&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "EPFL, Lausanne, Switzerland; EPFL, Lausanne, Switzerland; EPFL, Lausanne, Switzerland", "aff_domain": "epfl.ch; ; ", "email": "epfl.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/koloskova19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Decomposing feature-level variation with Covariate Gaussian Process Latent Variable Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4154", "id": "4154", "author_site": "Kaspar M\u00e4rtens, Kieran Campbell, Christopher Yau", "author": "Kaspar M\u00e4rtens; Kieran Campbell; Christopher Yau", "abstract": "The interpretation of complex high-dimensional data typically requires the use of dimensionality reduction techniques to extract explanatory low-dimensional representations. However, in many real-world problems these representations may not be sufficient to aid interpretation on their own, and it would be desirable to interpret the model in terms of the original features themselves. Our goal is to characterise how feature-level variation depends on latent low-dimensional representations, external covariates, and non-linear interactions between the two. In this paper, we propose to achieve this through a structured kernel decomposition in a hybrid Gaussian Process model which we call the Covariate Gaussian Process Latent Variable Model (c-GPLVM). We demonstrate the utility of our model on simulated examples and applications in disease progression modelling from high-dimensional gene expression data in the presence of additional phenotypes. In each setting we show how the c-GPLVM can extract low-dimensional structures from high-dimensional data sets whilst allowing a breakdown of feature-level variability that is not present in other commonly used dimensionality reduction approaches.", "bibtex": "@InProceedings{pmlr-v97-martens19a,\n title = \t {Decomposing feature-level variation with Covariate {G}aussian Process Latent Variable Models},\n author = {M{\\\"a}rtens, Kaspar and Campbell, Kieran and Yau, Christopher},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4372--4381},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/martens19a/martens19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/martens19a.html},\n abstract = \t {The interpretation of complex high-dimensional data typically requires the use of dimensionality reduction techniques to extract explanatory low-dimensional representations. 
However, in many real-world problems these representations may not be sufficient to aid interpretation on their own, and it would be desirable to interpret the model in terms of the original features themselves. Our goal is to characterise how feature-level variation depends on latent low-dimensional representations, external covariates, and non-linear interactions between the two. In this paper, we propose to achieve this through a structured kernel decomposition in a hybrid Gaussian Process model which we call the Covariate Gaussian Process Latent Variable Model (c-GPLVM). We demonstrate the utility of our model on simulated examples and applications in disease progression modelling from high-dimensional gene expression data in the presence of additional phenotypes. In each setting we show how the c-GPLVM can extract low-dimensional structures from high-dimensional data sets whilst allowing a breakdown of feature-level variability that is not present in other commonly used dimensionality reduction approaches.}\n}", "pdf": "http://proceedings.mlr.press/v97/martens19a/martens19a.pdf", "supp": "", "pdf_size": 3330022, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3291712378520398367&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "Department of Statistics, University of Oxford, UK; Department of Statistics, University of British Columbia, Canada+BC Cancer Agency, Canada+UBC Data Science Institute, Canada; The Alan Turing Institute, UK+Institute of Cancer and Genomic Sciences, University of Birmingham, UK", "aff_domain": "stats.ox.ac.uk;stat.ubc.ca;bham.ac.uk", "email": "stats.ox.ac.uk;stat.ubc.ca;bham.ac.uk", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/martens19a.html", "aff_unique_index": "0;1+2+1;3+4", "aff_unique_norm": "University of Oxford;University of British Columbia;BC Cancer Agency;Alan Turing Institute;University of Birmingham", "aff_unique_dep": "Department of Statistics;Department of Statistics;;;Institute of Cancer and Genomic Sciences", "aff_unique_url": "https://www.ox.ac.uk;https://www.ubc.ca;https://www.bccancer.ca;https://www.turing.ac.uk;https://www.birmingham.ac.uk", "aff_unique_abbr": "Oxford;UBC;;ATI;UoB", "aff_campus_unique_index": "0;;2", "aff_campus_unique": "Oxford;;Birmingham", "aff_country_unique_index": "0;1+1+1;0+0", "aff_country_unique": "United Kingdom;Canada" }, { "title": "Deep Compressed Sensing", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4326", "id": "4326", "author_site": "Yan Wu, Mihaela Rosca, Timothy Lillicrap", "author": "Yan Wu; Mihaela Rosca; Timothy Lillicrap", "abstract": "Compressed sensing (CS) provides an elegant framework for recovering sparse signals from compressed measurements. For example, CS can exploit the structure of natural images and recover an image from only a few random measurements. Unlike popular autoencoding models, reconstruction in CS is posed as an optimisation problem that is separate from sensing. CS is flexible and data efficient, but its application has been restricted by the strong assumption of sparsity and costly reconstruction process. A recent approach that combines CS with neural network generators has removed the constraint of sparsity, but reconstruction remains slow. Here we propose a novel framework that significantly improves both the performance and speed of signal recovery by jointly training a generator and the optimisation process for reconstruction via meta-learning. 
We explore training the measurements with different objectives, and derive a family of models based on minimising measurement errors. We show that Generative Adversarial Nets (GANs) can be viewed as a special case in this family of models. Borrowing insights from the CS perspective, we develop a novel way of improving GANs using gradient information from the discriminator.", "bibtex": "@InProceedings{pmlr-v97-wu19d,\n title = \t {Deep Compressed Sensing},\n author = {Wu, Yan and Rosca, Mihaela and Lillicrap, Timothy},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6850--6860},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wu19d/wu19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/wu19d.html},\n abstract = \t {Compressed sensing (CS) provides an elegant framework for recovering sparse signals from compressed measurements. For example, CS can exploit the structure of natural images and recover an image from only a few random measurements. Unlike popular autoencoding models, reconstruction in CS is posed as an optimisation problem that is separate from sensing. CS is flexible and data efficient, but its application has been restricted by the strong assumption of sparsity and costly reconstruction process. A recent approach that combines CS with neural network generators has removed the constraint of sparsity, but reconstruction remains slow. Here we propose a novel framework that significantly improves both the performance and speed of signal recovery by jointly training a generator and the optimisation process for reconstruction via meta-learning. We explore training the measurements with different objectives, and derive a family of models based on minimising measurement errors. We show that Generative Adversarial Nets (GANs) can be viewed as a special case in this family of models. Borrowing insights from the CS perspective, we develop a novel way of improving GANs using gradient information from the discriminator.}\n}", "pdf": "http://proceedings.mlr.press/v97/wu19d/wu19d.pdf", "supp": "", "pdf_size": 668451, "gs_citation": 218, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9065053746586625339&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK", "aff_domain": "google.com; ; ", "email": "google.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/wu19d.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Deep Counterfactual Regret Minimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3751", "id": "3751", "author_site": "Noam Brown, Adam Lerer, Sam Gross, Tuomas Sandholm", "author": "Noam Brown; Adam Lerer; Sam Gross; Tuomas Sandholm", "abstract": "Counterfactual Regret Minimization (CFR) is the leading algorithm for solving large imperfect-information games. It converges to an equilibrium by iteratively traversing the game tree. 
In order to deal with extremely large games, abstraction is typically applied before running CFR. The abstracted game is solved with tabular CFR, and its solution is mapped back to the full game. This process can be problematic because aspects of abstraction are often manual and domain specific, abstraction algorithms may miss important strategic nuances of the game, and there is a chicken-and-egg problem because determining a good abstraction requires knowledge of the equilibrium of the game. This paper introduces", "bibtex": "@InProceedings{pmlr-v97-brown19b,\n title = \t {Deep Counterfactual Regret Minimization},\n author = {Brown, Noam and Lerer, Adam and Gross, Sam and Sandholm, Tuomas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {793--802},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/brown19b/brown19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/brown19b.html},\n abstract = \t {Counterfactual Regret Minimization (CFR) is the leading algorithm for solving large imperfect-information games. It converges to an equilibrium by iteratively traversing the game tree. In order to deal with extremely large games, abstraction is typically applied before running CFR. The abstracted game is solved with tabular CFR, and its solution is mapped back to the full game. This process can be problematic because aspects of abstraction are often manual and domain specific, abstraction algorithms may miss important strategic nuances of the game, and there is a chicken-and-egg problem because determining a good abstraction requires knowledge of the equilibrium of the game. This paper introduces", "pdf": "http://proceedings.mlr.press/v97/brown19b/brown19b.pdf", "supp": "", "pdf_size": 3387988, "gs_citation": 302, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16868985428264019603&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Facebook AI Research + Computer Science Department, Carnegie Mellon University; Facebook AI Research; Facebook AI Research; Computer Science Department, Carnegie Mellon University + Strategic Machine Inc., Strategy Robot Inc., and Optimized Markets Inc.", "aff_domain": "cs.cmu.edu; ; ; ", "email": "cs.cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/brown19b.html", "aff_unique_index": "0+1;0;0;1+2", "aff_unique_norm": "Meta;Carnegie Mellon University;Strategic Machine Inc.", "aff_unique_dep": "Facebook AI Research;Computer Science Department;", "aff_unique_url": "https://research.facebook.com;https://www.cmu.edu;", "aff_unique_abbr": "FAIR;CMU;", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Deep Factors for Forecasting", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3881", "id": "3881", "author_site": "Yuyang Wang, Alex Smola, Danielle Robinson, Jan Gasthaus, Dean Foster, Tim Januschowski", "author": "Yuyang Wang; Alex Smola; Danielle Maddix; Jan Gasthaus; Dean Foster; Tim Januschowski", "abstract": "Producing probabilistic forecasts for large collections of similar and/or dependent time series is a practically highly relevant, yet challenging task. 
Classical time series models fail to capture complex patterns in the data and multivariate techniques struggle to scale to large problem sizes, but their reliance on strong structural assumptions makes them data-efficient and allows them to provide estimates of uncertainty. The converse is true for models based on deep neural networks, which can learn complex patterns and dependencies given enough data. In this paper, we propose a hybrid model that incorporates the benefits of both approaches. Our new method is data-driven and scalable via a latent, global, deep component. It also handles uncertainty through a local classical model. We provide both theoretical and empirical evidence for the soundness of our approach through a necessary and sufficient decomposition of exchangeable time series into a global and a local part and extensive experiments. Our experiments demonstrate the advantages of our model both in term of data efficiency and computational complexity.", "bibtex": "@InProceedings{pmlr-v97-wang19k,\n title = \t {Deep Factors for Forecasting},\n author = {Wang, Yuyang and Smola, Alex and Maddix, Danielle and Gasthaus, Jan and Foster, Dean and Januschowski, Tim},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6607--6617},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19k/wang19k.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19k.html},\n abstract = \t {Producing probabilistic forecasts for large collections of similar and/or dependent time series is a practically highly relevant, yet challenging task. Classical time series models fail to capture complex patterns in the data and multivariate techniques struggle to scale to large problem sizes, but their reliance on strong structural assumptions makes them data-efficient and allows them to provide estimates of uncertainty. The converse is true for models based on deep neural networks, which can learn complex patterns and dependencies given enough data. In this paper, we propose a hybrid model that incorporates the benefits of both approaches. Our new method is data-driven and scalable via a latent, global, deep component. It also handles uncertainty through a local classical model. We provide both theoretical and empirical evidence for the soundness of our approach through a necessary and sufficient decomposition of exchangeable time series into a global and a local part and extensive experiments. 
Our experiments demonstrate the advantages of our model both in term of data efficiency and computational complexity.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19k/wang19k.pdf", "supp": "", "pdf_size": 2518928, "gs_citation": 247, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14148859762726923676&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Amazon Research; Amazon Research; Amazon Research; Amazon Research; Amazon Research; Amazon Research", "aff_domain": "amazon.com; ; ; ; ; ", "email": "amazon.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/wang19k.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Amazon", "aff_unique_dep": "Amazon Research", "aff_unique_url": "https://www.amazon.science", "aff_unique_abbr": "Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Deep Gaussian Processes with Importance-Weighted Variational Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4027", "id": "4027", "author_site": "Hugh Salimbeni, Vincent Dutordoir, James Hensman, Marc P Deisenroth", "author": "Hugh Salimbeni; Vincent Dutordoir; James Hensman; Marc Deisenroth", "abstract": "Deep Gaussian processes (DGPs) can model complex marginal densities as well as complex mappings. Non-Gaussian marginals are essential for modelling real-world data, and can be generated from the DGP by incorporating uncorrelated variables to the model. Previous work in the DGP model has introduced noise additively, and used variational inference with a combination of sparse Gaussian processes and mean-field Gaussians for the approximate posterior. Additive noise attenuates the signal, and the Gaussian form of variational distribution may lead to an inaccurate posterior. We instead incorporate noisy variables as latent covariates, and propose a novel importance-weighted objective, which leverages analytic results and provides a mechanism to trade off computation for improved accuracy. Our results demonstrate that the importance-weighted objective works well in practice and consistently outperforms classical variational inference, especially for deeper models.", "bibtex": "@InProceedings{pmlr-v97-salimbeni19a,\n title = \t {Deep {G}aussian Processes with Importance-Weighted Variational Inference},\n author = {Salimbeni, Hugh and Dutordoir, Vincent and Hensman, James and Deisenroth, Marc},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5589--5598},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/salimbeni19a/salimbeni19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/salimbeni19a.html},\n abstract = \t {Deep Gaussian processes (DGPs) can model complex marginal densities as well as complex mappings. Non-Gaussian marginals are essential for modelling real-world data, and can be generated from the DGP by incorporating uncorrelated variables to the model. Previous work in the DGP model has introduced noise additively, and used variational inference with a combination of sparse Gaussian processes and mean-field Gaussians for the approximate posterior. 
Additive noise attenuates the signal, and the Gaussian form of variational distribution may lead to an inaccurate posterior. We instead incorporate noisy variables as latent covariates, and propose a novel importance-weighted objective, which leverages analytic results and provides a mechanism to trade off computation for improved accuracy. Our results demonstrate that the importance-weighted objective works well in practice and consistently outperforms classical variational inference, especially for deeper models.}\n}", "pdf": "http://proceedings.mlr.press/v97/salimbeni19a/salimbeni19a.pdf", "supp": "", "pdf_size": 801804, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17591045211502754804&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Imperial College London+PROWLER.io; PROWLER.io; PROWLER.io; Imperial College London+PROWLER.io", "aff_domain": "ic.ac.uk; ; ; ", "email": "ic.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/salimbeni19a.html", "aff_unique_index": "0+1;1;1;0+1", "aff_unique_norm": "Imperial College London;PROWLER.io", "aff_unique_dep": ";", "aff_unique_url": "https://www.imperial.ac.uk;https://prowler.io", "aff_unique_abbr": "ICL;PROWLER.io", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "United Kingdom" }, { "title": "Deep Generative Learning via Variational Gradient Flow", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3826", "id": "3826", "author_site": "Yuan Gao, Yuling Jiao, Yang Wang, Yao Wang, Can Yang, Shunkang Zhang", "author": "Yuan Gao; Yuling Jiao; Yang Wang; Yao Wang; Can Yang; Shunkang Zhang", "abstract": "We propose a framework to learn deep generative models via \\textbf{V}ariational \\textbf{Gr}adient Fl\\textbf{ow} (VGrow) on probability spaces. The evolving distribution that asymptotically converges to the target distribution is governed by a vector field, which is the negative gradient of the first variation of the $f$-divergence between them. We prove that the evolving distribution coincides with the pushforward distribution through the infinitesimal time composition of residual maps that are perturbations of the identity map along the vector field. The vector field depends on the density ratio of the pushforward distribution and the target distribution, which can be consistently learned from a binary classification problem. Connections of our proposed VGrow method with other popular methods, such as VAE, GAN and flow-based methods, have been established in this framework, gaining new insights of deep generative learning. We also evaluated several commonly used divergences, including Kullback-Leibler, Jensen-Shannon, Jeffreys divergences as well as our newly discovered \u201clogD\u201d divergence which serves as the objective function of the logD-trick GAN. 
Experimental results on benchmark datasets demonstrate that VGrow can generate high-fidelity images in a stable and efficient manner, achieving competitive performance with state-of-the-art GANs.", "bibtex": "@InProceedings{pmlr-v97-gao19b,\n title = \t {Deep Generative Learning via Variational Gradient Flow},\n author = {Gao, Yuan and Jiao, Yuling and Wang, Yang and Wang, Yao and Yang, Can and Zhang, Shunkang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2093--2101},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gao19b/gao19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/gao19b.html},\n abstract = \t {We propose a framework to learn deep generative models via \\textbf{V}ariational \\textbf{Gr}adient Fl\\textbf{ow} (VGrow) on probability spaces. The evolving distribution that asymptotically converges to the target distribution is governed by a vector field, which is the negative gradient of the first variation of the $f$-divergence between them. We prove that the evolving distribution coincides with the pushforward distribution through the infinitesimal time composition of residual maps that are perturbations of the identity map along the vector field. The vector field depends on the density ratio of the pushforward distribution and the target distribution, which can be consistently learned from a binary classification problem. Connections of our proposed VGrow method with other popular methods, such as VAE, GAN and flow-based methods, have been established in this framework, gaining new insights of deep generative learning. We also evaluated several commonly used divergences, including Kullback-Leibler, Jensen-Shannon, Jeffreys divergences as well as our newly discovered \u201clogD\u201d divergence which serves as the objective function of the logD-trick GAN. Experimental results on benchmark datasets demonstrate that VGrow can generate high-fidelity images in a stable and efficient manner, achieving competitive performance with state-of-the-art GANs.}\n}", "pdf": "http://proceedings.mlr.press/v97/gao19b/gao19b.pdf", "supp": "", "pdf_size": 1924848, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13167225334345346820&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/gao19b.html" }, { "title": "Deep Residual Output Layers for Neural Language Generation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4145", "id": "4145", "author_site": "Nikolaos Pappas, James Henderson", "author": "Nikolaos Pappas; James Henderson", "abstract": "Many tasks, including language generation, benefit from learning the structure of the output space, particularly when the space of output labels is large and the data is sparse. State-of-the-art neural language models indirectly capture the output space structure in their classifier weights since they lack parameter sharing across output labels. Learning shared output label mappings helps, but existing methods have limited expressivity and are prone to overfitting. 
In this paper, we investigate the usefulness of more powerful shared mappings for output labels, and propose a deep residual output mapping with dropout between layers to better capture the structure of the output space and avoid overfitting. Evaluations on three language generation tasks show that our output label mapping can match or improve state-of-the-art recurrent and self-attention architectures, and suggest that the classifier does not necessarily need to be high-rank to better model natural language if it is better at capturing the structure of the output space.", "bibtex": "@InProceedings{pmlr-v97-pappas19a,\n title = \t {Deep Residual Output Layers for Neural Language Generation},\n author = {Pappas, Nikolaos and Henderson, James},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5000--5011},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/pappas19a/pappas19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/pappas19a.html},\n abstract = \t {Many tasks, including language generation, benefit from learning the structure of the output space, particularly when the space of output labels is large and the data is sparse. State-of-the-art neural language models indirectly capture the output space structure in their classifier weights since they lack parameter sharing across output labels. Learning shared output label mappings helps, but existing methods have limited expressivity and are prone to overfitting. In this paper, we investigate the usefulness of more powerful shared mappings for output labels, and propose a deep residual output mapping with dropout between layers to better capture the structure of the output space and avoid overfitting. Evaluations on three language generation tasks show that our output label mapping can match or improve state-of-the-art recurrent and self-attention architectures, and suggest that the classifier does not necessarily need to be high-rank to better model natural language if it is better at capturing the structure of the output space.}\n}", "pdf": "http://proceedings.mlr.press/v97/pappas19a/pappas19a.pdf", "supp": "", "pdf_size": 389982, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6336276005436023906&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16, "aff": ";", "aff_domain": ";", "email": ";", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/pappas19a.html" }, { "title": "DeepMDP: Learning Continuous Latent Space Models for Representation Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4097", "id": "4097", "author_site": "Carles Gelada, Saurabh Kumar, Jacob Buckman, Ofir Nachum, Marc Bellemare", "author": "Carles Gelada; Saurabh Kumar; Jacob Buckman; Ofir Nachum; Marc G. Bellemare", "abstract": "Many reinforcement learning (RL) tasks provide the agent with high-dimensional observations that can be simplified into low-dimensional continuous states. To formalize this process, we introduce the concept of a \\texit{DeepMDP}, a parameterized latent space model that is trained via the minimization of two tractable latent space losses: prediction of rewards and prediction of the distribution over next latent states. 
We show that the optimization of these objectives guarantees (1) the quality of the embedding function as a representation of the state space and (2) the quality of the DeepMDP as a model of the environment. Our theoretical findings are substantiated by the experimental result that a trained DeepMDP recovers the latent structure underlying high-dimensional observations on a synthetic environment. Finally, we show that learning a DeepMDP as an auxiliary task in the Atari 2600 domain leads to large performance improvements over model-free RL.", "bibtex": "@InProceedings{pmlr-v97-gelada19a,\n title = \t {{D}eep{MDP}: Learning Continuous Latent Space Models for Representation Learning},\n author = {Gelada, Carles and Kumar, Saurabh and Buckman, Jacob and Nachum, Ofir and Bellemare, Marc G.},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2170--2179},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gelada19a/gelada19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gelada19a.html},\n abstract = \t {Many reinforcement learning (RL) tasks provide the agent with high-dimensional observations that can be simplified into low-dimensional continuous states. To formalize this process, we introduce the concept of a \\texit{DeepMDP}, a parameterized latent space model that is trained via the minimization of two tractable latent space losses: prediction of rewards and prediction of the distribution over next latent states. We show that the optimization of these objectives guarantees (1) the quality of the embedding function as a representation of the state space and (2) the quality of the DeepMDP as a model of the environment. Our theoretical findings are substantiated by the experimental result that a trained DeepMDP recovers the latent structure underlying high-dimensional observations on a synthetic environment. 
Finally, we show that learning a DeepMDP as an auxiliary task in the Atari 2600 domain leads to large performance improvements over model-free RL.}\n}", "pdf": "http://proceedings.mlr.press/v97/gelada19a/gelada19a.pdf", "supp": "", "pdf_size": 1269731, "gs_citation": 378, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10221817652681130665&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Google Brain; Google Brain; Center for Language and Speech Processing, Johns Hopkins University; Google Brain; Google Brain", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/gelada19a.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Google;Johns Hopkins University", "aff_unique_dep": "Google Brain;Center for Language and Speech Processing", "aff_unique_url": "https://brain.google.com;https://www.jhu.edu", "aff_unique_abbr": "Google Brain;JHU", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "DeepNose: Using artificial neural networks to represent the space of odorants", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4175", "id": "4175", "author_site": "Ngoc Tran, Daniel Kepple, Sergey Shuvaev, Alexei Koulakov", "author": "Ngoc Tran; Daniel Kepple; Sergey Shuvaev; Alexei Koulakov", "abstract": "The olfactory system employs an ensemble of odorant receptors (ORs) to sense odorants and to derive olfactory percepts. We trained artificial neural networks to represent the chemical space of odorants and used this representation to predict human olfactory percepts. We hypothesized that ORs may be considered 3D convolutional filters that extract molecular features and, as such, can be trained using machine learning methods. First, we trained a convolutional autoencoder, called DeepNose, to deduce a low-dimensional representation of odorant molecules which were represented by their 3D spatial structure. Next, we tested the ability of DeepNose features in predicting physical properties and odorant percepts based on 3D molecular structure alone. We found that, despite the lack of human expertise, DeepNose features often outperformed molecular descriptors used in computational chemistry in predicting both physical properties and human perceptions. We propose that DeepNose network can extract", "bibtex": "@InProceedings{pmlr-v97-tran19b,\n title = \t {{D}eep{N}ose: Using artificial neural networks to represent the space of odorants},\n author = {Tran, Ngoc and Kepple, Daniel and Shuvaev, Sergey and Koulakov, Alexei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6305--6314},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tran19b/tran19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/tran19b.html},\n abstract = \t {The olfactory system employs an ensemble of odorant receptors (ORs) to sense odorants and to derive olfactory percepts. We trained artificial neural networks to represent the chemical space of odorants and used this representation to predict human olfactory percepts. 
We hypothesized that ORs may be considered 3D convolutional filters that extract molecular features and, as such, can be trained using machine learning methods. First, we trained a convolutional autoencoder, called DeepNose, to deduce a low-dimensional representation of odorant molecules which were represented by their 3D spatial structure. Next, we tested the ability of DeepNose features in predicting physical properties and odorant percepts based on 3D molecular structure alone. We found that, despite the lack of human expertise, DeepNose features often outperformed molecular descriptors used in computational chemistry in predicting both physical properties and human perceptions. We propose that DeepNose network can extract", "pdf": "http://proceedings.mlr.press/v97/tran19b/tran19b.pdf", "supp": "", "pdf_size": 2715553, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9471804803194022012&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "aff": "Cold Spring Harbor Laboratory; Cold Spring Harbor Laboratory; Cold Spring Harbor Laboratory; Cold Spring Harbor Laboratory", "aff_domain": "cshl.edu; ; ; ", "email": "cshl.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/tran19b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Cold Spring Harbor Laboratory", "aff_unique_dep": "", "aff_unique_url": "https://www.cshl.edu", "aff_unique_abbr": "CSHL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Defending Against Saddle Point Attack in Byzantine-Robust Distributed Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3660", "id": "3660", "author_site": "Dong Yin, Yudong Chen, Kannan Ramchandran, Peter Bartlett", "author": "Dong Yin; Yudong Chen; Ramchandran Kannan; Peter Bartlett", "abstract": "We study robust distributed learning that involves minimizing a non-convex loss function with saddle points. We consider the Byzantine setting where some worker machines have abnormal or even arbitrary and adversarial behavior, and in this setting, the Byzantine machines may create fake local minima near a saddle point that is far away from any true local minimum, even when robust gradient estimators are used. We develop ByzantinePGD, a robust first-order algorithm that can provably escape saddle points and fake local minima, and converge to an approximate true local minimizer with low iteration complexity. As a by-product, we give a simpler algorithm and analysis for escaping saddle points in the usual non-Byzantine setting. We further discuss three robust gradient estimators that can be used in ByzantinePGD, including median, trimmed mean, and iterative filtering. 
We characterize their performance in concrete statistical settings, and argue for their near-optimality in low and high dimensional regimes.", "bibtex": "@InProceedings{pmlr-v97-yin19a,\n title = \t {Defending Against Saddle Point Attack in {B}yzantine-Robust Distributed Learning},\n author = {Yin, Dong and Chen, Yudong and Kannan, Ramchandran and Bartlett, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7074--7084},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yin19a/yin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yin19a.html},\n abstract = \t {We study robust distributed learning that involves minimizing a non-convex loss function with saddle points. We consider the Byzantine setting where some worker machines have abnormal or even arbitrary and adversarial behavior, and in this setting, the Byzantine machines may create fake local minima near a saddle point that is far away from any true local minimum, even when robust gradient estimators are used. We develop ByzantinePGD, a robust first-order algorithm that can provably escape saddle points and fake local minima, and converge to an approximate true local minimizer with low iteration complexity. As a by-product, we give a simpler algorithm and analysis for escaping saddle points in the usual non-Byzantine setting. We further discuss three robust gradient estimators that can be used in ByzantinePGD, including median, trimmed mean, and iterative filtering. We characterize their performance in concrete statistical settings, and argue for their near-optimality in low and high dimensional regimes.}\n}", "pdf": "http://proceedings.mlr.press/v97/yin19a/yin19a.pdf", "supp": "", "pdf_size": 376194, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7848042066315912985&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/yin19a.html" }, { "title": "Demystifying Dropout", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4061", "id": "4061", "author_site": "Hongchang Gao, Jian Pei, Heng Huang", "author": "Hongchang Gao; Jian Pei; Heng Huang", "abstract": "Dropout is a popular technique to train large-scale deep neural networks to alleviate the overfitting problem. To disclose the underlying reasons for its gain, numerous works have tried to explain it from different perspectives. In this paper, unlike existing works, we explore it from a new perspective to provide new insight into this line of research. In detail, we disentangle the forward and backward pass of dropout. Then, we find that these two passes need different levels of noise to improve the generalization performance of deep neural networks. Based on this observation, we propose the augmented dropout which employs different dropping strategies in the forward and backward pass. 
Experimental results have verified the effectiveness of our proposed method.", "bibtex": "@InProceedings{pmlr-v97-gao19d,\n title = \t {Demystifying Dropout},\n author = {Gao, Hongchang and Pei, Jian and Huang, Heng},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2112--2121},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gao19d/gao19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/gao19d.html},\n abstract = \t {Dropout is a popular technique to train large-scale deep neural networks to alleviate the overfitting problem. To disclose the underlying reasons for its gain, numerous works have tried to explain it from different perspectives. In this paper, unlike existing works, we explore it from a new perspective to provide new insight into this line of research. In detail, we disentangle the forward and backward pass of dropout. Then, we find that these two passes need different levels of noise to improve the generalization performance of deep neural networks. Based on this observation, we propose the augmented dropout which employs different dropping strategies in the forward and backward pass. Experimental results have verified the effectiveness of our proposed method.}\n}", "pdf": "http://proceedings.mlr.press/v97/gao19d/gao19d.pdf", "supp": "", "pdf_size": 828131, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18444234517097533896&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "JD Finance America Corporation+Department of Electrical and Computer Engineering, University of Pittsburgh, Pittsburgh, USA; JD.com+School of Computing Science, Simon Fraser University, Canada; JD Finance America Corporation+Department of Electrical and Computer Engineering, University of Pittsburgh, Pittsburgh, USA", "aff_domain": "pitt.edu; ;pitt.edu", "email": "pitt.edu; ;pitt.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/gao19d.html", "aff_unique_index": "0+1;2+3;0+1", "aff_unique_norm": "JD;University of Pittsburgh;JD.com;Simon Fraser University", "aff_unique_dep": "JD Finance America Corporation;Department of Electrical and Computer Engineering;;School of Computing Science", "aff_unique_url": ";https://www.pitt.edu;https://www.jd.com;https://www.sfu.ca", "aff_unique_abbr": ";Pitt;JD;SFU", "aff_campus_unique_index": "1;;1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0+0;1+2;0+0", "aff_country_unique": "United States;China;Canada" }, { "title": "Detecting Overlapping and Correlated Communities without Pure Nodes: Identifiability and Algorithm", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3926", "id": "3926", "author_site": "Kejun Huang, Xiao Fu", "author": "Kejun Huang; Xiao Fu", "abstract": "Many machine learning problems come in the form of networks with relational data between entities, and one of the key unsupervised learning tasks is to detect communities in such a network. We adopt the mixed-membership stochastic blockmodel as the underlying probabilistic model, and give conditions under which the memberships of a subset of nodes can be uniquely identified. 
Our method starts by constructing a second-order graph moment, which can be shown to converge to a specific product of the true parameters as the size of the network increases. To correctly recover the true membership parameters, we formulate an optimization problem using insights from convex geometry. We show that if the true memberships satisfy a so-called sufficiently scattered condition, then solving the proposed problem correctly identifies the ground truth. We also propose an efficient algorithm for detecting communities, which is significantly faster than prior work and with better convergence properties. Experiments on synthetic and real data justify the validity of the proposed learning framework for network data.", "bibtex": "@InProceedings{pmlr-v97-huang19c,\n title = \t {Detecting Overlapping and Correlated Communities without Pure Nodes: Identifiability and Algorithm},\n author = {Huang, Kejun and Fu, Xiao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2859--2868},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/huang19c/huang19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/huang19c.html},\n abstract = \t {Many machine learning problems come in the form of networks with relational data between entities, and one of the key unsupervised learning tasks is to detect communities in such a network. We adopt the mixed-membership stochastic blockmodel as the underlying probabilistic model, and give conditions under which the memberships of a subset of nodes can be uniquely identified. Our method starts by constructing a second-order graph moment, which can be shown to converge to a specific product of the true parameters as the size of the network increases. To correctly recover the true membership parameters, we formulate an optimization problem using insights from convex geometry. We show that if the true memberships satisfy a so-called sufficiently scattered condition, then solving the proposed problem correctly identifies the ground truth. We also propose an efficient algorithm for detecting communities, which is significantly faster than prior work and with better convergence properties. 
Experiments on synthetic and real data justify the validity of the proposed learning framework for network data.}\n}", "pdf": "http://proceedings.mlr.press/v97/huang19c/huang19c.pdf", "supp": "", "pdf_size": 197640, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7273078237805313490&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer and Information Science and Engineering, University of Florida, Gainesville, FL, USA; School of Electrical Engineering and Computer Science, Oregon State University, Corvallis, OR, USA", "aff_domain": "ufl.edu;oregonstate.edu", "email": "ufl.edu;oregonstate.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/huang19c.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of Florida;Oregon State University", "aff_unique_dep": "Department of Computer and Information Science and Engineering;School of Electrical Engineering and Computer Science", "aff_unique_url": "https://www.ufl.edu;https://osu.edu", "aff_unique_abbr": "UF;OSU", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Gainesville;Corvallis", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Diagnosing Bottlenecks in Deep Q-learning Algorithms", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4253", "id": "4253", "author_site": "Justin Fu, Aviral Kumar, Matthew Soh, Sergey Levine", "author": "Justin Fu; Aviral Kumar; Matthew Soh; Sergey Levine", "abstract": "Q-learning methods are a common class of algorithms used in reinforcement learning (RL). However, their behavior with function approximation, especially with neural networks, is poorly understood theoretically and empirically. In this work, we aim to experimentally investigate potential issues in Q-learning, by means of a \"unit testing\" framework where we can utilize oracles to disentangle sources of error. Specifically, we investigate questions related to function approximation, sampling error and nonstationarity, and where available, verify if trends found in oracle settings hold true with deep RL methods. We find that large neural network architectures have many benefits with regards to learning stability; offer several practical compensations for overfitting; and develop a novel sampling method based on explicitly compensating for function approximation error that yields fair improvement on high-dimensional continuous control domains.", "bibtex": "@InProceedings{pmlr-v97-fu19a,\n title = \t {Diagnosing Bottlenecks in Deep Q-learning Algorithms},\n author = {Fu, Justin and Kumar, Aviral and Soh, Matthew and Levine, Sergey},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2021--2030},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/fu19a/fu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/fu19a.html},\n abstract = \t {Q-learning methods are a common class of algorithms used in reinforcement learning (RL). However, their behavior with function approximation, especially with neural networks, is poorly understood theoretically and empirically. 
In this work, we aim to experimentally investigate potential issues in Q-learning, by means of a \"unit testing\" framework where we can utilize oracles to disentangle sources of error. Specifically, we investigate questions related to function approximation, sampling error and nonstationarity, and where available, verify if trends found in oracle settings hold true with deep RL methods. We find that large neural network architectures have many benefits with regards to learning stability; offer several practical compensations for overfitting; and develop a novel sampling method based on explicitly compensating for function approximation error that yields fair improvement on high-dimensional continuous control domains.}\n}", "pdf": "http://proceedings.mlr.press/v97/fu19a/fu19a.pdf", "supp": "", "pdf_size": 2270228, "gs_citation": 180, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5346609515080581577&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "UC Berkeley; UC Berkeley; UC Berkeley; UC Berkeley", "aff_domain": "eecs.berkeley.edu;berkeley.edu; ; ", "email": "eecs.berkeley.edu;berkeley.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/fu19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Differentiable Dynamic Normalization for Learning Deep Representation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3623", "id": "3623", "author_site": "Ping Luo, Peng Zhanglin, Shao Wenqi, Zhang ruimao, Ren jiamin, Wu lingyun", "author": "Ping Luo; Peng Zhanglin; Shao Wenqi; Zhang Ruimao; Ren Jiamin; Wu Lingyun", "abstract": "This work presents Dynamic Normalization (DN), which is able to learn arbitrary normalization operations for different convolutional layers in a deep ConvNet. Unlike existing normalization approaches that predefined computations of the statistics (mean and variance), DN learns to estimate them. DN has several appealing benefits. First, it adapts to various networks, tasks, and batch sizes. Second, it can be easily implemented and trained in a differentiable end-to-end manner with merely small number of parameters. Third, its matrix formulation represents a wide range of normalization methods, shedding light on analyzing them theoretically. 
Extensive studies show that DN outperforms its counterparts in CIFAR10 and ImageNet.", "bibtex": "@InProceedings{pmlr-v97-luo19a,\n title = \t {Differentiable Dynamic Normalization for Learning Deep Representation},\n author = {Luo, Ping and Zhanglin, Peng and Wenqi, Shao and Ruimao, Zhang and Jiamin, Ren and Lingyun, Wu},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4203--4211},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/luo19a/luo19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/luo19a.html},\n abstract = \t {This work presents Dynamic Normalization (DN), which is able to learn arbitrary normalization operations for different convolutional layers in a deep ConvNet. Unlike existing normalization approaches that predefined computations of the statistics (mean and variance), DN learns to estimate them. DN has several appealing benefits. First, it adapts to various networks, tasks, and batch sizes. Second, it can be easily implemented and trained in a differentiable end-to-end manner with merely small number of parameters. Third, its matrix formulation represents a wide range of normalization methods, shedding light on analyzing them theoretically. Extensive studies show that DN outperforms its counterparts in CIFAR10 and ImageNet.}\n}", "pdf": "http://proceedings.mlr.press/v97/luo19a/luo19a.pdf", "supp": "", "pdf_size": 6240588, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10660999193320177264&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, The University of Hong Kong+Department of Electronic Engineering, The Chinese University of Hong Kong+SenseTime Group Ltd.; Department of Electronic Engineering, The Chinese University of Hong Kong+SenseTime Group Ltd.; Department of Electronic Engineering, The Chinese University of Hong Kong+SenseTime Group Ltd.; Department of Electronic Engineering, The Chinese University of Hong Kong+SenseTime Group Ltd.; SenseTime Group Ltd.; SenseTime Group Ltd.", "aff_domain": "gmail.com; ; ; ; ; ", "email": "gmail.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/luo19a.html", "aff_unique_index": "0+1+2;1+2;1+2;1+2;2;2", "aff_unique_norm": "University of Hong Kong;Chinese University of Hong Kong;SenseTime Group", "aff_unique_dep": "Department of Computer Science;Department of Electronic Engineering;", "aff_unique_url": "https://www.hku.hk;https://www.cuhk.edu.hk;https://www.sensetime.com", "aff_unique_abbr": "HKU;CUHK;SenseTime", "aff_campus_unique_index": "0+0;0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0+0;0+0;0+0;0+0;0;0", "aff_country_unique": "China" }, { "title": "Differentiable Linearized ADMM", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3629", "id": "3629", "author_site": "Xingyu Xie, Jianlong Wu, Guangcan Liu, Zhisheng Zhong, Zhouchen Lin", "author": "Xingyu Xie; Jianlong Wu; Guangcan Liu; Zhisheng Zhong; Zhouchen Lin", "abstract": "Recently, a number of learning-based optimization methods that combine data-driven architectures with the classical optimization algorithms have been proposed and explored, showing superior empirical performance in solving various ill-posed inverse 
problems, but there is still a scarcity of rigorous analysis about the convergence behaviors of learning-based optimization. In particular, most existing analyses are specific to unconstrained problems but cannot apply to the more general cases where some variables of interest are subject to certain constraints. In this paper, we propose Differentiable Linearized ADMM (D-LADMM) for solving the problems with linear constraints. Specifically, D-LADMM is a K-layer LADMM inspired deep neural network, which is obtained by firstly introducing some learnable weights in the classical Linearized ADMM algorithm and then generalizing the proximal operator to some learnable activation function. Notably, we rigorously prove that there exist a set of learnable parameters for D-LADMM to generate globally converged solutions, and we show that those desired parameters can be attained by training D-LADMM in a proper way. To the best of our knowledge, we are the first to provide the convergence analysis for the learning-based optimization method on constrained problems.", "bibtex": "@InProceedings{pmlr-v97-xie19c,\n title = \t {Differentiable Linearized {ADMM}},\n author = {Xie, Xingyu and Wu, Jianlong and Liu, Guangcan and Zhong, Zhisheng and Lin, Zhouchen},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6902--6911},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/xie19c/xie19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/xie19c.html},\n abstract = \t {Recently, a number of learning-based optimization methods that combine data-driven architectures with the classical optimization algorithms have been proposed and explored, showing superior empirical performance in solving various ill-posed inverse problems, but there is still a scarcity of rigorous analysis about the convergence behaviors of learning-based optimization. In particular, most existing analyses are specific to unconstrained problems but cannot apply to the more general cases where some variables of interest are subject to certain constraints. In this paper, we propose Differentiable Linearized ADMM (D-LADMM) for solving the problems with linear constraints. Specifically, D-LADMM is a K-layer LADMM inspired deep neural network, which is obtained by firstly introducing some learnable weights in the classical Linearized ADMM algorithm and then generalizing the proximal operator to some learnable activation function. Notably, we rigorously prove that there exist a set of learnable parameters for D-LADMM to generate globally converged solutions, and we show that those desired parameters can be attained by training D-LADMM in a proper way. To the best of our knowledge, we are the first to provide the convergence analysis for the learning-based optimization method on constrained problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/xie19c/xie19c.pdf", "supp": "", "pdf_size": 908429, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7429496083508800871&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Key Lab. of Machine Perception, School of EECS, Peking University; Key Lab. of Machine Perception, School of EECS, Peking University; Key Lab. 
of Machine Perception, School of EECS, Peking University; B-DAT and CICAEET, School of Automation, Nanjing University of Information Science and Technology; Key Lab. of Machine Perception, School of EECS, Peking University", "aff_domain": "nuist.edu.cn;pku.edu.cn; ; ; ", "email": "nuist.edu.cn;pku.edu.cn; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/xie19c.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Peking University;Nanjing University of Information Science and Technology", "aff_unique_dep": "School of EECS;School of Automation", "aff_unique_url": "http://www.pku.edu.cn;http://www.nuist.edu.cn", "aff_unique_abbr": "PKU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Differential Inclusions for Modeling Nonsmooth ADMM Variants: A Continuous Limit Theory", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3903", "id": "3903", "author_site": "Huizhuo Yuan, Yuren Zhou, Chris Junchi Li, Qingyun Sun", "author": "Huizhuo Yuan; Yuren Zhou; Chris Junchi Li; Qingyun Sun", "abstract": "Recently, there has been a great deal of research attention on understanding the convergence behavior of first-order methods. One line of this research focuses on analyzing the convergence behavior of first-order methods using tools from continuous dynamical systems such as ordinary differential equations and differential inclusions. These research results shed lights on better understanding first-order methods from a non-optimization point of view. The alternating direction method of multipliers (ADMM) is a widely used first-order method for solving optimization problems arising from machine learning and statistics, and it is important to investigate its behavior using these new techniques from dynamical systems. Existing works along this line have been mainly focusing on problems with smooth objective functions, which exclude many important applications that are traditionally solved by ADMM variants. In this paper, we analyze some well-known and widely used ADMM variants for nonsmooth optimization problems using tools of differential inclusions. In particular, we analyze the convergence behavior of linearized ADMM, gradient-based ADMM, generalized ADMM and accelerated generalized ADMM for nonsmooth problems and show their connections with dynamical systems. We anticipate that these results will provide new insights on understanding ADMM for solving nonsmooth problems.", "bibtex": "@InProceedings{pmlr-v97-yuan19c,\n title = \t {Differential Inclusions for Modeling Nonsmooth {ADMM} Variants: A Continuous Limit Theory},\n author = {Yuan, Huizhuo and Zhou, Yuren and Li, Chris Junchi and Sun, Qingyun},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7232--7241},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yuan19c/yuan19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/yuan19c.html},\n abstract = \t {Recently, there has been a great deal of research attention on understanding the convergence behavior of first-order methods. 
One line of this research focuses on analyzing the convergence behavior of first-order methods using tools from continuous dynamical systems such as ordinary differential equations and differential inclusions. These research results shed lights on better understanding first-order methods from a non-optimization point of view. The alternating direction method of multipliers (ADMM) is a widely used first-order method for solving optimization problems arising from machine learning and statistics, and it is important to investigate its behavior using these new techniques from dynamical systems. Existing works along this line have been mainly focusing on problems with smooth objective functions, which exclude many important applications that are traditionally solved by ADMM variants. In this paper, we analyze some well-known and widely used ADMM variants for nonsmooth optimization problems using tools of differential inclusions. In particular, we analyze the convergence behavior of linearized ADMM, gradient-based ADMM, generalized ADMM and accelerated generalized ADMM for nonsmooth problems and show their connections with dynamical systems. We anticipate that these results will provide new insights on understanding ADMM for solving nonsmooth problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/yuan19c/yuan19c.pdf", "supp": "", "pdf_size": 606272, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9013456701856051111&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Peking University; Duke University; Tencent AI Lab; Stanford University", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/yuan19c.html", "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Peking University;Duke University;Tencent;Stanford University", "aff_unique_dep": ";;Tencent AI Lab;", "aff_unique_url": "http://www.pku.edu.cn;https://www.duke.edu;https://ai.tencent.com;https://www.stanford.edu", "aff_unique_abbr": "Peking U;Duke;Tencent AI Lab;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Differentially Private Empirical Risk Minimization with Non-convex Loss Functions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3618", "id": "3618", "author_site": "Di Wang, Changyou Chen, Jinhui Xu", "author": "Di Wang; Changyou Chen; Jinhui Xu", "abstract": "We study the problem of Empirical Risk Minimization (ERM) with (smooth) non-convex loss functions under the differential-privacy (DP) model. Existing approaches for this problem mainly adopt gradient norms to measure the error, which in general cannot guarantee the quality of the solution. To address this issue, we first study the expected excess empirical (or population) risk, which was primarily used as the utility to measure the quality for convex loss functions. Specifically, we show that the excess empirical (or population) risk can be upper bounded by $\\tilde{O}(\\frac{d\\log (1/\\delta)}{\\log n\\epsilon^2})$ in the $(\\epsilon, \\delta)$-DP settings, where $n$ is the data size and $d$ is the dimensionality of the space. The $\\frac{1}{\\log n}$ term in the empirical risk bound can be further improved to $\\frac{1}{n^{\\Omega(1)}}$ (when $d$ is a constant) by a highly non-trivial analysis on the time-average error. 
To obtain more efficient solutions, we also consider the connection between achieving differential privacy and finding approximate local minimum. Particularly, we show that when the size $n$ is large enough, there are $(\\epsilon, \\delta)$-DP algorithms which can find an approximate local minimum of the empirical risk with high probability in both the constrained and non-constrained settings. These results indicate that one can escape saddle points privately.", "bibtex": "@InProceedings{pmlr-v97-wang19c,\n title = \t {Differentially Private Empirical Risk Minimization with Non-convex Loss Functions},\n author = {Wang, Di and Chen, Changyou and Xu, Jinhui},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6526--6535},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19c/wang19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19c.html},\n abstract = \t {We study the problem of Empirical Risk Minimization (ERM) with (smooth) non-convex loss functions under the differential-privacy (DP) model. Existing approaches for this problem mainly adopt gradient norms to measure the error, which in general cannot guarantee the quality of the solution. To address this issue, we first study the expected excess empirical (or population) risk, which was primarily used as the utility to measure the quality for convex loss functions. Specifically, we show that the excess empirical (or population) risk can be upper bounded by $\\tilde{O}(\\frac{d\\log (1/\\delta)}{\\log n\\epsilon^2})$ in the $(\\epsilon, \\delta)$-DP settings, where $n$ is the data size and $d$ is the dimensionality of the space. The $\\frac{1}{\\log n}$ term in the empirical risk bound can be further improved to $\\frac{1}{n^{\\Omega(1)}}$ (when $d$ is a constant) by a highly non-trivial analysis on the time-average error. To obtain more efficient solutions, we also consider the connection between achieving differential privacy and finding approximate local minimum. Particularly, we show that when the size $n$ is large enough, there are $(\\epsilon, \\delta)$-DP algorithms which can find an approximate local minimum of the empirical risk with high probability in both the constrained and non-constrained settings. 
These results indicate that one can escape saddle points privately.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19c/wang19c.pdf", "supp": "", "pdf_size": 205909, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2244257715278042972&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science and Engineering, State University of New York at Buffalo, Buffalo, USA; Department of Computer Science and Engineering, State University of New York at Buffalo, Buffalo, USA; Department of Computer Science and Engineering, State University of New York at Buffalo, Buffalo, USA", "aff_domain": "buffalo.edu;buffalo.edu;buffalo.edu", "email": "buffalo.edu;buffalo.edu;buffalo.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/wang19c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "State University of New York at Buffalo", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.buffalo.edu", "aff_unique_abbr": "SUNY Buffalo", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Buffalo", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Differentially Private Fair Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4111", "id": "4111", "author_site": "Matthew Jagielski, Michael Kearns, Jieming Mao, Alina Oprea, Aaron Roth, Saeed Sharifi-Malvajerdi, Jonathan Ullman", "author": "Matthew Jagielski; Michael Kearns; Jieming Mao; Alina Oprea; Aaron Roth; Saeed Sharifi -Malvajerdi; Jonathan Ullman", "abstract": "Motivated by settings in which predictive models may be required to be non-discriminatory with respect to certain attributes (such as race), but even collecting the sensitive attribute may be forbidden or restricted, we initiate the study of fair learning under the constraint of differential privacy. Our first algorithm is a private implementation of the equalized odds post-processing approach of (Hardt et al., 2016). This algorithm is appealingly simple, but must be able to use protected group membership explicitly at test time, which can be viewed as a form of \u201cdisparate treatment\u201d. Our second algorithm is a differentially private version of the oracle-efficient in-processing approach of (Agarwal et al., 2018) which is more complex but need not have access to protected group membership at test time. We identify new tradeoffs between fairness, accuracy, and privacy that emerge only when requiring all three properties, and show that these tradeoffs can be milder if group membership may be used at test time. 
We conclude with a brief experimental evaluation.", "bibtex": "@InProceedings{pmlr-v97-jagielski19a,\n title = \t {Differentially Private Fair Learning},\n author = {Jagielski, Matthew and Kearns, Michael and Mao, Jieming and Oprea, Alina and Roth, Aaron and -Malvajerdi, Saeed Sharifi and Ullman, Jonathan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3000--3008},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jagielski19a/jagielski19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jagielski19a.html},\n abstract = \t {Motivated by settings in which predictive models may be required to be non-discriminatory with respect to certain attributes (such as race), but even collecting the sensitive attribute may be forbidden or restricted, we initiate the study of fair learning under the constraint of differential privacy. Our first algorithm is a private implementation of the equalized odds post-processing approach of (Hardt et al., 2016). This algorithm is appealingly simple, but must be able to use protected group membership explicitly at test time, which can be viewed as a form of \u201cdisparate treatment\u201d. Our second algorithm is a differentially private version of the oracle-efficient in-processing approach of (Agarwal et al., 2018) which is more complex but need not have access to protected group membership at test time. We identify new tradeoffs between fairness, accuracy, and privacy that emerge only when requiring all three properties, and show that these tradeoffs can be milder if group membership may be used at test time. We conclude with a brief experimental evaluation.}\n}", "pdf": "http://proceedings.mlr.press/v97/jagielski19a/jagielski19a.pdf", "supp": "", "pdf_size": 577493, "gs_citation": 202, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2495143136089309174&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Northeastern University; University of Pennsylvania; University of Pennsylvania; Northeastern University; University of Pennsylvania; University of Pennsylvania; Northeastern University", "aff_domain": "wharton.upenn.edu; ; ; ; ; ; ", "email": "wharton.upenn.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/jagielski19a.html", "aff_unique_index": "0;1;1;0;1;1;0", "aff_unique_norm": "Northeastern University;University of Pennsylvania", "aff_unique_dep": ";", "aff_unique_url": "https://www.northeastern.edu;https://www.upenn.edu", "aff_unique_abbr": "NEU;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Differentially Private Learning of Geometric Concepts", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3863", "id": "3863", "author_site": "Haim Kaplan, Yishay Mansour, Yossi Matias, Uri Stemmer", "author": "Haim Kaplan; Yishay Mansour; Yossi Matias; Uri Stemmer", "abstract": "We present differentially private efficient algorithms for learning union of polygons in the plane (which are not necessarily convex). 
Our algorithms achieve $(\\alpha,\\beta)$-PAC learning and $(\\epsilon,\\delta)$-differential privacy using a sample of size $\\tilde{O}\\left(\\frac{1}{\\alpha\\epsilon}k\\log d\\right)$, where the domain is $[d]\\times[d]$ and $k$ is the number of edges in the union of polygons.", "bibtex": "@InProceedings{pmlr-v97-kaplan19a,\n title = \t {Differentially Private Learning of Geometric Concepts},\n author = {Kaplan, Haim and Mansour, Yishay and Matias, Yossi and Stemmer, Uri},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3233--3241},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kaplan19a/kaplan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kaplan19a.html},\n abstract = \t {We present differentially private efficient algorithms for learning union of polygons in the plane (which are not necessarily convex). Our algorithms achieve $(\\alpha,\\beta)$-PAC learning and $(\\epsilon,\\delta)$-differential privacy using a sample of size $\\tilde{O}\\left(\\frac{1}{\\alpha\\epsilon}k\\log d\\right)$, where the domain is $[d]\\times[d]$ and $k$ is the number of edges in the union of polygons.}\n}", "pdf": "http://proceedings.mlr.press/v97/kaplan19a/kaplan19a.pdf", "supp": "", "pdf_size": 394362, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=496841756293407139&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Tel Aviv University + Google research, Israel; Tel Aviv University + Google research, Israel; Google research, Israel; Ben-Gurion University + Supported by a gift from Google Ltd.", "aff_domain": "tau.ac.il;tau.ac.il;google.com;uri.co.il", "email": "tau.ac.il;tau.ac.il;google.com;uri.co.il", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/kaplan19a.html", "aff_unique_index": "0+1;0+1;1;2+1", "aff_unique_norm": "Tel Aviv University;Google;Ben-Gurion University of the Negev", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.tau.ac.il;https://research.google;https://www.bgu.ac.il", "aff_unique_abbr": "TAU;Google;BGU", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0;0+1", "aff_country_unique": "Israel;United Kingdom" }, { "title": "Dimension-Wise Importance Sampling Weight Clipping for Sample-Efficient Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3671", "id": "3671", "author_site": "Seungyul Han, Youngchul Sung", "author": "Seungyul Han; Youngchul Sung", "abstract": "In importance sampling (IS)-based reinforcement learning algorithms such as Proximal Policy Optimization (PPO), IS weights are typically clipped to avoid large variance in learning. However, policy update from clipped statistics induces large bias in tasks with high action dimensions, and bias from clipping makes it difficult to reuse old samples with large IS weights. In this paper, we consider PPO, a representative on-policy algorithm, and propose its improvement by dimension-wise IS weight clipping which separately clips the IS weight of each action dimension to avoid large bias and adaptively controls the IS weight to bound policy update from the current policy. 
This new technique enables efficient learning for high action-dimensional tasks and reusing of old samples like in off-policy learning to increase the sample efficiency. Numerical results show that the proposed new algorithm outperforms PPO and other RL algorithms in various Open AI Gym tasks.", "bibtex": "@InProceedings{pmlr-v97-han19b,\n title = \t {Dimension-Wise Importance Sampling Weight Clipping for Sample-Efficient Reinforcement Learning},\n author = {Han, Seungyul and Sung, Youngchul},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2586--2595},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/han19b/han19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/han19b.html},\n abstract = \t {In importance sampling (IS)-based reinforcement learning algorithms such as Proximal Policy Optimization (PPO), IS weights are typically clipped to avoid large variance in learning. However, policy update from clipped statistics induces large bias in tasks with high action dimensions, and bias from clipping makes it difficult to reuse old samples with large IS weights. In this paper, we consider PPO, a representative on-policy algorithm, and propose its improvement by dimension-wise IS weight clipping which separately clips the IS weight of each action dimension to avoid large bias and adaptively controls the IS weight to bound policy update from the current policy. This new technique enables efficient learning for high action-dimensional tasks and reusing of old samples like in off-policy learning to increase the sample efficiency. Numerical results show that the proposed new algorithm outperforms PPO and other RL algorithms in various Open AI Gym tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/han19b/han19b.pdf", "supp": "", "pdf_size": 2812657, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17087407211234698411&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "email": "kaist.ac.kr;kaist.ac.kr", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/han19b.html", "aff_unique_index": "0;0", "aff_unique_norm": "KAIST", "aff_unique_dep": "School of Electrical Engineering", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Daejeon", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Dimensionality Reduction for Tukey Regression", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3896", "id": "3896", "author_site": "Kenneth Clarkson, Ruosong Wang, David Woodruff", "author": "Kenneth Clarkson; Ruosong Wang; David Woodruff", "abstract": "We give the first dimensionality reduction methods for the overconstrained Tukey regression problem. The Tukey loss function $\\|y\\|_M = \\sum_i M(y_i)$ has $M(y_i) \\approx |y_i|^p$ for residual errors $y_i$ smaller than a prescribed threshold $\\tau$, but $M(y_i)$ becomes constant for errors $|y_i| > \\tau$. 
Our results depend on a new structural result, proven constructively, showing that for any $d$-dimensional subspace $L \\subset \\mathbb{R}^n$, there is a fixed bounded-size subset of coordinates containing, for every $y \\in L$, all the large coordinates, with respect to the Tukey loss function, of $y$. Our methods reduce a given Tukey regression problem to a smaller weighted version, whose solution is a provably good approximate solution to the original problem. Our reductions are fast, simple and easy to implement, and we give empirical results demonstrating their practicality, using existing heuristic solvers for the small versions. We also give exponential-time algorithms giving provably good solutions, and hardness results suggesting that a significant speedup in the worst case is unlikely.", "bibtex": "@InProceedings{pmlr-v97-clarkson19a,\n title = \t {Dimensionality Reduction for Tukey Regression},\n author = {Clarkson, Kenneth and Wang, Ruosong and Woodruff, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1262--1271},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/clarkson19a/clarkson19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/clarkson19a.html},\n abstract = \t {We give the first dimensionality reduction methods for the overconstrained Tukey regression problem. The Tukey loss function $\\|y\\|_M = \\sum_i M(y_i)$ has $M(y_i) \\approx |y_i|^p$ for residual errors $y_i$ smaller than a prescribed threshold $\\tau$, but $M(y_i)$ becomes constant for errors $|y_i| > \\tau$. Our results depend on a new structural result, proven constructively, showing that for any $d$-dimensional subspace $L \\subset \\mathbb{R}^n$, there is a fixed bounded-size subset of coordinates containing, for every $y \\in L$, all the large coordinates, with respect to the Tukey loss function, of $y$. Our methods reduce a given Tukey regression problem to a smaller weighted version, whose solution is a provably good approximate solution to the original problem. Our reductions are fast, simple and easy to implement, and we give empirical results demonstrating their practicality, using existing heuristic solvers for the small versions. 
We also give exponential-time algorithms giving provably good solutions, and hardness results suggesting that a significant speedup in the worst case is unlikely.}\n}", "pdf": "http://proceedings.mlr.press/v97/clarkson19a/clarkson19a.pdf", "supp": "", "pdf_size": 464395, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15887736307200040005&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "IBM Research - Almaden, San Jose, California, USA; Carnegie Mellon University, Pittsburgh, Pennsylvania, USA; Carnegie Mellon University, Pittsburgh, Pennsylvania, USA", "aff_domain": "andrew.cmu.edu; ; ", "email": "andrew.cmu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/clarkson19a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "IBM;Carnegie Mellon University", "aff_unique_dep": "IBM Research;", "aff_unique_url": "https://www.ibm.com/research;https://www.cmu.edu", "aff_unique_abbr": "IBM;CMU", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Almaden;Pittsburgh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Direct Uncertainty Prediction for Medical Second Opinions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3695", "id": "3695", "author_site": "Maithra Raghu, Katy Blumer, Rory sayres, Ziad Obermeyer, Bobby Kleinberg, Sendhil Mullainathan, Jon Kleinberg", "author": "Maithra Raghu; Katy Blumer; Rory Sayres; Ziad Obermeyer; Bobby Kleinberg; Sendhil Mullainathan; Jon Kleinberg", "abstract": "The issue of disagreements amongst human experts is a ubiquitous one in both machine learning and medicine. In medicine, this often corresponds to doctor disagreements on a patient diagnosis. In this work, we show that machine learning models can be successfully trained to give uncertainty scores to data instances that result in high expert disagreements. In particular, they can identify patient cases that would benefit most from a medical second opinion. Our central methodological finding is that Direct Uncertainty Prediction (DUP), training a model to predict an uncertainty score directly from the raw patient features, works better than Uncertainty Via Classification, the two step process of training a classifier and postprocessing the output distribution to give an uncertainty score. We show this both with a theoretical result, and on extensive evaluations on a large scale medical imaging application.", "bibtex": "@InProceedings{pmlr-v97-raghu19a,\n title = \t {Direct Uncertainty Prediction for Medical Second Opinions},\n author = {Raghu, Maithra and Blumer, Katy and Sayres, Rory and Obermeyer, Ziad and Kleinberg, Bobby and Mullainathan, Sendhil and Kleinberg, Jon},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5281--5290},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/raghu19a/raghu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/raghu19a.html},\n abstract = \t {The issue of disagreements amongst human experts is a ubiquitous one in both machine learning and medicine. In medicine, this often corresponds to doctor disagreements on a patient diagnosis. 
In this work, we show that machine learning models can be successfully trained to give uncertainty scores to data instances that result in high expert disagreements. In particular, they can identify patient cases that would benefit most from a medical second opinion. Our central methodological finding is that Direct Uncertainty Prediction (DUP), training a model to predict an uncertainty score directly from the raw patient features, works better than Uncertainty Via Classification, the two step process of training a classifier and postprocessing the output distribution to give an uncertainty score. We show this both with a theoretical result, and on extensive evaluations on a large scale medical imaging application.}\n}", "pdf": "http://proceedings.mlr.press/v97/raghu19a/raghu19a.pdf", "supp": "", "pdf_size": 1907872, "gs_citation": 171, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15693292062593156990&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, Cornell University + Google Brain; Google Brain; Google Brain; UC Berkeley School of Public Health; Department of Computer Science, Cornell University; Chicago Booth School of Business; Department of Computer Science, Cornell University", "aff_domain": "gmail.com; ; ; ; ; ; ", "email": "gmail.com; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/raghu19a.html", "aff_unique_index": "0+1;1;1;2;0;3;0", "aff_unique_norm": "Cornell University;Google;University of California, Berkeley;University of Chicago", "aff_unique_dep": "Department of Computer Science;Google Brain;School of Public Health;Booth School of Business", "aff_unique_url": "https://www.cornell.edu;https://brain.google.com;https://www.berkeley.edu;https://www.chicagobooth.edu/", "aff_unique_abbr": "Cornell;Google Brain;UC Berkeley;Chicago Booth", "aff_campus_unique_index": "1;1;1;2;3", "aff_campus_unique": ";Mountain View;Berkeley;Chicago", "aff_country_unique_index": "0+0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Dirichlet Simplex Nest and Geometric Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3943", "id": "3943", "author_site": "Mikhail Yurochkin, Aritra Guha, Yuekai Sun, XuanLong Nguyen", "author": "Mikhail Yurochkin; Aritra Guha; Yuekai Sun; Xuanlong Nguyen", "abstract": "We propose Dirichlet Simplex Nest, a class of probabilistic models suitable for a variety of data types, and develop fast and provably accurate inference algorithms by accounting for the model\u2019s convex geometry and low dimensional simplicial structure. By exploiting the connection to Voronoi tessellation and properties of Dirichlet distribution, the proposed inference algorithm is shown to achieve consistency and strong error bound guarantees on a range of model settings and data distributions. 
The effectiveness of our model and the learning algorithm is demonstrated by simulations and by analyses of text and financial data.", "bibtex": "@InProceedings{pmlr-v97-yurochkin19b,\n title = \t {{D}irichlet Simplex Nest and Geometric Inference},\n author = {Yurochkin, Mikhail and Guha, Aritra and Sun, Yuekai and Nguyen, Xuanlong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7262--7271},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yurochkin19b/yurochkin19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/yurochkin19b.html},\n abstract = \t {We propose Dirichlet Simplex Nest, a class of probabilistic models suitable for a variety of data types, and develop fast and provably accurate inference algorithms by accounting for the model\u2019s convex geometry and low dimensional simplicial structure. By exploiting the connection to Voronoi tessellation and properties of Dirichlet distribution, the proposed inference algorithm is shown to achieve consistency and strong error bound guarantees on a range of model settings and data distributions. The effectiveness of our model and the learning algorithm is demonstrated by simulations and by analyses of text and financial data.}\n}", "pdf": "http://proceedings.mlr.press/v97/yurochkin19b/yurochkin19b.pdf", "supp": "", "pdf_size": 8649547, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3107204927758089702&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "IBM Research, Cambridge+MIT-IBM Watson AI Lab; Department of Statistics, University of Michigan; Department of Statistics, University of Michigan; Department of Statistics, University of Michigan", "aff_domain": "ibm.com; ; ; ", "email": "ibm.com; ; ; ", "github": "https://github.com/moonfolk/VLADe", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/yurochkin19b.html", "aff_unique_index": "0+1;2;2;2", "aff_unique_norm": "IBM;Massachusetts Institute of Technology;University of Michigan", "aff_unique_dep": "IBM Research;IBM Watson AI Lab;Department of Statistics", "aff_unique_url": "https://www.ibm.com/research;https://www.mitibmwatsonailab.org;https://www.umich.edu", "aff_unique_abbr": "IBM;MIT-IBM AI Lab;UM", "aff_campus_unique_index": "0;2;2;2", "aff_campus_unique": "Cambridge;;Ann Arbor", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Discovering Conditionally Salient Features with Statistical Guarantees", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4238", "id": "4238", "author_site": "Jaime Roquero Gimenez, James Zou", "author": "Jaime Roquero Gimenez; James Zou", "abstract": "The goal of feature selection is to identify important features that are relevant to explain a outcome variable. 
Most of the work in this domain has focused on identifying", "bibtex": "@InProceedings{pmlr-v97-gimenez19a,\n title = \t {Discovering Conditionally Salient Features with Statistical Guarantees},\n author = {Gimenez, Jaime Roquero and Zou, James},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2290--2298},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gimenez19a/gimenez19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gimenez19a.html},\n abstract = \t {The goal of feature selection is to identify important features that are relevant to explain a outcome variable. Most of the work in this domain has focused on identifying", "pdf": "http://proceedings.mlr.press/v97/gimenez19a/gimenez19a.pdf", "supp": "", "pdf_size": 1228525, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3269970517515101061&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": ";", "aff_domain": ";", "email": ";", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/gimenez19a.html" }, { "title": "Discovering Context Effects from Raw Choice Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4284", "id": "4284", "author_site": "Arjun Seshadri, Alexander Peysakhovich, Johan Ugander", "author": "Arjun Seshadri; Alex Peysakhovich; Johan Ugander", "abstract": "Many applications in preference learning assume that decisions come from the maximization of a stable utility function. Yet a large experimental literature shows that individual choices and judgements can be affected by \u201cirrelevant\u201d aspects of the context in which they are made. An important class of such contexts is the composition of the choice set. In this work, our goal is to discover such choice set effects from raw choice data. We introduce an extension of the Multinomial Logit (MNL) model, called the context dependent random utility model (CDM), which allows for a particular class of choice set effects. We show that the CDM can be thought of as a second-order approximation to a general choice system, can be inferred optimally using maximum likelihood and, importantly, is easily interpretable. We apply the CDM to both real and simulated choice data to perform principled exploratory analyses for the presence of choice set effects.", "bibtex": "@InProceedings{pmlr-v97-seshadri19a,\n title = \t {Discovering Context Effects from Raw Choice Data},\n author = {Seshadri, Arjun and Peysakhovich, Alex and Ugander, Johan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5660--5669},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/seshadri19a/seshadri19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/seshadri19a.html},\n abstract = \t {Many applications in preference learning assume that decisions come from the maximization of a stable utility function. Yet a large experimental literature shows that individual choices and judgements can be affected by \u201cirrelevant\u201d aspects of the context in which they are made. 
An important class of such contexts is the composition of the choice set. In this work, our goal is to discover such choice set effects from raw choice data. We introduce an extension of the Multinomial Logit (MNL) model, called the context dependent random utility model (CDM), which allows for a particular class of choice set effects. We show that the CDM can be thought of as a second-order approximation to a general choice system, can be inferred optimally using maximum likelihood and, importantly, is easily interpretable. We apply the CDM to both real and simulated choice data to perform principled exploratory analyses for the presence of choice set effects.}\n}", "pdf": "http://proceedings.mlr.press/v97/seshadri19a/seshadri19a.pdf", "supp": "", "pdf_size": 857793, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15040500565104885549&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Stanford University; Facebook Artificial Intelligence Research; Stanford University", "aff_domain": "stanford.edu;fb.com;stanford.edu", "email": "stanford.edu;fb.com;stanford.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/seshadri19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Stanford University;Meta", "aff_unique_dep": ";Artificial Intelligence Research", "aff_unique_url": "https://www.stanford.edu;https://research.facebook.com", "aff_unique_abbr": "Stanford;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Discovering Latent Covariance Structures for Multiple Time Series", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4069", "id": "4069", "author_site": "Anh Tong, Jaesik Choi", "author": "Anh Tong; Jaesik Choi", "abstract": "Analyzing multivariate time series data is important to predict future events and changes of complex systems in finance, manufacturing, and administrative decisions. The expressiveness power of Gaussian Process (GP) regression methods has been significantly improved by compositional covariance structures. In this paper, we present a new GP model which naturally handles multiple time series by placing an Indian Buffet Process (IBP) prior on the presence of shared kernels. Our selective covariance structure decomposition allows exploiting shared parameters over a set of multiple, selected time series. We also investigate the well-definedness of the models when infinite latent components are introduced. We present a pragmatic search algorithm which explores a larger structure space efficiently. 
Experiments conducted on five real-world data sets demonstrate that our new model outperforms existing methods in term of structure discoveries and predictive performances.", "bibtex": "@InProceedings{pmlr-v97-tong19a,\n title = \t {Discovering Latent Covariance Structures for Multiple Time Series},\n author = {Tong, Anh and Choi, Jaesik},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6285--6294},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tong19a/tong19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tong19a.html},\n abstract = \t {Analyzing multivariate time series data is important to predict future events and changes of complex systems in finance, manufacturing, and administrative decisions. The expressiveness power of Gaussian Process (GP) regression methods has been significantly improved by compositional covariance structures. In this paper, we present a new GP model which naturally handles multiple time series by placing an Indian Buffet Process (IBP) prior on the presence of shared kernels. Our selective covariance structure decomposition allows exploiting shared parameters over a set of multiple, selected time series. We also investigate the well-definedness of the models when infinite latent components are introduced. We present a pragmatic search algorithm which explores a larger structure space efficiently. Experiments conducted on five real-world data sets demonstrate that our new model outperforms existing methods in term of structure discoveries and predictive performances.}\n}", "pdf": "http://proceedings.mlr.press/v97/tong19a/tong19a.pdf", "supp": "", "pdf_size": 2991816, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15651549313098404287&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science and Engineering, Ulsan National Institute of Science and Technology, Ulsan, 44919, South Korea; Department of Computer Science and Engineering, Ulsan National Institute of Science and Technology, Ulsan, 44919, South Korea", "aff_domain": "unist.ac.kr;unist.ac.kr", "email": "unist.ac.kr;unist.ac.kr", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/tong19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Ulsan National Institute of Science and Technology", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.unist.ac.kr", "aff_unique_abbr": "UNIST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ulsan", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Discovering Options for Exploration by Minimizing Cover Time", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4279", "id": "4279", "author_site": "Yuu Jinnai, Jee Won Park, David Abel, George Konidaris", "author": "Yuu Jinnai; Jee Won Park; David Abel; George Konidaris", "abstract": "One of the main challenges in reinforcement learning is solving tasks with sparse reward. We show that the difficulty of discovering a distant rewarding state in an MDP is bounded by the expected cover time of a random walk over the graph induced by the MDP\u2019s transition dynamics. 
We therefore propose to accelerate exploration by constructing options that minimize cover time. We introduce a new option discovery algorithm that diminishes the expected cover time by connecting the most distant states in the state-space graph with options. We show empirically that the proposed algorithm improves learning in several domains with sparse rewards.", "bibtex": "@InProceedings{pmlr-v97-jinnai19b,\n title = \t {Discovering Options for Exploration by Minimizing Cover Time},\n author = {Jinnai, Yuu and Park, Jee Won and Abel, David and Konidaris, George},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3130--3139},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jinnai19b/jinnai19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/jinnai19b.html},\n abstract = \t {One of the main challenges in reinforcement learning is solving tasks with sparse reward. We show that the difficulty of discovering a distant rewarding state in an MDP is bounded by the expected cover time of a random walk over the graph induced by the MDP\u2019s transition dynamics. We therefore propose to accelerate exploration by constructing options that minimize cover time. We introduce a new option discovery algorithm that diminishes the expected cover time by connecting the most distant states in the state-space graph with options. We show empirically that the proposed algorithm improves learning in several domains with sparse rewards.}\n}", "pdf": "http://proceedings.mlr.press/v97/jinnai19b/jinnai19b.pdf", "supp": "", "pdf_size": 4163416, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9780401953851504286&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 15, "aff": "Brown University; Brown University; Brown University; Brown University", "aff_domain": "brown.edu; ; ; ", "email": "brown.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/jinnai19b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Brown University", "aff_unique_dep": "", "aff_unique_url": "https://www.brown.edu", "aff_unique_abbr": "Brown", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Discriminative Regularization for Latent Variable Models with Applications to Electrocardiography", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3736", "id": "3736", "author_site": "Andrew Miller, Ziad Obermeyer, John Cunningham, Sendhil Mullainathan", "author": "Andrew Miller; Ziad Obermeyer; John Cunningham; Sendhil Mullainathan", "abstract": "Generative models often use latent variables to represent structured variation in high-dimensional data, such as images and medical waveforms. However, these latent variables may ignore subtle, yet meaningful features in the data. Some features may predict an outcome of interest (e.g. heart attack) but account for only a small fraction of variation in the data. We propose a generative model training objective that uses a black-box discriminative model as a regularizer to learn representations that preserve this predictive variation. 
With these discriminatively regularized latent variable models, we visualize and measure variation in the data that influence a black-box predictive model, enabling an expert to better understand each prediction. With this technique, we study models that use electrocardiograms to predict outcomes of clinical interest. We measure our approach on synthetic and real data with statistical summaries and an experiment carried out by a physician.", "bibtex": "@InProceedings{pmlr-v97-miller19a,\n title = \t {Discriminative Regularization for Latent Variable Models with Applications to Electrocardiography},\n author = {Miller, Andrew and Obermeyer, Ziad and Cunningham, John and Mullainathan, Sendhil},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4585--4594},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/miller19a/miller19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/miller19a.html},\n abstract = \t {Generative models often use latent variables to represent structured variation in high-dimensional data, such as images and medical waveforms. However, these latent variables may ignore subtle, yet meaningful features in the data. Some features may predict an outcome of interest (e.g. heart attack) but account for only a small fraction of variation in the data. We propose a generative model training objective that uses a black-box discriminative model as a regularizer to learn representations that preserve this predictive variation. With these discriminatively regularized latent variable models, we visualize and measure variation in the data that influence a black-box predictive model, enabling an expert to better understand each prediction. With this technique, we study models that use electrocardiograms to predict outcomes of clinical interest. 
We measure our approach on synthetic and real data with statistical summaries and an experiment carried out by a physician.}\n}", "pdf": "http://proceedings.mlr.press/v97/miller19a/miller19a.pdf", "supp": "", "pdf_size": 1076544, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=615805873225205738&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Data Science Institute, Columbia University, New York, NY, USA; School of Public Health, UC Berkeley, Berkeley, CA, USA; Department of Statistics, Columbia University, New York, NY, USA; Booth School of Business, University of Chicago, Chicago, IL, USA", "aff_domain": "columbia.edu; ; ; ", "email": "columbia.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/miller19a.html", "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Columbia University;University of California, Berkeley;University of Chicago", "aff_unique_dep": "Data Science Institute;School of Public Health;Booth School of Business", "aff_unique_url": "https://www.columbia.edu;https://www.berkeley.edu;https://www.chicagobooth.edu", "aff_unique_abbr": "Columbia;UC Berkeley;UChicago", "aff_campus_unique_index": "0;1;0;2", "aff_campus_unique": "New York;Berkeley;Chicago", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Disentangled Graph Convolutional Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3948", "id": "3948", "author_site": "Jianxin Ma, Peng Cui, Kun Kuang, Xin Wang, Wenwu Zhu", "author": "Jianxin Ma; Peng Cui; Kun Kuang; Xin Wang; Wenwu Zhu", "abstract": "The formation of a real-world graph typically arises from the highly complex interaction of many latent factors. The existing deep learning methods for graph-structured data neglect the entanglement of the latent factors, rendering the learned representations non-robust and hardly explainable. However, learning representations that disentangle the latent factors poses great challenges and remains largely unexplored in the literature of graph neural networks. In this paper, we introduce the disentangled graph convolutional network (DisenGCN) to learn disentangled node representations. In particular, we propose a novel neighborhood routing mechanism, which is capable of dynamically identifying the latent factor that may have caused the edge between a node and one of its neighbors, and accordingly assigning the neighbor to a channel that extracts and convolutes features specific to that factor. We theoretically prove the convergence properties of the routing mechanism. 
Empirical results show that our proposed model can achieve significant performance gains, especially when the data demonstrate the existence of many entangled factors.", "bibtex": "@InProceedings{pmlr-v97-ma19a,\n title = \t {Disentangled Graph Convolutional Networks},\n author = {Ma, Jianxin and Cui, Peng and Kuang, Kun and Wang, Xin and Zhu, Wenwu},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4212--4221},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ma19a/ma19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ma19a.html},\n abstract = \t {The formation of a real-world graph typically arises from the highly complex interaction of many latent factors. The existing deep learning methods for graph-structured data neglect the entanglement of the latent factors, rendering the learned representations non-robust and hardly explainable. However, learning representations that disentangle the latent factors poses great challenges and remains largely unexplored in the literature of graph neural networks. In this paper, we introduce the disentangled graph convolutional network (DisenGCN) to learn disentangled node representations. In particular, we propose a novel neighborhood routing mechanism, which is capable of dynamically identifying the latent factor that may have caused the edge between a node and one of its neighbors, and accordingly assigning the neighbor to a channel that extracts and convolutes features specific to that factor. We theoretically prove the convergence properties of the routing mechanism. Empirical results show that our proposed model can achieve significant performance gains, especially when the data demonstrate the existence of many entangled factors.}\n}", "pdf": "http://proceedings.mlr.press/v97/ma19a/ma19a.pdf", "supp": "", "pdf_size": 569290, "gs_citation": 426, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3646112779231102462&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/ma19a.html" }, { "title": "Disentangling Disentanglement in Variational Autoencoders", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3855", "id": "3855", "author_site": "Emile Mathieu, Tom Rainforth, N Siddharth, Yee-Whye Teh", "author": "Emile Mathieu; Tom Rainforth; N Siddharth; Yee Whye Teh", "abstract": "We develop a generalisation of disentanglement in variational autoencoders (VAEs)\u2014decomposition of the latent representation\u2014characterising it as the fulfilment of two factors: a) the latent encodings of the data having an appropriate level of overlap, and b) the aggregate encoding of the data conforming to a desired structure, represented through the prior. Decomposition permits disentanglement, i.e. explicit independence between latents, as a special case, but also allows for a much richer class of properties to be imposed on the learnt representation, such as sparsity, clustering, independent subspaces, or even intricate hierarchical dependency relationships. 
We show that the $\\beta$-VAE varies from the standard VAE predominantly in its control of latent overlap and that for the standard choice of an isotropic Gaussian prior, its objective is invariant to rotations of the latent representation. Viewed from the decomposition perspective, breaking this invariance with simple manipulations of the prior can yield better disentanglement with little or no detriment to reconstructions. We further demonstrate how other choices of prior can assist in producing different decompositions and introduce an alternative training objective that allows the control of both decomposition factors in a principled manner.", "bibtex": "@InProceedings{pmlr-v97-mathieu19a,\n title = \t {Disentangling Disentanglement in Variational Autoencoders},\n author = {Mathieu, Emile and Rainforth, Tom and Siddharth, N and Teh, Yee Whye},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4402--4412},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mathieu19a/mathieu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mathieu19a.html},\n abstract = \t {We develop a generalisation of disentanglement in variational autoencoders (VAEs)\u2014decomposition of the latent representation\u2014characterising it as the fulfilment of two factors: a) the latent encodings of the data having an appropriate level of overlap, and b) the aggregate encoding of the data conforming to a desired structure, represented through the prior. Decomposition permits disentanglement, i.e. explicit independence between latents, as a special case, but also allows for a much richer class of properties to be imposed on the learnt representation, such as sparsity, clustering, independent subspaces, or even intricate hierarchical dependency relationships. We show that the $\\beta$-VAE varies from the standard VAE predominantly in its control of latent overlap and that for the standard choice of an isotropic Gaussian prior, its objective is invariant to rotations of the latent representation. Viewed from the decomposition perspective, breaking this invariance with simple manipulations of the prior can yield better disentanglement with little or no detriment to reconstructions. 
We further demonstrate how other choices of prior can assist in producing different decompositions and introduce an alternative training objective that allows the control of both decomposition factors in a principled manner.}\n}", "pdf": "http://proceedings.mlr.press/v97/mathieu19a/mathieu19a.pdf", "supp": "", "pdf_size": 4115763, "gs_citation": 361, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4865252587822770331&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Statistics; Department of Statistics; Department of Engineering, University of Oxford; Department of Statistics", "aff_domain": "stats.ox.ac.uk;stats.ox.ac.uk;robots.ox.ac.uk; ", "email": "stats.ox.ac.uk;stats.ox.ac.uk;robots.ox.ac.uk; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/mathieu19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University Affiliation Not Specified;", "aff_unique_dep": "Department of Statistics;", "aff_unique_url": ";", "aff_unique_abbr": ";", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "", "aff_country_unique": "" }, { "title": "Distributed Learning over Unreliable Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4184", "id": "4184", "author_site": "Chen Yu, Hanlin Tang, Cedric Renggli, Simon Kassing, Ankit Singla, Dan Alistarh, Ce Zhang, Ji Liu", "author": "Chen Yu; Hanlin Tang; Cedric Renggli; Simon Kassing; Ankit Singla; Dan Alistarh; Ce Zhang; Ji Liu", "abstract": "Most of today\u2019s distributed machine learning systems assume", "bibtex": "@InProceedings{pmlr-v97-yu19f,\n title = \t {Distributed Learning over Unreliable Networks},\n author = {Yu, Chen and Tang, Hanlin and Renggli, Cedric and Kassing, Simon and Singla, Ankit and Alistarh, Dan and Zhang, Ce and Liu, Ji},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7202--7212},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yu19f/yu19f.pdf},\n url = \t {https://proceedings.mlr.press/v97/yu19f.html},\n abstract = \t {Most of today\u2019s distributed machine learning systems assume", "pdf": "http://proceedings.mlr.press/v97/yu19f/yu19f.pdf", "supp": "", "pdf_size": 856094, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6368707817468385310&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "Department of Computer Science, University of Rochester, USA; Department of Computer Science, ETH Zurich; Department of Computer Science, ETH Zurich; Department of Computer Science, ETH Zurich; Department of Computer Science, ETH Zurich; Institute of Science and Technology Austria; Department of Computer Science, ETH Zurich; Seattle AI Lab, FeDA Lab, Kwai Inc.", "aff_domain": "ur.rochester.edu; ; ; ; ; ; ; ", "email": "ur.rochester.edu; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/yu19f.html", "aff_unique_index": "0;1;1;1;1;2;1;3", "aff_unique_norm": "University of Rochester;ETH Zurich;Institute of Science and Technology Austria;Seattle AI Lab", "aff_unique_dep": "Department of Computer Science;Department of Computer Science;;AI Lab", "aff_unique_url": "https://www.rochester.edu;https://www.ethz.ch;https://www.ist.ac.at;", 
"aff_unique_abbr": "U of Rochester;ETHZ;IST Austria;SeaAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;2;1;0", "aff_country_unique": "United States;Switzerland;Austria" }, { "title": "Distributed Learning with Sublinear Communication", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3787", "id": "3787", "author_site": "Jayadev Acharya, Christopher De Sa, Dylan Foster, Karthik Sridharan", "author": "Jayadev Acharya; Chris De Sa; Dylan Foster; Karthik Sridharan", "abstract": "In distributed statistical learning, $N$ samples are split across $m$ machines and a learner wishes to use minimal communication to learn as well as if the examples were on a single machine. This model has received substantial interest in machine learning due to its scalability and potential for parallel speedup. However, in high-dimensional settings, where the number examples is smaller than the number of features (\u2018\"dimension\"), the speedup afforded by distributed learning may be overshadowed by the cost of communicating a single example. This paper investigates the following question: When is it possible to learn a $d$-dimensional model in the distributed setting with total communication sublinear in $d$? Starting with a negative result, we observe that for learning $\\ell_1$-bounded or sparse linear models, no algorithm can obtain optimal error until communication is linear in dimension. Our main result is that by slightly relaxing the standard boundedness assumptions for linear models, we can obtain distributed algorithms that enjoy optimal error with communication", "bibtex": "@InProceedings{pmlr-v97-acharya19b,\n title = \t {Distributed Learning with Sublinear Communication},\n author = {Acharya, Jayadev and De Sa, Chris and Foster, Dylan and Sridharan, Karthik},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {40--50},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/acharya19b/acharya19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/acharya19b.html},\n abstract = \t {In distributed statistical learning, $N$ samples are split across $m$ machines and a learner wishes to use minimal communication to learn as well as if the examples were on a single machine. This model has received substantial interest in machine learning due to its scalability and potential for parallel speedup. However, in high-dimensional settings, where the number examples is smaller than the number of features (\u2018\"dimension\"), the speedup afforded by distributed learning may be overshadowed by the cost of communicating a single example. This paper investigates the following question: When is it possible to learn a $d$-dimensional model in the distributed setting with total communication sublinear in $d$? Starting with a negative result, we observe that for learning $\\ell_1$-bounded or sparse linear models, no algorithm can obtain optimal error until communication is linear in dimension. 
Our main result is that by slightly relaxing the standard boundedness assumptions for linear models, we can obtain distributed algorithms that enjoy optimal error with communication", "pdf": "http://proceedings.mlr.press/v97/acharya19b/acharya19b.pdf", "supp": "", "pdf_size": 1125887, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14493139483927597029&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "Cornell University; Cornell University; Massachusetts Institute of Technology; Cornell University", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/acharya19b.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Cornell University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://web.mit.edu", "aff_unique_abbr": "Cornell;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Distributed Weighted Matching via Randomized Composable Coresets", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4194", "id": "4194", "author_site": "Sepehr Assadi, Mohammad Hossein Bateni, Vahab Mirrokni", "author": "Sepehr Assadi; Mohammadhossein Bateni; Vahab Mirrokni", "abstract": "Maximum weight matching is one of the most fundamental combinatorial optimization problems with a wide range of applications in data mining and bioinformatics. Developing distributed weighted matching algorithms has been challenging due to the sequential nature of efficient algorithms for this problem. In this paper, we develop a simple distributed algorithm for the problem on general graphs with approximation guarantee of 2 + eps that (nearly) matches that of the sequential greedy algorithm. A key advantage of this algorithm is that it can be easily implemented in only two rounds of computation in modern parallel computation frameworks such as MapReduce. We also demonstrate the efficiency of our algorithm in practice on various graphs (some with half a trillion edges) by achieving objective values always close to what is achievable in the centralized setting.", "bibtex": "@InProceedings{pmlr-v97-assadi19a,\n title = \t {Distributed Weighted Matching via Randomized Composable Coresets},\n author = {Assadi, Sepehr and Bateni, Mohammadhossein and Mirrokni, Vahab},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {333--343},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/assadi19a/assadi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/assadi19a.html},\n abstract = \t {Maximum weight matching is one of the most fundamental combinatorial optimization problems with a wide range of applications in data mining and bioinformatics. Developing distributed weighted matching algorithms has been challenging due to the sequential nature of efficient algorithms for this problem. In this paper, we develop a simple distributed algorithm for the problem on general graphs with approximation guarantee of 2 + eps that (nearly) matches that of the sequential greedy algorithm. 
A key advantage of this algorithm is that it can be easily implemented in only two rounds of computation in modern parallel computation frameworks such as MapReduce. We also demonstrate the efficiency of our algorithm in practice on various graphs (some with half a trillion edges) by achieving objective values always close to what is achievable in the centralized setting.}\n}", "pdf": "http://proceedings.mlr.press/v97/assadi19a/assadi19a.pdf", "supp": "", "pdf_size": 320001, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11399120185515564658&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, Princeton University, Princeton, NJ, US + Google Research, New York, NY, US; Google Research, New York, NY, US; Google Research, New York, NY, US", "aff_domain": "princeton.edu; ; ", "email": "princeton.edu; ; ", "github": "", "project": "https://arxiv.org/", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/assadi19a.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "Princeton University;Google", "aff_unique_dep": "Department of Computer Science;Google Research", "aff_unique_url": "https://www.princeton.edu;https://research.google", "aff_unique_abbr": "Princeton;Google Research", "aff_campus_unique_index": "0+1;1;1", "aff_campus_unique": "Princeton;New York", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "Distributed, Egocentric Representations of Graphs for Detecting Critical Structures", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3807", "id": "3807", "author_site": "Ruo-Chun Tzeng, Shan-Hung (Brandon) Wu", "author": "Ruo-Chun Tzeng; Shan-Hung Wu", "abstract": "We study the problem of detecting critical structures using a graph embedding model. Existing graph embedding models lack the ability to precisely detect critical structures that are specific to a task at the global scale. In this paper, we propose a novel graph embedding model, called the Ego-CNNs, that employs the ego-convolutions convolutions at each layer and stacks up layers using an ego-centric way to detects precise critical structures efficiently. An Ego-CNN can be jointly trained with a task model and help explain/discover knowledge for the task. We conduct extensive experiments and the results show that Ego-CNNs (1) can lead to comparable task performance as the state-of-the-art graph embedding models, (2) works nicely with CNN visualization techniques to illustrate the detected structures, and (3) is efficient and can incorporate with scale-free priors, which commonly occurs in social network datasets, to further improve the training efficiency.", "bibtex": "@InProceedings{pmlr-v97-tzeng19a,\n title = \t {Distributed, Egocentric Representations of Graphs for Detecting Critical Structures},\n author = {Tzeng, Ruo-Chun and Wu, Shan-Hung},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6354--6362},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tzeng19a/tzeng19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tzeng19a.html},\n abstract = \t {We study the problem of detecting critical structures using a graph embedding model. 
Existing graph embedding models lack the ability to precisely detect critical structures that are specific to a task at the global scale. In this paper, we propose a novel graph embedding model, called the Ego-CNNs, that employs the ego-convolutions convolutions at each layer and stacks up layers using an ego-centric way to detects precise critical structures efficiently. An Ego-CNN can be jointly trained with a task model and help explain/discover knowledge for the task. We conduct extensive experiments and the results show that Ego-CNNs (1) can lead to comparable task performance as the state-of-the-art graph embedding models, (2) works nicely with CNN visualization techniques to illustrate the detected structures, and (3) is efficient and can incorporate with scale-free priors, which commonly occurs in social network datasets, to further improve the training efficiency.}\n}", "pdf": "http://proceedings.mlr.press/v97/tzeng19a/tzeng19a.pdf", "supp": "", "pdf_size": 2257904, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12279780969131164067&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Microsoft Inc.; CS Department, National Tsing Hua University, Taiwan", "aff_domain": "microsoft.com;cs.nthu.edu.tw", "email": "microsoft.com;cs.nthu.edu.tw", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/tzeng19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Microsoft;National Tsing Hua University", "aff_unique_dep": "Microsoft Corporation;CS Department", "aff_unique_url": "https://www.microsoft.com;https://www.nthu.edu.tw", "aff_unique_abbr": "Microsoft;NTHU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Taiwan", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "title": "Distribution calibration for regression", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4047", "id": "4047", "author_site": "Hao Song, Tom Diethe, Meelis Kull, Peter Flach", "author": "Hao Song; Tom Diethe; Meelis Kull; Peter Flach", "abstract": "We are concerned with obtaining well-calibrated output distributions from regression models. Such distributions allow us to quantify the uncertainty that the model has regarding the predicted target value. We introduce the novel concept of distribution calibration, and demonstrate its advantages over the existing definition of quantile calibration. We further propose a post-hoc approach to improving the predictions from previously trained regression models, using multi-output Gaussian Processes with a novel Beta link function. The proposed method is experimentally verified on a set of common regression models and shows improvements for both distribution-level and quantile-level calibration.", "bibtex": "@InProceedings{pmlr-v97-song19a,\n title = \t {Distribution calibration for regression},\n author = {Song, Hao and Diethe, Tom and Kull, Meelis and Flach, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5897--5906},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/song19a/song19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/song19a.html},\n abstract = \t {We are concerned with obtaining well-calibrated output distributions from regression models. 
Such distributions allow us to quantify the uncertainty that the model has regarding the predicted target value. We introduce the novel concept of distribution calibration, and demonstrate its advantages over the existing definition of quantile calibration. We further propose a post-hoc approach to improving the predictions from previously trained regression models, using multi-output Gaussian Processes with a novel Beta link function. The proposed method is experimentally verified on a set of common regression models and shows improvements for both distribution-level and quantile-level calibration.}\n}", "pdf": "http://proceedings.mlr.press/v97/song19a/song19a.pdf", "supp": "", "pdf_size": 2009686, "gs_citation": 140, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=369577948168997714&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "University of Bristol, Bristol, United Kingdom+The Alan Turing Institute, London, United Kingdom; Amazon Research, Cambridge, United Kingdom; University of Tartu, Tartu, Estonia; University of Bristol, Bristol, United Kingdom+The Alan Turing Institute, London, United Kingdom", "aff_domain": "bristol.ac.uk; ; ; ", "email": "bristol.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/song19a.html", "aff_unique_index": "0+1;2;3;0+1", "aff_unique_norm": "University of Bristol;Alan Turing Institute;Amazon;University of Tartu", "aff_unique_dep": ";;Amazon Research;", "aff_unique_url": "https://www.bristol.ac.uk;https://www.turing.ac.uk;https://www.amazon.science;https://www.ut.ee", "aff_unique_abbr": "UoB;ATI;Amazon Research;UT", "aff_campus_unique_index": "0+1;2;3;0+1", "aff_campus_unique": "Bristol;London;Cambridge;Tartu", "aff_country_unique_index": "0+0;0;1;0+0", "aff_country_unique": "United Kingdom;Estonia" }, { "title": "Distributional Multivariate Policy Evaluation and Exploration with the Bellman GAN", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3848", "id": "3848", "author_site": "dror freirich, Tzahi Shimkin, Ron Meir, Aviv Tamar", "author": "Dror Freirich; Tzahi Shimkin; Ron Meir; Aviv Tamar", "abstract": "The recently proposed distributional approach to reinforcement learning (DiRL) is centered on learning the distribution of the reward-to-go, often referred to as the value distribution. In this work, we show that the distributional Bellman equation, which drives DiRL methods, is equivalent to a generative adversarial network (GAN) model. In this formulation, DiRL can be seen as learning a deep generative model of the value distribution, driven by the discrepancy between the distribution of the current value, and the distribution of the sum of current reward and next value. We use this insight to propose a GAN-based approach to DiRL, which leverages the strengths of GANs in learning distributions of high dimensional data. In particular, we show that our GAN approach can be used for DiRL with multivariate rewards, an important setting which cannot be tackled with prior methods. 
The multivariate setting also allows us to unify learning the distribution of values and state transitions, and we exploit this idea to devise a novel exploration method that is driven by the discrepancy in estimating both values and states.", "bibtex": "@InProceedings{pmlr-v97-freirich19a,\n title = \t {Distributional Multivariate Policy Evaluation and Exploration with the {B}ellman {GAN}},\n author = {Freirich, Dror and Shimkin, Tzahi and Meir, Ron and Tamar, Aviv},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1983--1992},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/freirich19a/freirich19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/freirich19a.html},\n abstract = \t {The recently proposed distributional approach to reinforcement learning (DiRL) is centered on learning the distribution of the reward-to-go, often referred to as the value distribution. In this work, we show that the distributional Bellman equation, which drives DiRL methods, is equivalent to a generative adversarial network (GAN) model. In this formulation, DiRL can be seen as learning a deep generative model of the value distribution, driven by the discrepancy between the distribution of the current value, and the distribution of the sum of current reward and next value. We use this insight to propose a GAN-based approach to DiRL, which leverages the strengths of GANs in learning distributions of high dimensional data. In particular, we show that our GAN approach can be used for DiRL with multivariate rewards, an important setting which cannot be tackled with prior methods. 
The multivariate setting also allows us to unify learning the distribution of values and state transitions, and we exploit this idea to devise a novel exploration method that is driven by the discrepancy in estimating both values and states.}\n}", "pdf": "http://proceedings.mlr.press/v97/freirich19a/freirich19a.pdf", "supp": "", "pdf_size": 807683, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2058761807543421988&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "The Viterbi Faculty of Electrical Engineering, Technion - Israel Institute of Technology; The Viterbi Faculty of Electrical Engineering, Technion - Israel Institute of Technology; The Viterbi Faculty of Electrical Engineering, Technion - Israel Institute of Technology; Berkeley AI Research Lab, UC Berkeley", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/freirich19a.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Technion - Israel Institute of Technology;University of California, Berkeley", "aff_unique_dep": "Viterbi Faculty of Electrical Engineering;Berkeley AI Research Lab", "aff_unique_url": "https://www.technion.ac.il;https://www.berkeley.edu", "aff_unique_abbr": "Technion;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Israel;United States" }, { "title": "Distributional Reinforcement Learning for Efficient Exploration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4297", "id": "4297", "author_site": "Borislav Mavrin, Hengshuai Yao, Linglong Kong, Kaiwen Wu, Yaoliang Yu", "author": "Borislav Mavrin; Hengshuai Yao; Linglong Kong; Kaiwen Wu; Yaoliang Yu", "abstract": "In distributional reinforcement learning (RL), the estimated distribution of value functions models both the parametric and intrinsic uncertainties. We propose a novel and efficient exploration method for deep RL that has two components. The first is a decaying schedule to suppress the intrinsic uncertainty. The second is an exploration bonus calculated from the upper quantiles of the learned distribution. In Atari 2600 games, our method achieves 483% average gain across 49 games in cumulative rewards over QR-DQN. We also compared our algorithm with QR-DQN in a challenging 3D driving simulator (CARLA). Results show that our algorithm achieves near-optimal safety rewards twice as fast as QR-DQN.", "bibtex": "@InProceedings{pmlr-v97-mavrin19a,\n title = \t {Distributional Reinforcement Learning for Efficient Exploration},\n author = {Mavrin, Borislav and Yao, Hengshuai and Kong, Linglong and Wu, Kaiwen and Yu, Yaoliang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4424--4434},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mavrin19a/mavrin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mavrin19a.html},\n abstract = \t {In distributional reinforcement learning (RL), the estimated distribution of value functions models both the parametric and intrinsic uncertainties. We propose a novel and efficient exploration method for deep RL that has two components. 
The first is a decaying schedule to suppress the intrinsic uncertainty. The second is an exploration bonus calculated from the upper quantiles of the learned distribution. In Atari 2600 games, our method achieves 483% average gain across 49 games in cumulative rewards over QR-DQN. We also compared our algorithm with QR-DQN in a challenging 3D driving simulator (CARLA). Results show that our algorithm achieves near-optimal safety rewards twice as fast as QR-DQN.}\n}", "pdf": "http://proceedings.mlr.press/v97/mavrin19a/mavrin19a.pdf", "supp": "", "pdf_size": 3822139, "gs_citation": 112, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1525223277413702022&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Alberta + Huawei Noah\u2019s Ark; Huawei Hi-Silicon; University of Alberta + Huawei Noah\u2019s Ark; University of Waterloo; University of Waterloo", "aff_domain": "ualberta.ca;huawei.com;ualberta.ca;uwaterloo.ca;uwaterloo.ca", "email": "ualberta.ca;huawei.com;ualberta.ca;uwaterloo.ca;uwaterloo.ca", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/mavrin19a.html", "aff_unique_index": "0+1;1;0+1;2;2", "aff_unique_norm": "University of Alberta;Huawei;University of Waterloo", "aff_unique_dep": ";Noah\u2019s Ark;", "aff_unique_url": "https://www.ualberta.ca;https://www.huawei.com;https://uwaterloo.ca", "aff_unique_abbr": "UAlberta;Huawei;UW", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0+1;0;0", "aff_country_unique": "Canada;China" }, { "title": "Do ImageNet Classifiers Generalize to ImageNet?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4270", "id": "4270", "author_site": "Benjamin Recht, Rebecca Roelofs, Ludwig Schmidt, Vaishaal Shankar", "author": "Benjamin Recht; Rebecca Roelofs; Ludwig Schmidt; Vaishaal Shankar", "abstract": "We build new test sets for the CIFAR-10 and ImageNet datasets. Both benchmarks have been the focus of intense research for almost a decade, raising the danger of overfitting to excessively re-used test sets. By closely following the original dataset creation processes, we test to what extent current classification models generalize to new data. We evaluate a broad range of models and find accuracy drops of 3% - 15% on CIFAR-10 and 11% - 14% on ImageNet. However, accuracy gains on the original test sets translate to larger gains on the new test sets. Our results suggest that the accuracy drops are not caused by adaptivity, but by the models\u2019 inability to generalize to slightly \"harder\" images than those found in the original test sets.", "bibtex": "@InProceedings{pmlr-v97-recht19a,\n title = \t {Do {I}mage{N}et Classifiers Generalize to {I}mage{N}et?},\n author = {Recht, Benjamin and Roelofs, Rebecca and Schmidt, Ludwig and Shankar, Vaishaal},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5389--5400},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/recht19a/recht19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/recht19a.html},\n abstract = \t {We build new test sets for the CIFAR-10 and ImageNet datasets. 
Both benchmarks have been the focus of intense research for almost a decade, raising the danger of overfitting to excessively re-used test sets. By closely following the original dataset creation processes, we test to what extent current classification models generalize to new data. We evaluate a broad range of models and find accuracy drops of 3% - 15% on CIFAR-10 and 11% - 14% on ImageNet. However, accuracy gains on the original test sets translate to larger gains on the new test sets. Our results suggest that the accuracy drops are not caused by adaptivity, but by the models\u2019 inability to generalize to slightly \"harder\" images than those found in the original test sets.}\n}", "pdf": "http://proceedings.mlr.press/v97/recht19a/recht19a.pdf", "supp": "", "pdf_size": 743145, "gs_citation": 2201, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9642974458829870490&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, University of California Berkeley; Department of Computer Science, University of California Berkeley; Department of Computer Science, University of California Berkeley; Department of Computer Science, University of California Berkeley", "aff_domain": "berkeley.edu; ; ; ", "email": "berkeley.edu; ; ; ", "github": "https://github.com/modestyachts/CIFAR-10;https://github.com/modestyachts/ImageNetV2", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/recht19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Does Data Augmentation Lead to Positive Margin?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3770", "id": "3770", "author_site": "Shashank Rajput, Zhili Feng, Zachary Charles, Po-Ling Loh, Dimitris Papailiopoulos", "author": "Shashank Rajput; Zhili Feng; Zachary Charles; Po-Ling Loh; Dimitris Papailiopoulos", "abstract": "Data augmentation (DA) is commonly used during model training, as it significantly improves test error and model robustness. DA artificially expands the training set by applying random noise, rotations, crops, or even adversarial perturbations to the input data. Although DA is widely used, its capacity to provably improve robustness is not fully understood. In this work, we analyze the robustness that DA begets by quantifying the margin that DA enforces on empirical risk minimizers. We first focus on linear separators, and then a class of nonlinear models whose labeling is constant within small convex hulls of data points. 
We present lower bounds on the number of augmented data points required for non-zero margin, and show that commonly used DA techniques may only introduce significant margin after adding exponentially many points to the data set.", "bibtex": "@InProceedings{pmlr-v97-rajput19a,\n title = \t {Does Data Augmentation Lead to Positive Margin?},\n author = {Rajput, Shashank and Feng, Zhili and Charles, Zachary and Loh, Po-Ling and Papailiopoulos, Dimitris},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5321--5330},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rajput19a/rajput19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rajput19a.html},\n abstract = \t {Data augmentation (DA) is commonly used during model training, as it significantly improves test error and model robustness. DA artificially expands the training set by applying random noise, rotations, crops, or even adversarial perturbations to the input data. Although DA is widely used, its capacity to provably improve robustness is not fully understood. In this work, we analyze the robustness that DA begets by quantifying the margin that DA enforces on empirical risk minimizers. We first focus on linear separators, and then a class of nonlinear models whose labeling is constant within small convex hulls of data points. We present lower bounds on the number of augmented data points required for non-zero margin, and show that commonly used DA techniques may only introduce significant margin after adding exponentially many points to the data set.}\n}", "pdf": "http://proceedings.mlr.press/v97/rajput19a/rajput19a.pdf", "supp": "", "pdf_size": 716997, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17497626549920845799&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, University of Wisconsin-Madison; Department of Computer Science, University of Wisconsin-Madison; Department of Electrical and Computer Engineering, University of Wisconsin-Madison; Department of Statistics, University of Wisconsin-Madison; Department of Electrical and Computer Engineering, University of Wisconsin-Madison", "aff_domain": "wisc.edu;cs.wisc.edu; ; ; ", "email": "wisc.edu;cs.wisc.edu; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/rajput19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Domain Adaptation with Asymmetrically-Relaxed Distribution Alignment", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3752", "id": "3752", "author_site": "Yifan Wu, Ezra Winston, Divyansh Kaushik, Zachary Lipton", "author": "Yifan Wu; Ezra Winston; Divyansh Kaushik; Zachary Lipton", "abstract": "Domain adaptation addresses the common situation in which the target distribution generating our test data differs from the source distribution generating our training data. 
While absent assumptions, domain adaptation is impossible, strict conditions, e.g. covariate or label shift, enable principled algorithms. Recently-proposed domain-adversarial approaches consist of aligning source and target encodings, an approach often motivated as minimizing two (of three) terms in a theoretical bound on target error. Unfortunately, this minimization can cause arbitrary increases in the third term, a problem guaranteed to arise under shifting label distributions. We propose asymmetrically-relaxed distribution alignment, a new approach that overcomes some limitations of standard domain-adversarial algorithms. Moreover, we characterize precise assumptions under which our algorithm is theoretically principled and demonstrate empirical benefits on both synthetic and real datasets.", "bibtex": "@InProceedings{pmlr-v97-wu19f,\n title = \t {Domain Adaptation with Asymmetrically-Relaxed Distribution Alignment},\n author = {Wu, Yifan and Winston, Ezra and Kaushik, Divyansh and Lipton, Zachary},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6872--6881},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wu19f/wu19f.pdf},\n url = \t {https://proceedings.mlr.press/v97/wu19f.html},\n abstract = \t {Domain adaptation addresses the common situation in which the target distribution generating our test data differs from the source distribution generating our training data. While absent assumptions, domain adaptation is impossible, strict conditions, e.g. covariate or label shift, enable principled algorithms. Recently-proposed domain-adversarial approaches consist of aligning source and target encodings, an approach often motivated as minimizing two (of three) terms in a theoretical bound on target error. Unfortunately, this minimization can cause arbitrary increases in the third term, a problem guaranteed to arise under shifting label distributions. We propose asymmetrically-relaxed distribution alignment, a new approach that overcomes some limitations of standard domain-adversarial algorithms. 
Moreover, we characterize precise assumptions under which our algorithm is theoretically principled and demonstrate empirical benefits on both synthetic and real datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/wu19f/wu19f.pdf", "supp": "", "pdf_size": 1552734, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16326230687922404950&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Carnegie Mellon University; Carnegie Mellon University; Carnegie Mellon University; Carnegie Mellon University", "aff_domain": "andrew.cmu.edu; ; ; ", "email": "andrew.cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/wu19f.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Domain Agnostic Learning with Disentangled Representations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3571", "id": "3571", "author_site": "Xingchao Peng, Zijun Huang, Ximeng Sun, Kate Saenko", "author": "Xingchao Peng; Zijun Huang; Ximeng Sun; Kate Saenko", "abstract": "Unsupervised model transfer has the potential to greatly improve the generalizability of deep models to novel domains. Yet the current literature assumes that the separation of target data into distinct domains is known a priori. In this paper, we propose the task of Domain-Agnostic Learning (DAL): How to transfer knowledge from a labeled source domain to unlabeled data from arbitrary target domains? To tackle this problem, we devise a novel Deep Adversarial Disentangled Autoencoder (DADA) capable of disentangling domain-specific features from class identity. We demonstrate experimentally that when the target domain labels are unknown, DADA leads to state-of-the-art performance on several image classification datasets.", "bibtex": "@InProceedings{pmlr-v97-peng19b,\n title = \t {Domain Agnostic Learning with Disentangled Representations},\n author = {Peng, Xingchao and Huang, Zijun and Sun, Ximeng and Saenko, Kate},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5102--5112},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/peng19b/peng19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/peng19b.html},\n abstract = \t {Unsupervised model transfer has the potential to greatly improve the generalizability of deep models to novel domains. Yet the current literature assumes that the separation of target data into distinct domains is known a priori. In this paper, we propose the task of Domain-Agnostic Learning (DAL): How to transfer knowledge from a labeled source domain to unlabeled data from arbitrary target domains? To tackle this problem, we devise a novel Deep Adversarial Disentangled Autoencoder (DADA) capable of disentangling domain-specific features from class identity. 
We demonstrate experimentally that when the target domain labels are unknown, DADA leads to state-of-the-art performance on several image classification datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/peng19b/peng19b.pdf", "supp": "", "pdf_size": 4026073, "gs_citation": 343, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10085135045247935679&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Computer Science Department, Boston University; Columbia Unversity and MADO AI Research; Computer Science Department, Boston University; Computer Science Department, Boston University", "aff_domain": "bu.edu;columbia.edu; ;bu.edu", "email": "bu.edu;columbia.edu; ;bu.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/peng19b.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Boston University;Columbia University", "aff_unique_dep": "Computer Science Department;", "aff_unique_url": "https://www.bu.edu;https://www.columbia.edu", "aff_unique_abbr": "BU;Columbia", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Boston;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "DoubleSqueeze: Parallel Stochastic Gradient Descent with Double-pass Error-Compensated Compression", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4191", "id": "4191", "author_site": "Hanlin Tang, Chen Yu, Xiangru Lian, Tong Zhang, Ji Liu", "author": "Hanlin Tang; Chen Yu; Xiangru Lian; Tong Zhang; Ji Liu", "abstract": "A standard approach in large scale machine learning is distributed stochastic gradient training, which requires the computation of aggregated stochastic gradients over multiple nodes on a network. Communication is a major bottleneck in such applications, and in recent years, compressed stochastic gradient methods such as QSGD (quantized SGD) and sparse SGD have been proposed to reduce communication. It was also shown that error compensation can be combined with compression to achieve better convergence in a scheme that each node compresses its local stochastic gradient and broadcast the result to all other nodes over the network in a single pass. However, such a single pass broadcast approach is not realistic in many practical implementations. For example, under the popular parameter-server model for distributed learning, the worker nodes need to send the compressed local gradients to the parameter server, which performs the aggregation. The parameter server has to compress the aggregated stochastic gradient again before sending it back to the worker nodes. In this work, we provide a detailed analysis on this two-pass communication model, with error-compensated compression both on the worker nodes and on the parameter server. 
We show that the error-compensated stochastic gradient algorithm admits three very nice properties: 1) it is compatible with an", "bibtex": "@InProceedings{pmlr-v97-tang19d,\n title = \t {$\\texttt{DoubleSqueeze}$: Parallel Stochastic Gradient Descent with Double-pass Error-Compensated Compression},\n author = {Tang, Hanlin and Yu, Chen and Lian, Xiangru and Zhang, Tong and Liu, Ji},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6155--6165},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tang19d/tang19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/tang19d.html},\n abstract = \t {A standard approach in large scale machine learning is distributed stochastic gradient training, which requires the computation of aggregated stochastic gradients over multiple nodes on a network. Communication is a major bottleneck in such applications, and in recent years, compressed stochastic gradient methods such as QSGD (quantized SGD) and sparse SGD have been proposed to reduce communication. It was also shown that error compensation can be combined with compression to achieve better convergence in a scheme that each node compresses its local stochastic gradient and broadcast the result to all other nodes over the network in a single pass. However, such a single pass broadcast approach is not realistic in many practical implementations. For example, under the popular parameter-server model for distributed learning, the worker nodes need to send the compressed local gradients to the parameter server, which performs the aggregation. The parameter server has to compress the aggregated stochastic gradient again before sending it back to the worker nodes. In this work, we provide a detailed analysis on this two-pass communication model, with error-compensated compression both on the worker nodes and on the parameter server. 
We show that the error-compensated stochastic gradient algorithm admits three very nice properties: 1) it is compatible with an", "pdf": "http://proceedings.mlr.press/v97/tang19d/tang19d.pdf", "supp": "", "pdf_size": 2659558, "gs_citation": 289, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17437086628474726391&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of Rochester; University of Rochester; University of Rochester; Hong Kong University of Science and Technology; Seattle AI Lab, FeDA Lab, Kwai Inc.", "aff_domain": "ur.rochester.edu; ; ; ; ", "email": "ur.rochester.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/tang19d.html", "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Rochester;Hong Kong University of Science and Technology;Seattle AI Lab", "aff_unique_dep": ";;AI Lab", "aff_unique_url": "https://www.rochester.edu;https://www.ust.hk;", "aff_unique_abbr": "U of R;HKUST;SeaAI", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Doubly Robust Joint Learning for Recommendation on Data Missing Not at Random", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3717", "id": "3717", "author_site": "Xiaojie Wang, Rui Zhang, Yu Sun, Jianzhong Qi", "author": "Xiaojie Wang; Rui Zhang; Yu Sun; Jianzhong Qi", "abstract": "In recommender systems, usually the ratings of a user to most items are missing and a critical problem is that the missing ratings are often missing not at random (MNAR) in reality. It is widely acknowledged that MNAR ratings make it difficult to accurately predict the ratings and unbiasedly estimate the performance of rating prediction. Recent approaches use imputed errors to recover the prediction errors for missing ratings, or weight observed ratings with the propensities of being observed. These approaches can still be severely biased in performance estimation or suffer from the variance of the propensities. To overcome these limitations, we first propose an estimator that integrates the imputed errors and propensities in a doubly robust way to obtain unbiased performance estimation and alleviate the effect of the propensity variance. To achieve good performance guarantees, based on this estimator, we propose joint learning of rating prediction and error imputation, which outperforms the state-of-the-art approaches on four real-world datasets.", "bibtex": "@InProceedings{pmlr-v97-wang19n,\n title = \t {Doubly Robust Joint Learning for Recommendation on Data Missing Not at Random},\n author = {Wang, Xiaojie and Zhang, Rui and Sun, Yu and Qi, Jianzhong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6638--6647},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19n/wang19n.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19n.html},\n abstract = \t {In recommender systems, usually the ratings of a user to most items are missing and a critical problem is that the missing ratings are often missing not at random (MNAR) in reality. 
It is widely acknowledged that MNAR ratings make it difficult to accurately predict the ratings and unbiasedly estimate the performance of rating prediction. Recent approaches use imputed errors to recover the prediction errors for missing ratings, or weight observed ratings with the propensities of being observed. These approaches can still be severely biased in performance estimation or suffer from the variance of the propensities. To overcome these limitations, we first propose an estimator that integrates the imputed errors and propensities in a doubly robust way to obtain unbiased performance estimation and alleviate the effect of the propensity variance. To achieve good performance guarantees, based on this estimator, we propose joint learning of rating prediction and error imputation, which outperforms the state-of-the-art approaches on four real-world datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19n/wang19n.pdf", "supp": "", "pdf_size": 345953, "gs_citation": 282, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16074824664429207392&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Melbourne; University of Melbourne; Twitter; University of Melbourne", "aff_domain": "unimelb.edu.au;unimelb.edu.au;twitter.com;unimelb.edu.au", "email": "unimelb.edu.au;unimelb.edu.au;twitter.com;unimelb.edu.au", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/wang19n.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Melbourne;Twitter, Inc.", "aff_unique_dep": ";", "aff_unique_url": "https://www.unimelb.edu.au;https://twitter.com", "aff_unique_abbr": "UniMelb;Twitter", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Australia;United States" }, { "title": "Doubly-Competitive Distribution Estimation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3929", "id": "3929", "author_site": "Yi Hao, Alon Orlitsky", "author": "Yi Hao; Alon Orlitsky", "abstract": "Distribution estimation is a statistical-learning cornerstone. Its classical", "bibtex": "@InProceedings{pmlr-v97-hao19a,\n title = \t {Doubly-Competitive Distribution Estimation},\n author = {Hao, Yi and Orlitsky, Alon},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2614--2623},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hao19a/hao19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hao19a.html},\n abstract = \t {Distribution estimation is a statistical-learning cornerstone. 
Its classical", "pdf": "http://proceedings.mlr.press/v97/hao19a/hao19a.pdf", "supp": "", "pdf_size": 287578, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10788515885816980427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Electrical and Computer Engineering, University of California, San Diego, USA; Department of Electrical and Computer Engineering, University of California, San Diego, USA", "aff_domain": "eng.ucsd.edu;ucsd.edu", "email": "eng.ucsd.edu;ucsd.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/hao19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, San Diego", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.ucsd.edu", "aff_unique_abbr": "UCSD", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Dropout as a Structured Shrinkage Prior", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3900", "id": "3900", "author_site": "Eric Nalisnick, Jose Miguel Hernandez-Lobato, Padhraic Smyth", "author": "Eric Nalisnick; Jose Miguel Hernandez-Lobato; Padhraic Smyth", "abstract": "Dropout regularization of deep neural networks has been a mysterious yet effective tool to prevent overfitting. Explanations for its success range from the prevention of \"co-adapted\" weights to it being a form of cheap Bayesian inference. We propose a novel framework for understanding multiplicative noise in neural networks, considering continuous distributions as well as Bernoulli noise (i.e. dropout). We show that multiplicative noise induces structured shrinkage priors on a network\u2019s weights. We derive the equivalence through reparametrization properties of scale mixtures and without invoking any approximations. Given the equivalence, we then show that dropout\u2019s Monte Carlo training objective approximates marginal MAP estimation. We leverage these insights to propose a novel shrinkage framework for resnets, terming the prior \u2019automatic depth determination\u2019 as it is the natural analog of automatic relevance determination for network depth. Lastly, we investigate two inference strategies that improve upon the aforementioned MAP approximation in regression benchmarks.", "bibtex": "@InProceedings{pmlr-v97-nalisnick19a,\n title = \t {Dropout as a Structured Shrinkage Prior},\n author = {Nalisnick, Eric and Hernandez-Lobato, Jose Miguel and Smyth, Padhraic},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4712--4722},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nalisnick19a/nalisnick19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nalisnick19a.html},\n abstract = \t {Dropout regularization of deep neural networks has been a mysterious yet effective tool to prevent overfitting. Explanations for its success range from the prevention of \"co-adapted\" weights to it being a form of cheap Bayesian inference. We propose a novel framework for understanding multiplicative noise in neural networks, considering continuous distributions as well as Bernoulli noise (i.e. dropout). 
We show that multiplicative noise induces structured shrinkage priors on a network\u2019s weights. We derive the equivalence through reparametrization properties of scale mixtures and without invoking any approximations. Given the equivalence, we then show that dropout\u2019s Monte Carlo training objective approximates marginal MAP estimation. We leverage these insights to propose a novel shrinkage framework for resnets, terming the prior \u2019automatic depth determination\u2019 as it is the natural analog of automatic relevance determination for network depth. Lastly, we investigate two inference strategies that improve upon the aforementioned MAP approximation in regression benchmarks.}\n}", "pdf": "http://proceedings.mlr.press/v97/nalisnick19a/nalisnick19a.pdf", "supp": "", "pdf_size": 852711, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16208195687877220296&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Engineering, University of Cambridge, Cambridge, United Kingdom+Microsoft Research, Cambridge, United Kingdom+Alan Turing Institute; Department of Engineering, University of Cambridge, Cambridge, United Kingdom+Microsoft Research, Cambridge, United Kingdom+Alan Turing Institute; Department of Computer Science, University of California, Irvine, United States of America", "aff_domain": "eng.cam.ac.uk; ; ", "email": "eng.cam.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/nalisnick19a.html", "aff_unique_index": "0+1+2;0+1+2;3", "aff_unique_norm": "University of Cambridge;Microsoft;Alan Turing Institute;University of California, Irvine", "aff_unique_dep": "Department of Engineering;Microsoft Research;;Department of Computer Science", "aff_unique_url": "https://www.cam.ac.uk;https://www.microsoft.com/en-us/research;https://www.turing.ac.uk;https://www.uci.edu", "aff_unique_abbr": "Cambridge;MSR;ATI;UCI", "aff_campus_unique_index": "0+0;0+0;2", "aff_campus_unique": "Cambridge;;Irvine", "aff_country_unique_index": "0+0+0;0+0+0;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Dual Entangled Polynomial Code: Three-Dimensional Coding for Distributed Matrix Multiplication", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3960", "id": "3960", "author_site": "Pedro Soto, Jun Li, Xiaodi Fan", "author": "Pedro Soto; Jun Li; Xiaodi Fan", "abstract": "Matrix multiplication is a fundamental building block in various machine learning algorithms. When the matrix comes from a large dataset, the multiplication can be split into multiple tasks which calculate the multiplication of submatrices on different nodes. As some nodes may be stragglers, coding schemes have been proposed to tolerate stragglers in such distributed matrix multiplication. However, existing coding schemes typically split the matrices in only one or two dimensions, limiting their capabilities to handle large-scale matrix multiplication. Three-dimensional coding, however, does not have any code construction that achieves the optimal number of tasks required for decoding, with the best result achieved by entangled polynomial (EP) codes. In this paper, we propose dual entangled polynomial (DEP) codes that require around 25% fewer tasks than EP codes by executing two matrix multiplications on each task. 
With experiments in a real cloud environment, we show that DEP codes can also save the decoding overhead and memory consumption of tasks.", "bibtex": "@InProceedings{pmlr-v97-soto19a,\n title = \t {Dual Entangled Polynomial Code: Three-Dimensional Coding for Distributed Matrix Multiplication},\n author = {Soto, Pedro and Li, Jun and Fan, Xiaodi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5937--5945},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/soto19a/soto19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/soto19a.html},\n abstract = \t {Matrix multiplication is a fundamental building block in various machine learning algorithms. When the matrix comes from a large dataset, the multiplication can be split into multiple tasks which calculate the multiplication of submatrices on different nodes. As some nodes may be stragglers, coding schemes have been proposed to tolerate stragglers in such distributed matrix multiplication. However, existing coding schemes typically split the matrices in only one or two dimensions, limiting their capabilities to handle large-scale matrix multiplication. Three-dimensional coding, however, does not have any code construction that achieves the optimal number of tasks required for decoding, with the best result achieved by entangled polynomial (EP) codes. In this paper, we propose dual entangled polynomial (DEP) codes that require around 25% fewer tasks than EP codes by executing two matrix multiplications on each task. With experiments in a real cloud environment, we show that DEP codes can also save the decoding overhead and memory consumption of tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/soto19a/soto19a.pdf", "supp": "", "pdf_size": 2197551, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15433238452854889382&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";;", "aff_domain": ";;", "email": ";;", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/soto19a.html" }, { "title": "Dynamic Learning with Frequent New Product Launches: A Sequential Multinomial Logit Bandit Problem", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3954", "id": "3954", "author_site": "Junyu Cao, Wei Sun", "author": "Junyu Cao; Wei Sun", "abstract": "Motivated by the phenomenon that companies introduce new products to keep abreast with customers\u2019 rapidly changing tastes, we consider a novel online learning setting where a profit-maximizing seller needs to learn customers\u2019 preferences through offering recommendations, which may contain existing products and new products that are launched in the middle of a selling period. We propose a sequential multinomial logit (SMNL) model to characterize customers\u2019 behavior when product recommendations are presented in tiers. For the offline version with known customers\u2019 preferences, we propose a polynomial-time algorithm and characterize the properties of the optimal tiered product recommendation. For the online problem, we propose a learning algorithm and quantify its regret bound. Moreover, we extend the setting to incorporate a constraint which ensures every new product is learned to a given accuracy. 
Our results demonstrate the tier structure can be used to mitigate the risks associated with learning new products.", "bibtex": "@InProceedings{pmlr-v97-cao19a,\n title = \t {Dynamic Learning with Frequent New Product Launches: A Sequential Multinomial Logit Bandit Problem},\n author = {Cao, Junyu and Sun, Wei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {912--920},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cao19a/cao19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cao19a.html},\n abstract = \t {Motivated by the phenomenon that companies introduce new products to keep abreast with customers\u2019 rapidly changing tastes, we consider a novel online learning setting where a profit-maximizing seller needs to learn customers\u2019 preferences through offering recommendations, which may contain existing products and new products that are launched in the middle of a selling period. We propose a sequential multinomial logit (SMNL) model to characterize customers\u2019 behavior when product recommendations are presented in tiers. For the offline version with known customers\u2019 preferences, we propose a polynomial-time algorithm and characterize the properties of the optimal tiered product recommendation. For the online problem, we propose a learning algorithm and quantify its regret bound. Moreover, we extend the setting to incorporate a constraint which ensures every new product is learned to a given accuracy. Our results demonstrate the tier structure can be used to mitigate the risks associated with learning new products.}\n}", "pdf": "http://proceedings.mlr.press/v97/cao19a/cao19a.pdf", "supp": "", "pdf_size": 602888, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12726389061236945317&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Industrial Engineering and Operations Research, University of California, Berkeley, USA; IBM Research, Yorktown Height, New York, USA", "aff_domain": "berkeley.edu; ", "email": "berkeley.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/cao19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Berkeley;IBM", "aff_unique_dep": "Department of Industrial Engineering and Operations Research;IBM Research", "aff_unique_url": "https://www.berkeley.edu;https://www.ibm.com/research", "aff_unique_abbr": "UC Berkeley;IBM", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Berkeley;Yorktown Heights", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Dynamic Measurement Scheduling for Event Forecasting using Deep RL", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4215", "id": "4215", "author_site": "Chun-Hao (Kingsley) Chang, Mingjie Mai, Anna Goldenberg", "author": "Chun-Hao Chang; Mingjie Mai; Anna Goldenberg", "abstract": "Imagine a patient in critical condition. What and when should be measured to forecast detrimental events, especially under the budget constraints? We answer this question by deep reinforcement learning (RL) that jointly minimizes the measurement cost and maximizes predictive gain, by scheduling strategically-timed measurements. 
We learn our policy to be dynamically dependent on the patient\u2019s health history. To scale our framework to exponentially large action space, we distribute our reward in a sequential setting that makes the learning easier. In our simulation, our policy outperforms heuristic-based scheduling with higher predictive gain and lower cost. In a real-world ICU mortality prediction task (MIMIC3), our policies reduce the total number of measurements by 31% or improve predictive gain by a factor of 3 as compared to physicians, under the off-policy policy evaluation.", "bibtex": "@InProceedings{pmlr-v97-chang19a,\n title = \t {Dynamic Measurement Scheduling for Event Forecasting using Deep {RL}},\n author = {Chang, Chun-Hao and Mai, Mingjie and Goldenberg, Anna},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {951--960},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chang19a/chang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chang19a.html},\n abstract = \t {Imagine a patient in critical condition. What and when should be measured to forecast detrimental events, especially under the budget constraints? We answer this question by deep reinforcement learning (RL) that jointly minimizes the measurement cost and maximizes predictive gain, by scheduling strategically-timed measurements. We learn our policy to be dynamically dependent on the patient\u2019s health history. To scale our framework to exponentially large action space, we distribute our reward in a sequential setting that makes the learning easier. In our simulation, our policy outperforms heuristic-based scheduling with higher predictive gain and lower cost. 
In a real-world ICU mortality prediction task (MIMIC3), our policies reduce the total number of measurements by 31% or improve predictive gain by a factor of 3 as compared to physicians, under the off-policy policy evaluation.}\n}", "pdf": "http://proceedings.mlr.press/v97/chang19a/chang19a.pdf", "supp": "", "pdf_size": 2355099, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16682086403586827063&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "University of Toronto, Toronto, ON, Canada+Vector Institute, Toronto, ON, Canada+The Hospital for Sick Children, Toronto, ON, Canada; University of Toronto, Toronto, ON, Canada+Vector Institute, Toronto, ON, Canada+The Hospital for Sick Children, Toronto, ON, Canada; University of Toronto, Toronto, ON, Canada+Vector Institute, Toronto, ON, Canada+The Hospital for Sick Children, Toronto, ON, Canada", "aff_domain": "cs.toronto.edu; ; ", "email": "cs.toronto.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/chang19a.html", "aff_unique_index": "0+1+2;0+1+2;0+1+2", "aff_unique_norm": "University of Toronto;Vector Institute;Hospital for Sick Children", "aff_unique_dep": ";;", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://www.sickkids.ca", "aff_unique_abbr": "U of T;Vector Institute;", "aff_campus_unique_index": "0+0+0;0+0+0;0+0+0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0+0+0;0+0+0;0+0+0", "aff_country_unique": "Canada" }, { "title": "Dynamic Weights in Multi-Objective Deep Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3636", "id": "3636", "author_site": "Axel Abels, Diederik Roijers, Tom Lenaerts, Ann Now\u00e9, Denis Steckelmacher", "author": "Axel Abels; Diederik Roijers; Tom Lenaerts; Ann Now\u00e9; Denis Steckelmacher", "abstract": "Many real-world decision problems are characterized by multiple conflicting objectives which must be balanced based on their relative importance. In the dynamic weights setting the relative importance changes over time and specialized algorithms that deal with such change, such as a tabular Reinforcement Learning (RL) algorithm by Natarajan and Tadepalli (2005), are required. However, this earlier work is not feasible for RL settings that necessitate the use of function approximators. We generalize across weight changes and high-dimensional inputs by proposing a multi-objective Q-network whose outputs are conditioned on the relative importance of objectives and we introduce Diverse Experience Replay (DER) to counter the inherent non-stationarity of the Dynamic Weights setting. 
We perform an extensive experimental evaluation and compare our methods to adapted algorithms from Deep Multi-Task/Multi-Objective Reinforcement Learning and show that our proposed network in combination with DER dominates these adapted algorithms across weight change scenarios and problem domains.", "bibtex": "@InProceedings{pmlr-v97-abels19a,\n title = \t {Dynamic Weights in Multi-Objective Deep Reinforcement Learning},\n author = {Abels, Axel and Roijers, Diederik and Lenaerts, Tom and Now{\\'e}, Ann and Steckelmacher, Denis},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {11--20},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/abels19a/abels19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/abels19a.html},\n abstract = \t {Many real-world decision problems are characterized by multiple conflicting objectives which must be balanced based on their relative importance. In the dynamic weights setting the relative importance changes over time and specialized algorithms that deal with such change, such as a tabular Reinforcement Learning (RL) algorithm by Natarajan and Tadepalli (2005), are required. However, this earlier work is not feasible for RL settings that necessitate the use of function approximators. We generalize across weight changes and high-dimensional inputs by proposing a multi-objective Q-network whose outputs are conditioned on the relative importance of objectives and we introduce Diverse Experience Replay (DER) to counter the inherent non-stationarity of the Dynamic Weights setting. 
We perform an extensive experimental evaluation and compare our methods to adapted algorithms from Deep Multi-Task/Multi-Objective Reinforcement Learning and show that our proposed network in combination with DER dominates these adapted algorithms across weight change scenarios and problem domains.}\n}", "pdf": "http://proceedings.mlr.press/v97/abels19a/abels19a.pdf", "supp": "", "pdf_size": 1935841, "gs_citation": 231, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12040121315464946458&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 22, "aff": "Machine Learning Group, Universite Libre de Bruxelles, Brussels, Belgium+Artificial Intelligence Lab, Vrije Universiteit Brussel, Brussels, Belgium; Computational Intelligence, Vrije Universiteit Amsterdam, Amsterdam, the Netherlands; Machine Learning Group, Universite Libre de Bruxelles, Brussels, Belgium+Artificial Intelligence Lab, Vrije Universiteit Brussel, Brussels, Belgium; Machine Learning Group, Universite Libre de Bruxelles, Brussels, Belgium+Artificial Intelligence Lab, Vrije Universiteit Brussel, Brussels, Belgium; Machine Learning Group, Universite Libre de Bruxelles, Brussels, Belgium+Artificial Intelligence Lab, Vrije Universiteit Brussel, Brussels, Belgium", "aff_domain": "ulb.ac.be; ; ; ; ", "email": "ulb.ac.be; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/abels19a.html", "aff_unique_index": "0+1;2;0+1;0+1;0+1", "aff_unique_norm": "Universite Libre de Bruxelles;Vrije Universiteit Brussel;Vrije Universiteit Amsterdam", "aff_unique_dep": "Machine Learning Group;Artificial Intelligence Lab;Computational Intelligence", "aff_unique_url": "https://www.ulb.ac.be;https://www.vub.be;https://www.vu.nl", "aff_unique_abbr": "ULB;VUB;VU Amsterdam", "aff_campus_unique_index": "0+0;1;0+0;0+0;0+0", "aff_campus_unique": "Brussels;Amsterdam", "aff_country_unique_index": "0+0;1;0+0;0+0;0+0", "aff_country_unique": "Belgium;Netherlands" }, { "title": "EDDI: Efficient Dynamic Discovery of High-Value Information with Partial VAE", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3924", "id": "3924", "author_site": "Chao Ma, Sebastian Tschiatschek, Konstantina Palla, Jose Miguel Hernandez-Lobato, Sebastian Nowozin, Cheng Zhang", "author": "Chao Ma; Sebastian Tschiatschek; Konstantina Palla; Jose Miguel Hernandez-Lobato; Sebastian Nowozin; Cheng Zhang", "abstract": "Many real-life decision making situations allow further relevant information to be acquired at a specific cost, for example, in assessing the health status of a patient we may decide to take additional measurements such as diagnostic tests or imaging scans before making a final assessment. Acquiring more relevant information enables better decision making, but may be costly. How can we trade off the desire to make good decisions by acquiring further information with the cost of performing that acquisition? 
To this end, we propose a principled framework, named", "bibtex": "@InProceedings{pmlr-v97-ma19c,\n title = \t {{EDDI}: Efficient Dynamic Discovery of High-Value Information with Partial {VAE}},\n author = {Ma, Chao and Tschiatschek, Sebastian and Palla, Konstantina and Hernandez-Lobato, Jose Miguel and Nowozin, Sebastian and Zhang, Cheng},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4234--4243},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ma19c/ma19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/ma19c.html},\n abstract = \t {Many real-life decision making situations allow further relevant information to be acquired at a specific cost, for example, in assessing the health status of a patient we may decide to take additional measurements such as diagnostic tests or imaging scans before making a final assessment. Acquiring more relevant information enables better decision making, but may be costly. How can we trade off the desire to make good decisions by acquiring further information with the cost of performing that acquisition? To this end, we propose a principled framework, named", "pdf": "http://proceedings.mlr.press/v97/ma19c/ma19c.pdf", "supp": "", "pdf_size": 4186294, "gs_citation": 163, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7932877212524867960&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/ma19c.html" }, { "title": "ELF OpenGo: an analysis and open reimplementation of AlphaZero", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3883", "id": "3883", "author_site": "Yuandong Tian, Jerry Ma, Qucheng Gong, Shubho Sengupta, Zhuoyuan Chen, James Pinkerton, Larry Zitnick", "author": "Yuandong Tian; Jerry Ma; Qucheng Gong; Shubho Sengupta; Zhuoyuan Chen; James Pinkerton; Larry Zitnick", "abstract": "The AlphaGo, AlphaGo Zero, and AlphaZero series of algorithms are remarkable demonstrations of deep reinforcement learning\u2019s capabilities, achieving superhuman performance in the complex game of Go with progressively increasing autonomy. However, many obstacles remain in the understanding of and usability of these promising approaches by the research community. Toward elucidating unresolved mysteries and facilitating future research, we propose ELF OpenGo, an open-source reimplementation of the AlphaZero algorithm. ELF OpenGo is the first open-source Go AI to convincingly demonstrate superhuman performance with a perfect (20:0) record against global top professionals. We apply ELF OpenGo to conduct extensive ablation studies, and to identify and analyze numerous interesting phenomena in both the model training and in the gameplay inference procedures. 
Our code, models, selfplay datasets, and auxiliary data are publicly available.", "bibtex": "@InProceedings{pmlr-v97-tian19a,\n title = \t {{ELF} {O}pen{G}o: an analysis and open reimplementation of {A}lpha{Z}ero},\n author = {Tian, Yuandong and Ma, Jerry and Gong, Qucheng and Sengupta, Shubho and Chen, Zhuoyuan and Pinkerton, James and Zitnick, Larry},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6244--6253},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tian19a/tian19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tian19a.html},\n abstract = \t {The AlphaGo, AlphaGo Zero, and AlphaZero series of algorithms are remarkable demonstrations of deep reinforcement learning\u2019s capabilities, achieving superhuman performance in the complex game of Go with progressively increasing autonomy. However, many obstacles remain in the understanding of and usability of these promising approaches by the research community. Toward elucidating unresolved mysteries and facilitating future research, we propose ELF OpenGo, an open-source reimplementation of the AlphaZero algorithm. ELF OpenGo is the first open-source Go AI to convincingly demonstrate superhuman performance with a perfect (20:0) record against global top professionals. We apply ELF OpenGo to conduct extensive ablation studies, and to identify and analyze numerous interesting phenomena in both the model training and in the gameplay inference procedures. Our code, models, selfplay datasets, and auxiliary data are publicly available.}\n}", "pdf": "http://proceedings.mlr.press/v97/tian19a/tian19a.pdf", "supp": "", "pdf_size": 4912631, "gs_citation": 137, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9736512126040760893&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Facebook AI Research; Facebook AI Research; Facebook AI Research; Facebook AI Research; Facebook AI Research; Facebook AI Research; Facebook AI Research", "aff_domain": "fb.com;fb.com; ; ; ; ;fb.com", "email": "fb.com;fb.com; ; ; ; ;fb.com", "github": "", "project": "https://facebook.ai/developers/tools/elf-opengo", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/tian19a.html", "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Meta", "aff_unique_dep": "Facebook AI Research", "aff_unique_url": "https://research.facebook.com", "aff_unique_abbr": "FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "EMI: Exploration with Mutual Information", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4026", "id": "4026", "author_site": "Hyoungseok Kim, Jaekyeom Kim, Yeonwoo Jeong, Sergey Levine, Hyun Oh Song", "author": "Hyoungseok Kim; Jaekyeom Kim; Yeonwoo Jeong; Sergey Levine; Hyun Oh Song", "abstract": "Reinforcement learning algorithms struggle when the reward signal is very sparse. In these cases, naive random exploration methods essentially rely on a random walk to stumble onto a rewarding state. Recent works utilize intrinsic motivation to guide the exploration via generative models, predictive forward models, or discriminative modeling of novelty. 
We propose EMI, which is an exploration method that constructs embedding representation of states and actions that does not rely on generative decoding of the full observation but extracts predictive signals that can be used to guide exploration based on forward prediction in the representation space. Our experiments show competitive results on challenging locomotion tasks with continuous control and on image-based exploration tasks with discrete actions on Atari. The source code is available at https://github.com/snu-mllab/EMI.", "bibtex": "@InProceedings{pmlr-v97-kim19a,\n title = \t {{EMI}: Exploration with Mutual Information},\n author = {Kim, Hyoungseok and Kim, Jaekyeom and Jeong, Yeonwoo and Levine, Sergey and Song, Hyun Oh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3360--3369},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kim19a/kim19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kim19a.html},\n abstract = \t {Reinforcement learning algorithms struggle when the reward signal is very sparse. In these cases, naive random exploration methods essentially rely on a random walk to stumble onto a rewarding state. Recent works utilize intrinsic motivation to guide the exploration via generative models, predictive forward models, or discriminative modeling of novelty. We propose EMI, which is an exploration method that constructs embedding representation of states and actions that does not rely on generative decoding of the full observation but extracts predictive signals that can be used to guide exploration based on forward prediction in the representation space. Our experiments show competitive results on challenging locomotion tasks with continuous control and on image-based exploration tasks with discrete actions on Atari. 
The source code is available at https://github.com/snu-mllab/EMI.}\n}", "pdf": "http://proceedings.mlr.press/v97/kim19a/kim19a.pdf", "supp": "", "pdf_size": 7686340, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13544760374723251277&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Seoul National University, Department of Computer Science and Engineering+Neural Processing Research Center; Seoul National University, Department of Computer Science and Engineering+Neural Processing Research Center; Seoul National University, Department of Computer Science and Engineering+Neural Processing Research Center; UC Berkeley, Department of Electrical Engineering and Computer Sciences; Seoul National University, Department of Computer Science and Engineering+Neural Processing Research Center", "aff_domain": "snu.ac.kr; ; ; ;snu.ac.kr", "email": "snu.ac.kr; ; ; ;snu.ac.kr", "github": "https://github.com/snu-mllab/EMI", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/kim19a.html", "aff_unique_index": "0+1;0+1;0+1;2;0+1", "aff_unique_norm": "Seoul National University;Neural Processing Research Center;University of California, Berkeley", "aff_unique_dep": "Department of Computer Science and Engineering;;Department of Electrical Engineering and Computer Sciences", "aff_unique_url": "https://www.snu.ac.kr;;https://www.berkeley.edu", "aff_unique_abbr": "SNU;;UC Berkeley", "aff_campus_unique_index": ";;;1;", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;2;0", "aff_country_unique": "South Korea;;United States" }, { "title": "Efficient Amortised Bayesian Inference for Hierarchical and Nonlinear Dynamical Systems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4101", "id": "4101", "author_site": "Ted Meeds, Geoffrey Roeder, Paul Grant, Andrew Phillips, Neil Dalchau", "author": "Geoffrey Roeder; Paul Grant; Andrew Phillips; Neil Dalchau; Edward Meeds", "abstract": "We introduce a flexible, scalable Bayesian inference framework for nonlinear dynamical systems characterised by distinct and hierarchical variability at the individual, group, and population levels. Our model class is a generalisation of nonlinear mixed-effects (NLME) dynamical systems, the statistical workhorse for many experimental sciences. We cast parameter inference as stochastic optimisation of an end-to-end differentiable, block-conditional variational autoencoder. We specify the dynamics of the data-generating process as an ordinary differential equation (ODE) such that both the ODE and its solver are fully differentiable. This model class is highly flexible: the ODE right-hand sides can be a mixture of user-prescribed or \"white-box\" sub-components and neural network or \"black-box\" sub-components. Using stochastic optimisation, our amortised inference algorithm could seamlessly scale up to massive data collection pipelines (common in labs with robotic automation). Finally, our framework supports interpretability with respect to the underlying dynamics, as well as predictive generalization to unseen combinations of group components (also called \u201czero-shot\" learning). 
We empirically validate our method by predicting the dynamic behaviour of bacteria that were genetically engineered to function as biosensors.", "bibtex": "@InProceedings{pmlr-v97-meeds19a,\n title = \t {Efficient Amortised {B}ayesian Inference for Hierarchical and Nonlinear Dynamical Systems},\n author = {Roeder, Geoffrey, and Grant, Paul and Phillips, Andrew and Dalchau, Neil, and Meeds, Edward},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4445--4455},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/meeds19a/meeds19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/meeds19a.html},\n abstract = \t {We introduce a flexible, scalable Bayesian inference framework for nonlinear dynamical systems characterised by distinct and hierarchical variability at the individual, group, and population levels. Our model class is a generalisation of nonlinear mixed-effects (NLME) dynamical systems, the statistical workhorse for many experimental sciences. We cast parameter inference as stochastic optimisation of an end-to-end differentiable, block-conditional variational autoencoder. We specify the dynamics of the data-generating process as an ordinary differential equation (ODE) such that both the ODE and its solver are fully differentiable. This model class is highly flexible: the ODE right-hand sides can be a mixture of user-prescribed or \"white-box\" sub-components and neural network or \"black-box\" sub-components. Using stochastic optimisation, our amortised inference algorithm could seamlessly scale up to massive data collection pipelines (common in labs with robotic automation). Finally, our framework supports interpretability with respect to the underlying dynamics, as well as predictive generalization to unseen combinations of group components (also called \u201czero-shot\" learning). 
We empirically validate our method by predicting the dynamic behaviour of bacteria that were genetically engineered to function as biosensors.}\n}", "pdf": "http://proceedings.mlr.press/v97/meeds19a/meeds19a.pdf", "supp": "", "pdf_size": 5442006, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1247732699292692241&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Microsoft Research, Cambridge, United Kingdom+Princeton University, Princeton, United States of America; Microsoft Research, Cambridge, United Kingdom; Microsoft Research, Cambridge, United Kingdom; Microsoft Research, Cambridge, United Kingdom; Microsoft Research, Cambridge, United Kingdom", "aff_domain": "princeton.edu; ; ;microsoft.com;microsoft.com", "email": "princeton.edu; ; ;microsoft.com;microsoft.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/meeds19a.html", "aff_unique_index": "0+1;0;0;0;0", "aff_unique_norm": "Microsoft;Princeton University", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.princeton.edu", "aff_unique_abbr": "MSR;Princeton", "aff_campus_unique_index": "0+1;0;0;0;0", "aff_campus_unique": "Cambridge;Princeton", "aff_country_unique_index": "0+1;0;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Efficient Dictionary Learning with Gradient Descent", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3789", "id": "3789", "author_site": "Dar Gilboa, Sam Buchanan, John Wright", "author": "Dar Gilboa; Sam Buchanan; John Wright", "abstract": "Randomly initialized first-order optimization algorithms are the method of choice for solving many high-dimensional nonconvex problems in machine learning, yet general theoretical guarantees cannot rule out convergence to critical points of poor objective value. For some highly structured nonconvex problems however, the success of gradient descent can be understood by studying the geometry of the objective. We study one such problem \u2013 complete orthogonal dictionary learning, and provide converge guarantees for randomly initialized gradient descent to the neighborhood of a global optimum. The resulting rates scale as low order polynomials in the dimension even though the objective possesses an exponential number of saddle points. 
This efficient convergence can be viewed as a consequence of negative curvature normal to the stable manifolds associated with saddle points, and we provide evidence that this feature is shared by other nonconvex problems of importance as well.", "bibtex": "@InProceedings{pmlr-v97-gilboa19a,\n title = \t {Efficient Dictionary Learning with Gradient Descent},\n author = {Gilboa, Dar and Buchanan, Sam and Wright, John},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2252--2259},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gilboa19a/gilboa19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gilboa19a.html},\n abstract = \t {Randomly initialized first-order optimization algorithms are the method of choice for solving many high-dimensional nonconvex problems in machine learning, yet general theoretical guarantees cannot rule out convergence to critical points of poor objective value. For some highly structured nonconvex problems however, the success of gradient descent can be understood by studying the geometry of the objective. We study one such problem \u2013 complete orthogonal dictionary learning, and provide converge guarantees for randomly initialized gradient descent to the neighborhood of a global optimum. The resulting rates scale as low order polynomials in the dimension even though the objective possesses an exponential number of saddle points. This efficient convergence can be viewed as a consequence of negative curvature normal to the stable manifolds associated with saddle points, and we provide evidence that this feature is shared by other nonconvex problems of importance as well.}\n}", "pdf": "http://proceedings.mlr.press/v97/gilboa19a/gilboa19a.pdf", "supp": "", "pdf_size": 1382367, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17820633958201377013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Neuroscience, Columbia University + Data Science Institute, Columbia University; Department of Electrical Engineering, Columbia University + Data Science Institute, Columbia University; Department of Electrical Engineering, Columbia University + Data Science Institute, Columbia University", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/gilboa19a.html", "aff_unique_index": "0+0;0+0;0+0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "Department of Neuroscience", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "Efficient Full-Matrix Adaptive Regularization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4302", "id": "4302", "author_site": "Naman Agarwal, Brian Bullins, Xinyi Chen, Elad Hazan, Karan Singh, Cyril Zhang, Yi Zhang", "author": "Naman Agarwal; Brian Bullins; Xinyi Chen; Elad Hazan; Karan Singh; Cyril Zhang; Yi Zhang", "abstract": "Adaptive regularization methods pre-multiply a descent direction by a preconditioning matrix. 
Due to the large number of parameters of machine learning problems, full-matrix preconditioning methods are prohibitively expensive. We show how to modify full-matrix adaptive regularization in order to make it practical and effective. We also provide a novel theoretical analysis for adaptive regularization in", "bibtex": "@InProceedings{pmlr-v97-agarwal19b,\n title = \t {Efficient Full-Matrix Adaptive Regularization},\n author = {Agarwal, Naman and Bullins, Brian and Chen, Xinyi and Hazan, Elad and Singh, Karan and Zhang, Cyril and Zhang, Yi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {102--110},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/agarwal19b/agarwal19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/agarwal19b.html},\n abstract = \t {Adaptive regularization methods pre-multiply a descent direction by a preconditioning matrix. Due to the large number of parameters of machine learning problems, full-matrix preconditioning methods are prohibitively expensive. We show how to modify full-matrix adaptive regularization in order to make it practical and effective. We also provide a novel theoretical analysis for adaptive regularization in", "pdf": "http://proceedings.mlr.press/v97/agarwal19b/agarwal19b.pdf", "supp": "", "pdf_size": 1602882, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14945182817146909743&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Google AI Princeton; Google AI Princeton + Department of Computer Science, Princeton University; Google AI Princeton; Google AI Princeton + Department of Computer Science, Princeton University; Google AI Princeton + Department of Computer Science, Princeton University; Google AI Princeton + Department of Computer Science, Princeton University; Google AI Princeton + Department of Computer Science, Princeton University", "aff_domain": "cs.princeton.edu; ; ; ; ; ; ", "email": "cs.princeton.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/agarwal19b.html", "aff_unique_index": "0;0+1;0;0+1;0+1;0+1;0+1", "aff_unique_norm": "Google;Princeton University", "aff_unique_dep": "Google AI;Department of Computer Science", "aff_unique_url": "https://ai.google;https://www.princeton.edu", "aff_unique_abbr": "Google AI;Princeton", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Princeton;", "aff_country_unique_index": "0;0+0;0;0+0;0+0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "Efficient Nonconvex Regularized Tensor Completion with Structure-aware Proximal Iterations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3790", "id": "3790", "author_site": "Quanming Yao, James Kwok, Bo Han", "author": "Quanming Yao; James Tin-Yau Kwok; Bo Han", "abstract": "Nonconvex regularizers have been successfully used in low-rank matrix learning. In this paper, we extend this to the more challenging problem of low-rank tensor completion. Based on the proximal average algorithm, we develop an efficient solver that avoids expensive tensor folding and unfolding. A special \u201csparse plus low-rank\" structure, which is essential for fast computation of individual proximal steps, is maintained throughout the iterations. 
We also incorporate adaptive momentum to further speed up empirical convergence. Convergence results to critical points are provided under smoothness and Kurdyka-Lojasiewicz conditions. Experimental results on a number of synthetic and real-world data sets show that the proposed algorithm is more efficient in both time and space, and is also more accurate than existing approaches.", "bibtex": "@InProceedings{pmlr-v97-yao19a,\n title = \t {Efficient Nonconvex Regularized Tensor Completion with Structure-aware Proximal Iterations},\n author = {Yao, Quanming and Kwok, James Tin-Yau and Han, Bo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7035--7044},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yao19a/yao19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yao19a.html},\n abstract = \t {Nonconvex regularizers have been successfully used in low-rank matrix learning. In this paper, we extend this to the more challenging problem of low-rank tensor completion. Based on the proximal average algorithm, we develop an efficient solver that avoids expensive tensor folding and unfolding. A special \u201csparse plus low-rank\" structure, which is essential for fast computation of individual proximal steps, is maintained throughout the iterations. We also incorporate adaptive momentum to further speed up empirical convergence. Convergence results to critical points are provided under smoothness and Kurdyka-Lojasiewicz conditions. Experimental results on a number of synthetic and real-world data sets show that the proposed algorithm is more efficient in both time and space, and is also more accurate than existing approaches.}\n}", "pdf": "http://proceedings.mlr.press/v97/yao19a/yao19a.pdf", "supp": "", "pdf_size": 570403, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4618035128739261425&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "14Paradigm Inc, Beijing, China; 2Department of Computer Science and Engineering, Hong Kong University of Science and Technology, Hong Kong; 3Center for Advanced Intelligence Project, RIKEN, Japan", "aff_domain": "4paradigm.com; ; ", "email": "4paradigm.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/yao19a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "14Paradigm Inc;Hong Kong University of Science and Technology;RIKEN", "aff_unique_dep": ";Department of Computer Science and Engineering;Center for Advanced Intelligence Project", "aff_unique_url": ";https://www.ust.hk;https://www.riken.jp", "aff_unique_abbr": ";HKUST;RIKEN", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1", "aff_country_unique": "China;Japan" }, { "title": "Efficient Off-Policy Meta-Reinforcement Learning via Probabilistic Context Variables", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3641", "id": "3641", "author_site": "Kate Rakelly, Aurick Zhou, Chelsea Finn, Sergey Levine, Deirdre Quillen", "author": "Kate Rakelly; Aurick Zhou; Chelsea Finn; Sergey Levine; Deirdre Quillen", "abstract": "Deep reinforcement learning algorithms require large amounts of experience to learn an individual task. 
While meta-reinforcement learning (meta-RL) algorithms can enable agents to learn new skills from small amounts of experience, several major challenges preclude their practicality. Current methods rely heavily on on-policy experience, limiting their sample efficiency. They also lack mechanisms to reason about task uncertainty when adapting to new tasks, limiting their effectiveness on sparse reward problems. In this paper, we address these challenges by developing an off-policy meta-RL algorithm that disentangles task inference and control. In our approach, we perform online probabilistic filtering of latent task variables to infer how to solve a new task from small amounts of experience. This probabilistic interpretation enables posterior sampling for structured and efficient exploration. We demonstrate how to integrate these task variables with off-policy RL algorithms to achieve both meta-training and adaptation efficiency. Our method outperforms prior algorithms in sample efficiency by 20-100X as well as in asymptotic performance on several meta-RL benchmarks.", "bibtex": "@InProceedings{pmlr-v97-rakelly19a,\n title = \t {Efficient Off-Policy Meta-Reinforcement Learning via Probabilistic Context Variables},\n author = {Rakelly, Kate and Zhou, Aurick and Finn, Chelsea and Levine, Sergey and Quillen, Deirdre},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5331--5340},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rakelly19a/rakelly19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rakelly19a.html},\n abstract = \t {Deep reinforcement learning algorithms require large amounts of experience to learn an individual task. While meta-reinforcement learning (meta-RL) algorithms can enable agents to learn new skills from small amounts of experience, several major challenges preclude their practicality. Current methods rely heavily on on-policy experience, limiting their sample efficiency. They also lack mechanisms to reason about task uncertainty when adapting to new tasks, limiting their effectiveness on sparse reward problems. In this paper, we address these challenges by developing an off-policy meta-RL algorithm that disentangles task inference and control. In our approach, we perform online probabilistic filtering of latent task variables to infer how to solve a new task from small amounts of experience. This probabilistic interpretation enables posterior sampling for structured and efficient exploration. We demonstrate how to integrate these task variables with off-policy RL algorithms to achieve both meta-training and adaptation efficiency. 
Our method outperforms prior algorithms in sample efficiency by 20-100X as well as in asymptotic performance on several meta-RL benchmarks.}\n}", "pdf": "http://proceedings.mlr.press/v97/rakelly19a/rakelly19a.pdf", "supp": "", "pdf_size": 842432, "gs_citation": 838, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15379570585451726919&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "EECS Department, UC Berkeley; EECS Department, UC Berkeley; EECS Department, UC Berkeley; EECS Department, UC Berkeley; EECS Department, UC Berkeley", "aff_domain": "eecs.berkeley.edu;eecs.berkeley.edu; ; ; ", "email": "eecs.berkeley.edu;eecs.berkeley.edu; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/rakelly19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "EECS Department", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Efficient On-Device Models using Neural Projections", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3794", "id": "3794", "author": "Sujith Ravi", "abstract": "Many applications involving visual and language understanding can be effectively solved using deep neural networks. Even though these techniques achieve state-of-the-art results, it is very challenging to apply them on devices with limited memory and computational capacity such as mobile phones, smart watches and IoT. We propose a neural projection approach for training compact on-device neural networks. We introduce \"projection\" networks that use locality-sensitive projections to generate compact binary representations and learn small neural networks with computationally efficient operations. We design a joint optimization framework where the projection network can be trained from scratch or leverage existing larger neural networks such as feed-forward NNs, CNNs or RNNs. The trained neural projection network can be directly used for inference on device at low memory and computation cost. We demonstrate the effectiveness of this as a general-purpose approach for significantly shrinking memory requirements of different types of neural networks while preserving good accuracy on multiple visual and text classification tasks.", "bibtex": "@InProceedings{pmlr-v97-ravi19a,\n title = \t {Efficient On-Device Models using Neural Projections},\n author = {Ravi, Sujith},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5370--5379},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ravi19a/ravi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ravi19a.html},\n abstract = \t {Many applications involving visual and language understanding can be effectively solved using deep neural networks. Even though these techniques achieve state-of-the-art results, it is very challenging to apply them on devices with limited memory and computational capacity such as mobile phones, smart watches and IoT. We propose a neural projection approach for training compact on-device neural networks. 
We introduce \"projection\" networks that use locality-sensitive projections to generate compact binary representations and learn small neural networks with computationally efficient operations. We design a joint optimization framework where the projection network can be trained from scratch or leverage existing larger neural networks such as feed-forward NNs, CNNs or RNNs. The trained neural projection network can be directly used for inference on device at low memory and computation cost. We demonstrate the effectiveness of this as a general-purpose approach for significantly shrinking memory requirements of different types of neural networks while preserving good accuracy on multiple visual and text classification tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/ravi19a/ravi19a.pdf", "supp": "", "pdf_size": 756825, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10990437854478167541&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Google Research, Mountain View, California, USA", "aff_domain": "google.com", "email": "google.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/ravi19a.html", "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Efficient Training of BERT by Progressively Stacking", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3819", "id": "3819", "author_site": "Linyuan Gong, Di He, Zhuohan Li, Tao Qin, Liwei Wang, Tie-Yan Liu", "author": "Linyuan Gong; Di He; Zhuohan Li; Tao Qin; Liwei Wang; Tieyan Liu", "abstract": "Unsupervised pre-training is popularly used in natural language processing. By designing proper unsupervised prediction tasks, a deep neural network can be trained and shown to be effective in many downstream tasks. As the data is usually adequate, the model for pre-training is generally huge and contains millions of parameters. Therefore, the training efficiency becomes a critical issue even when using high-performance hardware. In this paper, we explore an efficient training method for the state-of-the-art bidirectional Transformer (BERT) model. By visualizing the self-attention distribution of different layers at different positions in a well-trained BERT model, we find that in most layers, the self-attention distribution will concentrate locally around its position and the start-of-sentence token. Motivating from this, we propose the stacking algorithm to transfer knowledge from a shallow model to a deep model; then we apply stacking progressively to accelerate BERT training. 
The experimental results showed that the models trained by our training strategy achieve similar performance to models trained from scratch, but our algorithm is much faster.", "bibtex": "@InProceedings{pmlr-v97-gong19a,\n title = \t {Efficient Training of {BERT} by Progressively Stacking},\n author = {Gong, Linyuan and He, Di and Li, Zhuohan and Qin, Tao and Wang, Liwei and Liu, Tieyan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2337--2346},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gong19a/gong19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gong19a.html},\n abstract = \t {Unsupervised pre-training is popularly used in natural language processing. By designing proper unsupervised prediction tasks, a deep neural network can be trained and shown to be effective in many downstream tasks. As the data is usually adequate, the model for pre-training is generally huge and contains millions of parameters. Therefore, the training efficiency becomes a critical issue even when using high-performance hardware. In this paper, we explore an efficient training method for the state-of-the-art bidirectional Transformer (BERT) model. By visualizing the self-attention distribution of different layers at different positions in a well-trained BERT model, we find that in most layers, the self-attention distribution will concentrate locally around its position and the start-of-sentence token. Motivating from this, we propose the stacking algorithm to transfer knowledge from a shallow model to a deep model; then we apply stacking progressively to accelerate BERT training. 
The experimental results showed that the models trained by our training strategy achieve similar performance to models trained from scratch, but our algorithm is much faster.}\n}", "pdf": "http://proceedings.mlr.press/v97/gong19a/gong19a.pdf", "supp": "", "pdf_size": 1686471, "gs_citation": 179, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5986521549795555603&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Key Laboratory of Machine Perception, MOE, School of EECS, Peking University; Key Laboratory of Machine Perception, MOE, School of EECS, Peking University; Key Laboratory of Machine Perception, MOE, School of EECS, Peking University + Microsoft Research; Microsoft Research; Center for Data Science, Peking University, Beijing Institute of Big Data Research; Microsoft Research", "aff_domain": "pku.edu.cn;microsoft.com;pku.edu.cn;microsoft.com;pku.edu.cn;microsoft.com", "email": "pku.edu.cn;microsoft.com;pku.edu.cn;microsoft.com;pku.edu.cn;microsoft.com", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/gong19a.html", "aff_unique_index": "0;0;0+1;1;0;1", "aff_unique_norm": "Peking University;Microsoft", "aff_unique_dep": "School of EECS;Microsoft Research", "aff_unique_url": "http://www.pku.edu.cn;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Peking U;MSR", "aff_campus_unique_index": ";1", "aff_campus_unique": ";Beijing", "aff_country_unique_index": "0;0;0+1;1;0;1", "aff_country_unique": "China;United States" }, { "title": "Efficient learning of smooth probability functions from Bernoulli tests with guarantees", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4143", "id": "4143", "author_site": "Paul Rolland, Ali Kavis, Alexander Niklaus Immer, Adish Singla, Volkan Cevher", "author": "Paul Rolland; Ali Kavis; Alexander Immer; Adish Singla; Volkan Cevher", "abstract": "We study the fundamental problem of learning an unknown, smooth probability function via point-wise Bernoulli tests. We provide a scalable algorithm for efficiently solving this problem with rigorous guarantees. In particular, we prove the convergence rate of our posterior update rule to the true probability function in L2-norm. Moreover, we allow the Bernoulli tests to depend on contextual features, and provide a modified inference engine with provable guarantees for this novel setting. Numerical results show that the empirical convergence rates match the theory, and illustrate the superiority of our approach in handling contextual features over the state-of-the-art.", "bibtex": "@InProceedings{pmlr-v97-rolland19a,\n title = \t {Efficient learning of smooth probability functions from Bernoulli tests with guarantees},\n author = {Rolland, Paul and Kavis, Ali and Immer, Alexander and Singla, Adish and Cevher, Volkan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5459--5467},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rolland19a/rolland19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rolland19a.html},\n abstract = \t {We study the fundamental problem of learning an unknown, smooth probability function via point-wise Bernoulli tests. We provide a scalable algorithm for efficiently solving this problem with rigorous guarantees. 
In particular, we prove the convergence rate of our posterior update rule to the true probability function in L2-norm. Moreover, we allow the Bernoulli tests to depend on contextual features, and provide a modified inference engine with provable guarantees for this novel setting. Numerical results show that the empirical convergence rates match the theory, and illustrate the superiority of our approach in handling contextual features over the state-of-the-art.}\n}", "pdf": "http://proceedings.mlr.press/v97/rolland19a/rolland19a.pdf", "supp": "", "pdf_size": 654193, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17583594805526330120&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Ecole Polytechnique F\u00e9d\u00e9r\u00e1le de Lausanne, Switzerland; Ecole Polytechnique F\u00e9d\u00e9r\u00e1le de Lausanne, Switzerland; Ecole Polytechnique F\u00e9d\u00e9r\u00e1le de Lausanne, Switzerland; Max Planck Institute for Software Systems, Saarbr\u00fccken, Germany; Ecole Polytechnique F\u00e9d\u00e9r\u00e1le de Lausanne, Switzerland", "aff_domain": "epfl.ch; ; ; ; ", "email": "epfl.ch; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/rolland19a.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "EPFL;Max Planck Institute for Software Systems", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.mpi-sws.org", "aff_unique_abbr": "EPFL;MPI-SWS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Saarbr\u00fccken", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "Switzerland;Germany" }, { "title": "Efficient optimization of loops and limits with randomized telescoping sums", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3852", "id": "3852", "author_site": "Alex Beatson, Ryan P Adams", "author": "Alex Beatson; Ryan P Adams", "abstract": "We consider optimization problems in which the objective requires an inner loop with many steps or is the limit of a sequence of increasingly costly approximations. Meta-learning, training recurrent neural networks, and optimization of the solutions to differential equations are all examples of optimization problems with this character. In such problems, it can be expensive to compute the objective function value and its gradient, but truncating the loop or using less accurate approximations can induce biases that damage the overall solution. We propose", "bibtex": "@InProceedings{pmlr-v97-beatson19a,\n title = \t {Efficient optimization of loops and limits with randomized telescoping sums},\n author = {Beatson, Alex and Adams, Ryan P},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {534--543},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/beatson19a/beatson19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/beatson19a.html},\n abstract = \t {We consider optimization problems in which the objective requires an inner loop with many steps or is the limit of a sequence of increasingly costly approximations. Meta-learning, training recurrent neural networks, and optimization of the solutions to differential equations are all examples of optimization problems with this character. 
In such problems, it can be expensive to compute the objective function value and its gradient, but truncating the loop or using less accurate approximations can induce biases that damage the overall solution. We propose", "pdf": "http://proceedings.mlr.press/v97/beatson19a/beatson19a.pdf", "supp": "", "pdf_size": 2041981, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3412668840791342029&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Computer Science, Princeton University; Department of Computer Science, Princeton University", "aff_domain": "cs.princeton.edu; ", "email": "cs.princeton.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/beatson19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3661", "id": "3661", "author_site": "Mingxing Tan, Quoc Le", "author": "Mingxing Tan; Quoc Le", "abstract": "Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are given. In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on MobileNets and ResNet. To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. In particular, our EfficientNet-B7 achieves stateof-the-art 84.4% top-1 / 97.1% top-5 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet (Huang et al., 2018). Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flower (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters.", "bibtex": "@InProceedings{pmlr-v97-tan19a,\n title = \t {{E}fficient{N}et: Rethinking Model Scaling for Convolutional Neural Networks},\n author = {Tan, Mingxing and Le, Quoc},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6105--6114},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tan19a/tan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tan19a.html},\n abstract = \t {Convolutional Neural Networks (ConvNets) are commonly developed at a fixed resource budget, and then scaled up for better accuracy if more resources are given. 
In this paper, we systematically study model scaling and identify that carefully balancing network depth, width, and resolution can lead to better performance. Based on this observation, we propose a new scaling method that uniformly scales all dimensions of depth/width/resolution using a simple yet highly effective compound coefficient. We demonstrate the effectiveness of this method on MobileNets and ResNet. To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. In particular, our EfficientNet-B7 achieves stateof-the-art 84.4% top-1 / 97.1% top-5 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet (Huang et al., 2018). Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flower (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters.}\n}", "pdf": "http://proceedings.mlr.press/v97/tan19a/tan19a.pdf", "supp": "", "pdf_size": 770292, "gs_citation": 28829, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5472015514843683656&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Google Research, Brain Team, Mountain View, CA; Google Research, Brain Team, Mountain View, CA", "aff_domain": "google.com; ", "email": "google.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/tan19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research, Brain Team", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "EigenDamage: Structured Pruning in the Kronecker-Factored Eigenbasis", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4306", "id": "4306", "author_site": "Chaoqi Wang, Roger Grosse, Sanja Fidler, Guodong Zhang", "author": "Chaoqi Wang; Roger Grosse; Sanja Fidler; Guodong Zhang", "abstract": "Reducing the test time resource requirements of a neural network while preserving test accuracy is crucial for running inference on resource-constrained devices. To achieve this goal, we introduce a novel network reparameterization based on the Kronecker-factored eigenbasis (KFE), and then apply Hessian-based structured pruning methods in this basis. As opposed to existing Hessian-based pruning algorithms which do pruning in parameter coordinates, our method works in the KFE where different weights are approximately independent, enabling accurate pruning and fast computation. We demonstrate empirically the effectiveness of the proposed method through extensive experiments. In particular, we highlight that the improvements are especially significant for more challenging datasets and networks. 
With negligible loss of accuracy, an iterative-pruning version gives a 10x reduction in model size and a 8x reduction in FLOPs on wide ResNet32.", "bibtex": "@InProceedings{pmlr-v97-wang19g,\n title = \t {{E}igen{D}amage: Structured Pruning in the {K}ronecker-Factored Eigenbasis},\n author = {Wang, Chaoqi and Grosse, Roger and Fidler, Sanja and Zhang, Guodong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6566--6575},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19g/wang19g.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19g.html},\n abstract = \t {Reducing the test time resource requirements of a neural network while preserving test accuracy is crucial for running inference on resource-constrained devices. To achieve this goal, we introduce a novel network reparameterization based on the Kronecker-factored eigenbasis (KFE), and then apply Hessian-based structured pruning methods in this basis. As opposed to existing Hessian-based pruning algorithms which do pruning in parameter coordinates, our method works in the KFE where different weights are approximately independent, enabling accurate pruning and fast computation. We demonstrate empirically the effectiveness of the proposed method through extensive experiments. In particular, we highlight that the improvements are especially significant for more challenging datasets and networks. With negligible loss of accuracy, an iterative-pruning version gives a 10x reduction in model size and a 8x reduction in FLOPs on wide ResNet32.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19g/wang19g.pdf", "supp": "", "pdf_size": 2402771, "gs_citation": 149, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15048467937573583684&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, University of Toronto, Toronto, Canada+Vector Institute, Toronto, Canada; Department of Computer Science, University of Toronto, Toronto, Canada+Vector Institute, Toronto, Canada; Department of Computer Science, University of Toronto, Toronto, Canada+Vector Institute, Toronto, Canada+NVIDIA; Department of Computer Science, University of Toronto, Toronto, Canada+Vector Institute, Toronto, Canada", "aff_domain": "cs.toronto.edu; ; ;cs.toronto.edu", "email": "cs.toronto.edu; ; ;cs.toronto.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/wang19g.html", "aff_unique_index": "0+1;0+1;0+1+2;0+1", "aff_unique_norm": "University of Toronto;Vector Institute;NVIDIA", "aff_unique_dep": "Department of Computer Science;;NVIDIA Corporation", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai;https://www.nvidia.com", "aff_unique_abbr": "U of T;Vector Institute;NVIDIA", "aff_campus_unique_index": "0+0;0+0;0+0;0+0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0+0;0+0;0+0+1;0+0", "aff_country_unique": "Canada;United States" }, { "title": "Emerging Convolutions for Generative Normalizing Flows", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4088", "id": "4088", "author_site": "Emiel Hoogeboom, Rianne Van den Berg, Max Welling", "author": "Emiel Hoogeboom; Rianne Van Den Berg; Max Welling", "abstract": "Generative flows are attractive because they admit 
exact likelihood optimization and efficient image synthesis. Recently, Kingma & Dhariwal (2018) demonstrated with Glow that generative flows are capable of generating high quality images. We generalize the 1 {\\texttimes} 1 convolutions proposed in Glow to invertible d {\\texttimes} d convolutions, which are more flexible since they operate on both channel and spatial axes. We propose two methods to produce invertible convolutions, that have receptive fields identical to standard convolutions: Emerging convolutions are obtained by chaining specific autoregressive convolutions, and periodic convolutions are decoupled in the frequency domain. Our experiments show that the flexibility of d {\\texttimes} d convolutions significantly improves the performance of generative flow models on galaxy images, CIFAR10 and ImageNet.", "bibtex": "@InProceedings{pmlr-v97-hoogeboom19a,\n title = \t {Emerging Convolutions for Generative Normalizing Flows},\n author = {Hoogeboom, Emiel and Van Den Berg, Rianne and Welling, Max},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2771--2780},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hoogeboom19a/hoogeboom19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hoogeboom19a.html},\n abstract = \t {Generative flows are attractive because they admit exact likelihood optimization and efficient image synthesis. Recently, Kingma & Dhariwal (2018) demonstrated with Glow that generative flows are capable of generating high quality images. We generalize the 1 {\\texttimes} 1 convolutions proposed in Glow to invertible d {\\texttimes} d convolutions, which are more flexible since they operate on both channel and spatial axes. We propose two methods to produce invertible convolutions, that have receptive fields identical to standard convolutions: Emerging convolutions are obtained by chaining specific autoregressive convolutions, and periodic convolutions are decoupled in the frequency domain. 
Our experiments show that the flexibility of d {\\texttimes} d convolutions significantly improves the performance of generative flow models on galaxy images, CIFAR10 and ImageNet.}\n}", "pdf": "http://proceedings.mlr.press/v97/hoogeboom19a/hoogeboom19a.pdf", "supp": "", "pdf_size": 1153702, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17212015756232898698&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "UvA-Bosch Delta Lab, University of Amsterdam, Netherlands; University of Amsterdam, Netherlands; Canadian Institute for Advanced Research (CIFAR)", "aff_domain": "uva.nl; ; ", "email": "uva.nl; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/hoogeboom19a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Amsterdam;Canadian Institute for Advanced Research", "aff_unique_dep": "UvA-Bosch Delta Lab;", "aff_unique_url": "https://www.uva.nl;https://www.cifar.ca", "aff_unique_abbr": "UvA;CIFAR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Netherlands;Canada" }, { "title": "Empirical Analysis of Beam Search Performance Degradation in Neural Sequence Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4064", "id": "4064", "author_site": "Eldan Cohen, Christopher Beck", "author": "Eldan Cohen; Christopher Beck", "abstract": "Beam search is the most popular inference algorithm for decoding neural sequence models. Unlike greedy search, beam search allows for non-greedy local decisions that can potentially lead to a sequence with a higher overall probability. However, work on a number of applications has found that the quality of the highest probability hypothesis found by beam search degrades with large beam widths. We perform an empirical study of the behavior of beam search across three sequence synthesis tasks. We find that increasing the beam width leads to sequences that are disproportionately based on early, very low probability tokens that are followed by a sequence of tokens with higher (conditional) probability. We show that, empirically, such sequences are more likely to have a lower evaluation score than lower probability sequences without this pattern. Using the notion of search discrepancies from heuristic search, we hypothesize that large discrepancies are the cause of the performance degradation. We show that this hypothesis generalizes the previous ones in machine translation and image captioning. To validate our hypothesis, we show that constraining beam search to avoid large discrepancies eliminates the performance degradation.", "bibtex": "@InProceedings{pmlr-v97-cohen19a,\n title = \t {Empirical Analysis of Beam Search Performance Degradation in Neural Sequence Models},\n author = {Cohen, Eldan and Beck, Christopher},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1290--1299},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cohen19a/cohen19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cohen19a.html},\n abstract = \t {Beam search is the most popular inference algorithm for decoding neural sequence models. 
Unlike greedy search, beam search allows for non-greedy local decisions that can potentially lead to a sequence with a higher overall probability. However, work on a number of applications has found that the quality of the highest probability hypothesis found by beam search degrades with large beam widths. We perform an empirical study of the behavior of beam search across three sequence synthesis tasks. We find that increasing the beam width leads to sequences that are disproportionately based on early, very low probability tokens that are followed by a sequence of tokens with higher (conditional) probability. We show that, empirically, such sequences are more likely to have a lower evaluation score than lower probability sequences without this pattern. Using the notion of search discrepancies from heuristic search, we hypothesize that large discrepancies are the cause of the performance degradation. We show that this hypothesis generalizes the previous ones in machine translation and image captioning. To validate our hypothesis, we show that constraining beam search to avoid large discrepancies eliminates the performance degradation.}\n}", "pdf": "http://proceedings.mlr.press/v97/cohen19a/cohen19a.pdf", "supp": "", "pdf_size": 853907, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1197310612675824214&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Mechanical and Industrial Engineering, University of Toronto, Toronto, Canada; Department of Mechanical and Industrial Engineering, University of Toronto, Toronto, Canada", "aff_domain": "mie.utoronto.ca; ", "email": "mie.utoronto.ca; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/cohen19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "Department of Mechanical and Industrial Engineering", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "End-to-End Probabilistic Inference for Nonstationary Audio Analysis", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4030", "id": "4030", "author_site": "William Wilkinson, Michael Riis Andersen, Joshua D. Reiss, Dan Stowell, Arno Solin", "author": "William Wilkinson; Michael Andersen; Joshua D. Reiss; Dan Stowell; Arno Solin", "abstract": "A typical audio signal processing pipeline includes multiple disjoint analysis stages, including calculation of a time-frequency representation followed by spectrogram-based feature analysis. We show how time-frequency analysis and nonnegative matrix factorisation can be jointly formulated as a spectral mixture Gaussian process model with nonstationary priors over the amplitude variance parameters. Further, we formulate this nonlinear model\u2019s state space representation, making it amenable to infinite-horizon Gaussian process regression with approximate inference via expectation propagation, which scales linearly in the number of time steps and quadratically in the state dimensionality. By doing so, we are able to process audio signals with hundreds of thousands of data points. 
We demonstrate, on various tasks with empirical data, how this inference scheme outperforms more standard techniques that rely on extended Kalman filtering.", "bibtex": "@InProceedings{pmlr-v97-wilkinson19a,\n title = \t {End-to-End Probabilistic Inference for Nonstationary Audio Analysis},\n author = {Wilkinson, William and Andersen, Michael and Reiss, Joshua D. and Stowell, Dan and Solin, Arno},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6776--6785},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wilkinson19a/wilkinson19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/wilkinson19a.html},\n abstract = \t {A typical audio signal processing pipeline includes multiple disjoint analysis stages, including calculation of a time-frequency representation followed by spectrogram-based feature analysis. We show how time-frequency analysis and nonnegative matrix factorisation can be jointly formulated as a spectral mixture Gaussian process model with nonstationary priors over the amplitude variance parameters. Further, we formulate this nonlinear model\u2019s state space representation, making it amenable to infinite-horizon Gaussian process regression with approximate inference via expectation propagation, which scales linearly in the number of time steps and quadratically in the state dimensionality. By doing so, we are able to process audio signals with hundreds of thousands of data points. We demonstrate, on various tasks with empirical data, how this inference scheme outperforms more standard techniques that rely on extended Kalman filtering.}\n}", "pdf": "http://proceedings.mlr.press/v97/wilkinson19a/wilkinson19a.pdf", "supp": "", "pdf_size": 446303, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3996942805590354959&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 16, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/wilkinson19a.html" }, { "title": "Entropic GANs meet VAEs: A Statistical Approach to Compute Sample Likelihoods in GANs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3909", "id": "3909", "author_site": "Yogesh Balaji, Hamed Hassani, Rama Chellappa, Soheil Feizi", "author": "Yogesh Balaji; Hamed Hassani; Rama Chellappa; Soheil Feizi", "abstract": "Building on the success of deep learning, two modern approaches to learn a probability model from the data are Generative Adversarial Networks (GANs) and Variational AutoEncoders (VAEs). VAEs consider an explicit probability model for the data and compute a generative distribution by maximizing a variational lower-bound on the log-likelihood function. GANs, however, compute a generative model by minimizing a distance between observed and generated probability distributions without considering an explicit model for the observed data. The lack of having explicit probability models in GANs prohibits computation of sample likelihoods in their frameworks and limits their use in statistical inference problems. In this work, we resolve this issue by constructing an explicit probability model that can be used to compute sample likelihood statistics in GANs. 
In particular, we prove that under this probability model, a family of Wasserstein GANs with an entropy regularization can be viewed as a generative model that maximizes a variational lower-bound on average sample log likelihoods, an approach that VAEs are based on. This result makes a principled connection between two modern generative models, namely GANs and VAEs. In addition to the aforementioned theoretical results, we compute likelihood statistics for GANs trained on Gaussian, MNIST, SVHN, CIFAR-10 and LSUN datasets. Our numerical results validate the proposed theory.", "bibtex": "@InProceedings{pmlr-v97-balaji19a,\n title = \t {Entropic {GAN}s meet {VAE}s: A Statistical Approach to Compute Sample Likelihoods in {GAN}s},\n author = {Balaji, Yogesh and Hassani, Hamed and Chellappa, Rama and Feizi, Soheil},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {414--423},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/balaji19a/balaji19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/balaji19a.html},\n abstract = \t {Building on the success of deep learning, two modern approaches to learn a probability model from the data are Generative Adversarial Networks (GANs) and Variational AutoEncoders (VAEs). VAEs consider an explicit probability model for the data and compute a generative distribution by maximizing a variational lower-bound on the log-likelihood function. GANs, however, compute a generative model by minimizing a distance between observed and generated probability distributions without considering an explicit model for the observed data. The lack of having explicit probability models in GANs prohibits computation of sample likelihoods in their frameworks and limits their use in statistical inference problems. In this work, we resolve this issue by constructing an explicit probability model that can be used to compute sample likelihood statistics in GANs. In particular, we prove that under this probability model, a family of Wasserstein GANs with an entropy regularization can be viewed as a generative model that maximizes a variational lower-bound on average sample log likelihoods, an approach that VAEs are based on. This result makes a principled connection between two modern generative models, namely GANs and VAEs. In addition to the aforementioned theoretical results, we compute likelihood statistics for GANs trained on Gaussian, MNIST, SVHN, CIFAR-10 and LSUN datasets. 
Our numerical results validate the proposed theory.}\n}", "pdf": "http://proceedings.mlr.press/v97/balaji19a/balaji19a.pdf", "supp": "", "pdf_size": 1709617, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4502964466526434508&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, University of Maryland, College Park; Department of Electrical and Systems Engineering, University of Pennsylvania; Department of Electrical and Computer Engineering, University of Maryland, College Park; Department of Computer Science, University of Maryland, College Park", "aff_domain": "cs.umd.edu; ; ;cs.umd.edu", "email": "cs.umd.edu; ; ;cs.umd.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/balaji19a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "University of Maryland, College Park;University of Pennsylvania", "aff_unique_dep": "Department of Computer Science;Department of Electrical and Systems Engineering", "aff_unique_url": "https://www/umd.edu;https://www.upenn.edu", "aff_unique_abbr": "UMD;UPenn", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "College Park;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Equivariant Transformer Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3908", "id": "3908", "author_site": "Kai Sheng Tai, Peter Bailis, Gregory Valiant", "author": "Kai Sheng Tai; Peter Bailis; Gregory Valiant", "abstract": "How can prior knowledge on the transformation invariances of a domain be incorporated into the architecture of a neural network? We propose Equivariant Transformers (ETs), a family of differentiable image-to-image mappings that improve the robustness of models towards pre-defined continuous transformation groups. Through the use of specially-derived canonical coordinate systems, ETs incorporate functions that are equivariant by construction with respect to these transformations. We show empirically that ETs can be flexibly composed to improve model robustness towards more complicated transformation groups in several parameters. On a real-world image classification task, ETs improve the sample efficiency of ResNet classifiers, achieving relative improvements in error rate of up to 15% in the limited data regime while increasing model parameter count by less than 1%.", "bibtex": "@InProceedings{pmlr-v97-tai19a,\n title = \t {Equivariant Transformer Networks},\n author = {Tai, Kai Sheng and Bailis, Peter and Valiant, Gregory},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6086--6095},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tai19a/tai19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tai19a.html},\n abstract = \t {How can prior knowledge on the transformation invariances of a domain be incorporated into the architecture of a neural network? We propose Equivariant Transformers (ETs), a family of differentiable image-to-image mappings that improve the robustness of models towards pre-defined continuous transformation groups. 
Through the use of specially-derived canonical coordinate systems, ETs incorporate functions that are equivariant by construction with respect to these transformations. We show empirically that ETs can be flexibly composed to improve model robustness towards more complicated transformation groups in several parameters. On a real-world image classification task, ETs improve the sample efficiency of ResNet classifiers, achieving relative improvements in error rate of up to 15% in the limited data regime while increasing model parameter count by less than 1%.}\n}", "pdf": "http://proceedings.mlr.press/v97/tai19a/tai19a.pdf", "supp": "", "pdf_size": 533035, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=740882376854558881&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Stanford University; Stanford University; Stanford University", "aff_domain": "cs.stanford.edu; ; ", "email": "cs.stanford.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/tai19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Error Feedback Fixes SignSGD and other Gradient Compression Schemes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4158", "id": "4158", "author_site": "Sai Praneeth Reddy Karimireddy, Quentin Rebjock, Sebastian Stich, Martin Jaggi", "author": "Sai Praneeth Karimireddy; Quentin Rebjock; Sebastian Stich; Martin Jaggi", "abstract": "Sign-based algorithms (e.g. signSGD) have been proposed as a biased gradient compression technique to alleviate the communication bottleneck in training large neural networks across multiple workers. We show simple convex counter-examples where signSGD does not converge to the optimum. Further, even when it does converge, signSGD may generalize poorly when compared with SGD. These issues arise because of the biased nature of the sign compression operator. We then show that using error-feedback, i.e. incorporating the error made by the compression operator into the next step, overcomes these issues. We prove that our algorithm (EF-SGD) with arbitrary compression operator achieves the same rate of convergence as SGD without any additional assumptions. Thus EF-SGD achieves gradient compression for free. Our experiments thoroughly substantiate the theory.", "bibtex": "@InProceedings{pmlr-v97-karimireddy19a,\n title = \t {Error Feedback Fixes {S}ign{SGD} and other Gradient Compression Schemes},\n author = {Karimireddy, Sai Praneeth and Rebjock, Quentin and Stich, Sebastian and Jaggi, Martin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3252--3261},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/karimireddy19a/karimireddy19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/karimireddy19a.html},\n abstract = \t {Sign-based algorithms (e.g. signSGD) have been proposed as a biased gradient compression technique to alleviate the communication bottleneck in training large neural networks across multiple workers. 
We show simple convex counter-examples where signSGD does not converge to the optimum. Further, even when it does converge, signSGD may generalize poorly when compared with SGD. These issues arise because of the biased nature of the sign compression operator. We then show that using error-feedback, i.e. incorporating the error made by the compression operator into the next step, overcomes these issues. We prove that our algorithm (EF-SGD) with arbitrary compression operator achieves the same rate of convergence as SGD without any additional assumptions. Thus EF-SGD achieves gradient compression for free. Our experiments thoroughly substantiate the theory.}\n}", "pdf": "http://proceedings.mlr.press/v97/karimireddy19a/karimireddy19a.pdf", "supp": "", "pdf_size": 1640197, "gs_citation": 616, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15067189376913629578&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "EPFL, Switzerland; EPFL, Switzerland; EPFL, Switzerland; EPFL, Switzerland", "aff_domain": "epfl.ch; ; ; ", "email": "epfl.ch; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/karimireddy19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Escaping Saddle Points with Adaptive Gradient Methods", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4059", "id": "4059", "author_site": "Matthew Staib, Sashank Jakkam Reddi, Satyen Kale, Sanjiv Kumar, Suvrit Sra", "author": "Matthew Staib; Sashank Reddi; Satyen Kale; Sanjiv Kumar; Suvrit Sra", "abstract": "Adaptive methods such as Adam and RMSProp are widely used in deep learning but are not well understood. In this paper, we seek a crisp, clean and precise characterization of their behavior in nonconvex settings. To this end, we first provide a novel view of adaptive methods as preconditioned SGD, where the preconditioner is estimated in an online manner. By studying the preconditioner on its own, we elucidate its purpose: it rescales the stochastic gradient noise to be isotropic near stationary points, which helps escape saddle points. Furthermore, we show that adaptive methods can efficiently estimate the aforementioned preconditioner. By gluing together these two components, we provide the first (to our knowledge) second-order convergence result for any adaptive method. 
The key insight from our analysis is that, compared to SGD, adaptive methods escape saddle points faster, and can converge faster overall to second-order stationary points.", "bibtex": "@InProceedings{pmlr-v97-staib19a,\n title = \t {Escaping Saddle Points with Adaptive Gradient Methods},\n author = {Staib, Matthew and Reddi, Sashank and Kale, Satyen and Kumar, Sanjiv and Sra, Suvrit},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5956--5965},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/staib19a/staib19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/staib19a.html},\n abstract = \t {Adaptive methods such as Adam and RMSProp are widely used in deep learning but are not well understood. In this paper, we seek a crisp, clean and precise characterization of their behavior in nonconvex settings. To this end, we first provide a novel view of adaptive methods as preconditioned SGD, where the preconditioner is estimated in an online manner. By studying the preconditioner on its own, we elucidate its purpose: it rescales the stochastic gradient noise to be isotropic near stationary points, which helps escape saddle points. Furthermore, we show that adaptive methods can efficiently estimate the aforementioned preconditioner. By gluing together these two components, we provide the first (to our knowledge) second-order convergence result for any adaptive method. The key insight from our analysis is that, compared to SGD, adaptive methods escape saddle points faster, and can converge faster overall to second-order stationary points.}\n}", "pdf": "http://proceedings.mlr.press/v97/staib19a/staib19a.pdf", "supp": "", "pdf_size": 636281, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14397912646560188130&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "MIT EECS + Google Research New York; Google Research New York; Google Research New York; Google Research New York; MIT EECS", "aff_domain": "mit.edu; ; ; ; ", "email": "mit.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/staib19a.html", "aff_unique_index": "0+1;1;1;1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Google", "aff_unique_dep": "Electrical Engineering & Computer Science;Google Research", "aff_unique_url": "https://web.mit.edu;https://research.google", "aff_unique_abbr": "MIT;Google Research", "aff_campus_unique_index": "0+1;1;1;1;0", "aff_campus_unique": "Cambridge;New York", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Estimate Sequences for Variance-Reduced Stochastic Composite Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4062", "id": "4062", "author_site": "Andrei Kulunchakov, Julien Mairal", "author": "Andrei Kulunchakov; Julien Mairal", "abstract": "In this paper, we propose a unified view of gradient-based algorithms for stochastic convex composite optimization by extending the concept of estimate sequence introduced by Nesterov. 
This point of view covers the stochastic gradient descent method, variants of the approaches SAGA, SVRG, and has several advantages: (i) we provide a generic proof of convergence for the aforementioned methods; (ii) we show that this SVRG variant is adaptive to strong convexity; (iii) we naturally obtain new algorithms with the same guarantees; (iv) we derive generic strategies to make these algorithms robust to stochastic noise, which is useful when data is corrupted by small random perturbations. Finally, we show that this viewpoint is useful to obtain new accelerated algorithms in the sense of Nesterov.", "bibtex": "@InProceedings{pmlr-v97-kulunchakov19a,\n title = \t {Estimate Sequences for Variance-Reduced Stochastic Composite Optimization},\n author = {Kulunchakov, Andrei and Mairal, Julien},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3541--3550},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kulunchakov19a/kulunchakov19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kulunchakov19a.html},\n abstract = \t {In this paper, we propose a unified view of gradient-based algorithms for stochastic convex composite optimization by extending the concept of estimate sequence introduced by Nesterov. This point of view covers the stochastic gradient descent method, variants of the approaches SAGA, SVRG, and has several advantages: (i) we provide a generic proof of convergence for the aforementioned methods; (ii) we show that this SVRG variant is adaptive to strong convexity; (iii) we naturally obtain new algorithms with the same guarantees; (iv) we derive generic strategies to make these algorithms robust to stochastic noise, which is useful when data is corrupted by small random perturbations. Finally, we show that this viewpoint is useful to obtain new accelerated algorithms in the sense of Nesterov.}\n}", "pdf": "http://proceedings.mlr.press/v97/kulunchakov19a/kulunchakov19a.pdf", "supp": "", "pdf_size": 432176, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1475332924934890495&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France; Univ. 
Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France", "aff_domain": "inria.fr;inria.fr", "email": "inria.fr;inria.fr", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/kulunchakov19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Universite Grenoble Alpes", "aff_unique_dep": "", "aff_unique_url": "https://www.univ-grenoble-alpes.fr", "aff_unique_abbr": "UGA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Grenoble", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Estimating Information Flow in Deep Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4128", "id": "4128", "author_site": "Ziv Goldfeld, Ewout van den Berg, Kristjan Greenewald, Igor Melnyk, Nam Nguyen, Brian Kingsbury, Yury Polyanskiy", "author": "Ziv Goldfeld; Ewout Van Den Berg; Kristjan Greenewald; Igor Melnyk; Nam Nguyen; Brian Kingsbury; Yury Polyanskiy", "abstract": "We study the estimation of the mutual information I(X;T_$\\ell$) between the input X to a deep neural network (DNN) and the output vector T_$\\ell$ of its $\\ell$-th hidden layer (an \u201cinternal representation\u201d). Focusing on feedforward networks with fixed weights and noisy internal representations, we develop a rigorous framework for accurate estimation of I(X;T_$\\ell$). By relating I(X;T_$\\ell$) to information transmission over additive white Gaussian noise channels, we reveal that compression, i.e. reduction in I(X;T_$\\ell$) over the course of training, is driven by progressive geometric clustering of the representations of samples from the same class. Experimental results verify this connection. Finally, we shift focus to purely deterministic DNNs, where I(X;T_$\\ell$) is provably vacuous, and show that nevertheless, these models also cluster inputs belonging to the same class. The binning-based approximation of I(X;T_$\\ell$) employed in past works to measure compression is identified as a measure of clustering, thus clarifying that these experiments were in fact tracking the same clustering phenomenon. Leveraging the clustering perspective, we provide new evidence that compression and generalization may not be causally related and discuss potential future research ideas.", "bibtex": "@InProceedings{pmlr-v97-goldfeld19a,\n title = \t {Estimating Information Flow in Deep Neural Networks},\n author = {Goldfeld, Ziv and Van Den Berg, Ewout and Greenewald, Kristjan and Melnyk, Igor and Nguyen, Nam and Kingsbury, Brian and Polyanskiy, Yury},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2299--2308},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/goldfeld19a/goldfeld19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/goldfeld19a.html},\n abstract = \t {We study the estimation of the mutual information I(X;T_$\\ell$) between the input X to a deep neural network (DNN) and the output vector T_$\\ell$ of its $\\ell$-th hidden layer (an \u201cinternal representation\u201d). Focusing on feedforward networks with fixed weights and noisy internal representations, we develop a rigorous framework for accurate estimation of I(X;T_$\\ell$). 
By relating I(X;T_$\\ell$) to information transmission over additive white Gaussian noise channels, we reveal that compression, i.e. reduction in I(X;T_$\\ell$) over the course of training, is driven by progressive geometric clustering of the representations of samples from the same class. Experimental results verify this connection. Finally, we shift focus to purely deterministic DNNs, where I(X;T_$\\ell$) is provably vacuous, and show that nevertheless, these models also cluster inputs belonging to the same class. The binning-based approximation of I(X;T_$\\ell$) employed in past works to measure compression is identified as a measure of clustering, thus clarifying that these experiments were in fact tracking the same clustering phenomenon. Leveraging the clustering perspective, we provide new evidence that compression and generalization may not be causally related and discuss potential future research ideas.}\n}", "pdf": "http://proceedings.mlr.press/v97/goldfeld19a/goldfeld19a.pdf", "supp": "", "pdf_size": 0, "gs_citation": 181, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16281237775524920214&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/goldfeld19a.html" }, { "title": "Explaining Deep Neural Networks with a Polynomial Time Algorithm for Shapley Value Approximation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3901", "id": "3901", "author_site": "Marco Ancona, Cengiz Oztireli, Markus Gross", "author": "Marco Ancona; Cengiz Oztireli; Markus Gross", "abstract": "The problem of explaining the behavior of deep neural networks has recently gained a lot of attention. While several attribution methods have been proposed, most come without strong theoretical foundations, which raises questions about their reliability. On the other hand, the literature on cooperative game theory suggests Shapley values as a unique way of assigning relevance scores such that certain desirable properties are satisfied. Unfortunately, the exact evaluation of Shapley values is prohibitively expensive, exponential in the number of input features. In this work, by leveraging recent results on uncertainty propagation, we propose a novel, polynomial-time approximation of Shapley values in deep neural networks. We show that our method produces significantly better approximations of Shapley values than existing state-of-the-art attribution methods.", "bibtex": "@InProceedings{pmlr-v97-ancona19a,\n title = \t {Explaining Deep Neural Networks with a Polynomial Time Algorithm for Shapley Value Approximation},\n author = {Ancona, Marco and Oztireli, Cengiz and Gross, Markus},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {272--281},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ancona19a/ancona19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ancona19a.html},\n abstract = \t {The problem of explaining the behavior of deep neural networks has recently gained a lot of attention. While several attribution methods have been proposed, most come without strong theoretical foundations, which raises questions about their reliability. 
On the other hand, the literature on cooperative game theory suggests Shapley values as a unique way of assigning relevance scores such that certain desirable properties are satisfied. Unfortunately, the exact evaluation of Shapley values is prohibitively expensive, exponential in the number of input features. In this work, by leveraging recent results on uncertainty propagation, we propose a novel, polynomial-time approximation of Shapley values in deep neural networks. We show that our method produces significantly better approximations of Shapley values than existing state-of-the-art attribution methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/ancona19a/ancona19a.pdf", "supp": "", "pdf_size": 680381, "gs_citation": 319, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14974458415676786852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Computer Science, ETH Zurich, Switzerland; Disney Research Zurich, Switzerland; Department of Computer Science, ETH Zurich, Switzerland + Disney Research Zurich, Switzerland", "aff_domain": "inf.ethz.ch; ; ", "email": "inf.ethz.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ancona19a.html", "aff_unique_index": "0;1;0+1", "aff_unique_norm": "ETH Zurich;Disney Research", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.ethz.ch;https://www.disneyresearch.com", "aff_unique_abbr": "ETHZ;Disney Research", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "Switzerland" }, { "title": "Exploiting Worker Correlation for Label Aggregation in Crowdsourcing", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4013", "id": "4013", "author_site": "Yuan Li, Benjamin Rubinstein, Trevor Cohn", "author": "Yuan Li; Benjamin Rubinstein; Trevor Cohn", "abstract": "Crowdsourcing has emerged as a core component of data science pipelines. From collected noisy worker labels, aggregation models that incorporate worker reliability parameters aim to infer a latent true annotation. In this paper, we argue that existing crowdsourcing approaches do not sufficiently model worker correlations observed in practical settings; we propose in response an enhanced Bayesian classifier combination (EBCC) model, with inference based on a mean-field variational approach. An introduced mixture of intra-class reliabilities\u2014connected to tensor decomposition and item clustering\u2014induces inter-worker correlation. EBCC does not suffer the limitations of existing correlation models: intractable marginalisation of missing labels and poor scaling to large worker cohorts. 
Extensive empirical comparison on 17 real-world datasets sees EBCC achieving the highest mean accuracy across 10 benchmark crowdsourcing methods.", "bibtex": "@InProceedings{pmlr-v97-li19i,\n title = \t {Exploiting Worker Correlation for Label Aggregation in Crowdsourcing},\n author = {Li, Yuan and Rubinstein, Benjamin and Cohn, Trevor},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3886--3895},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19i/li19i.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19i.html},\n abstract = \t {Crowdsourcing has emerged as a core component of data science pipelines. From collected noisy worker labels, aggregation models that incorporate worker reliability parameters aim to infer a latent true annotation. In this paper, we argue that existing crowdsourcing approaches do not sufficiently model worker correlations observed in practical settings; we propose in response an enhanced Bayesian classifier combination (EBCC) model, with inference based on a mean-field variational approach. An introduced mixture of intra-class reliabilities\u2014connected to tensor decomposition and item clustering\u2014induces inter-worker correlation. EBCC does not suffer the limitations of existing correlation models: intractable marginalisation of missing labels and poor scaling to large worker cohorts. Extensive empirical comparison on 17 real-world datasets sees EBCC achieving the highest mean accuracy across 10 benchmark crowdsourcing methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19i/li19i.pdf", "supp": "", "pdf_size": 512171, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9931591841767604034&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Computing and Information Systems, University of Melbourne, Victoria, Australia; School of Computing and Information Systems, University of Melbourne, Victoria, Australia; School of Computing and Information Systems, University of Melbourne, Victoria, Australia", "aff_domain": "student.unimelb.edu.au;unimelb.edu.au;unimelb.edu.au", "email": "student.unimelb.edu.au;unimelb.edu.au;unimelb.edu.au", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/li19i.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Melbourne", "aff_unique_dep": "School of Computing and Information Systems", "aff_unique_url": "https://www.unimelb.edu.au", "aff_unique_abbr": "UniMelb", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Melbourne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Australia" }, { "title": "Exploiting structure of uncertainty for efficient matroid semi-bandits", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3850", "id": "3850", "author_site": "Pierre Perrault, Vianney Perchet, Michal Valko", "author": "Pierre Perrault; Vianney Perchet; Michal Valko", "abstract": "We improve the efficiency of algorithms for stochastic combinatorial semi-bandits. In most interesting problems, state-of-the-art algorithms take advantage of structural properties of rewards, such as independence. However, while being minimax optimal in terms of regret, these algorithms are intractable. 
In our paper, we first reduce their implementation to a specific submodular maximization. Then, in case of matroid constraints, we design adapted approximation routines, thereby providing the first efficient algorithms that exploit the reward structure. In particular, we improve the state-of-the-art efficient gap-free regret bound by a factor sqrt(k), where k is the maximum action size. Finally, we show how our improvement translates to more general budgeted combinatorial semi-bandits.", "bibtex": "@InProceedings{pmlr-v97-perrault19a,\n title = \t {Exploiting structure of uncertainty for efficient matroid semi-bandits},\n author = {Perrault, Pierre and Perchet, Vianney and Valko, Michal},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5123--5132},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/perrault19a/perrault19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/perrault19a.html},\n abstract = \t {We improve the efficiency of algorithms for stochastic combinatorial semi-bandits. In most interesting problems, state-of-the-art algorithms take advantage of structural properties of rewards, such as independence. However, while being minimax optimal in terms of regret, these algorithms are intractable. In our paper, we first reduce their implementation to a specific submodular maximization. Then, in case of matroid constraints, we design adapted approximation routines, thereby providing the first efficient algorithms that exploit the reward structure. In particular, we improve the state-of-the-art efficient gap-free regret bound by a factor sqrt(k), where k is the maximum action size. Finally, we show how our improvement translates to more general budgeted combinatorial semi-bandits.}\n}", "pdf": "http://proceedings.mlr.press/v97/perrault19a/perrault19a.pdf", "supp": "", "pdf_size": 482990, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8668505028726558247&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "SequeL team, INRIA Lille - Nord Europe+1; CMLA, ENS Paris-Saclay+2; Criteo AI Lab+3+4", "aff_domain": "inria.fr; ; ", "email": "inria.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/perrault19a.html", "aff_unique_index": "0;2;3", "aff_unique_norm": "INRIA Lille - Nord Europe;;\u00c9cole Normale Sup\u00e9rieure Paris-Saclay;Criteo", "aff_unique_dep": "SequeL team;;CMLA;Criteo AI Lab", "aff_unique_url": "https://www.inria.fr/en/centre/lille-nord-europe;;https://www.ens-paris-saclay.fr;https://www.criteo.com", "aff_unique_abbr": "INRIA;;ENS Paris-Saclay;Criteo", "aff_campus_unique_index": "0;2;", "aff_campus_unique": "Lille;;Paris-Saclay", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France;" }, { "title": "Exploration Conscious Reinforcement Learning Revisited", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3577", "id": "3577", "author_site": "Lior Shani, Yonathan Efroni, Shie Mannor", "author": "Lior Shani; Yonathan Efroni; Shie Mannor", "abstract": "The Exploration-Exploitation tradeoff arises in Reinforcement Learning when one cannot tell if a policy is optimal. Then, there is a constant need to explore new actions instead of exploiting past experience. 
In practice, it is common to resolve the tradeoff by using a fixed exploration mechanism, such as $\\epsilon$-greedy exploration or by adding Gaussian noise, while still trying to learn an optimal policy. In this work, we take a different approach and study exploration-conscious criteria, that result in optimal policies with respect to the exploration mechanism. Solving these criteria, as we establish, amounts to solving a surrogate Markov Decision Process. We continue and analyze properties of exploration-conscious optimal policies and characterize two general approaches to solve such criteria. Building on the approaches, we apply simple changes in existing tabular and deep Reinforcement Learning algorithms and empirically demonstrate superior performance relatively to their non-exploration-conscious counterparts, both for discrete and continuous action spaces.", "bibtex": "@InProceedings{pmlr-v97-shani19a,\n title = \t {Exploration Conscious Reinforcement Learning Revisited},\n author = {Shani, Lior and Efroni, Yonathan and Mannor, Shie},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5680--5689},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shani19a/shani19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/shani19a.html},\n abstract = \t {The Exploration-Exploitation tradeoff arises in Reinforcement Learning when one cannot tell if a policy is optimal. Then, there is a constant need to explore new actions instead of exploiting past experience. In practice, it is common to resolve the tradeoff by using a fixed exploration mechanism, such as $\\epsilon$-greedy exploration or by adding Gaussian noise, while still trying to learn an optimal policy. In this work, we take a different approach and study exploration-conscious criteria, that result in optimal policies with respect to the exploration mechanism. Solving these criteria, as we establish, amounts to solving a surrogate Markov Decision Process. We continue and analyze properties of exploration-conscious optimal policies and characterize two general approaches to solve such criteria. 
Building on the approaches, we apply simple changes in existing tabular and deep Reinforcement Learning algorithms and empirically demonstrate superior performance relatively to their non-exploration-conscious counterparts, both for discrete and continuous action spaces.}\n}", "pdf": "http://proceedings.mlr.press/v97/shani19a/shani19a.pdf", "supp": "", "pdf_size": 369849, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2069086734091208368&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical Engineering, Technion, Haifa, Israel; Department of Electrical Engineering, Technion, Haifa, Israel; Department of Electrical Engineering, Technion, Haifa, Israel", "aff_domain": "gmail.com;gmail.com; ", "email": "gmail.com;gmail.com; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/shani19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Technion", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.technion.ac.il", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Haifa", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "title": "Exploring interpretable LSTM neural networks over multi-variable data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4209", "id": "4209", "author_site": "Tian Guo, Tao Lin, Nino Antulov-Fantulin", "author": "Tian Guo; Tao Lin; Nino Antulov-Fantulin", "abstract": "For recurrent neural networks trained on time series with target and exogenous variables, in addition to accurate prediction, it is also desired to provide interpretable insights into the data. In this paper, we explore the structure of LSTM recurrent neural networks to learn variable-wise hidden states, with the aim to capture different dynamics in multi-variable time series and distinguish the contribution of variables to the prediction. With these variable-wise hidden states, a mixture attention mechanism is proposed to model the generative process of the target. Then we develop associated training methods to jointly learn network parameters, variable and temporal importance w.r.t the prediction of the target variable. Extensive experiments on real datasets demonstrate enhanced prediction performance by capturing the dynamics of different variables. Meanwhile, we evaluate the interpretation results both qualitatively and quantitatively. It exhibits the prospect as an end-to-end framework for both forecasting and knowledge extraction over multi-variable data.", "bibtex": "@InProceedings{pmlr-v97-guo19b,\n title = \t {Exploring interpretable {LSTM} neural networks over multi-variable data},\n author = {Guo, Tian and Lin, Tao and Antulov-Fantulin, Nino},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2494--2504},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/guo19b/guo19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/guo19b.html},\n abstract = \t {For recurrent neural networks trained on time series with target and exogenous variables, in addition to accurate prediction, it is also desired to provide interpretable insights into the data. 
In this paper, we explore the structure of LSTM recurrent neural networks to learn variable-wise hidden states, with the aim to capture different dynamics in multi-variable time series and distinguish the contribution of variables to the prediction. With these variable-wise hidden states, a mixture attention mechanism is proposed to model the generative process of the target. Then we develop associated training methods to jointly learn network parameters, variable and temporal importance w.r.t the prediction of the target variable. Extensive experiments on real datasets demonstrate enhanced prediction performance by capturing the dynamics of different variables. Meanwhile, we evaluate the interpretation results both qualitatively and quantitatively. It exhibits the prospect as an end-to-end framework for both forecasting and knowledge extraction over multi-variable data.}\n}", "pdf": "http://proceedings.mlr.press/v97/guo19b/guo19b.pdf", "supp": "", "pdf_size": 1086693, "gs_citation": 242, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10587890434348996432&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "ETH, Z\u00fcrich, Switzerland; EPFL, Switzerland; ETH, Z\u00fcrich, Switzerland", "aff_domain": "gess.ethz.ch; ; ", "email": "gess.ethz.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/guo19b.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "ETH Zurich;EPFL", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.epfl.ch", "aff_unique_abbr": "ETH;EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Z\u00fcrich;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Exploring the Landscape of Spatial Robustness", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4080", "id": "4080", "author_site": "Logan Engstrom, Brandon Tran, Dimitris Tsipras, Ludwig Schmidt, Aleksander Madry", "author": "Logan Engstrom; Brandon Tran; Dimitris Tsipras; Ludwig Schmidt; Aleksander Madry", "abstract": "The study of adversarial robustness has so far largely focused on perturbations bound in $\\ell_p$-norms. However, state-of-the-art models turn out to be also vulnerable to other, more natural classes of perturbations such as translations and rotations. In this work, we thoroughly investigate the vulnerability of neural network\u2013based classifiers to rotations and translations. While data augmentation offers relatively small robustness, we use ideas from robust optimization and test-time input aggregation to significantly improve robustness. Finally we find that, in contrast to the $\\ell_p$-norm case, first-order methods cannot reliably find worst-case perturbations. 
This highlights spatial robustness as a fundamentally different setting requiring additional study.", "bibtex": "@InProceedings{pmlr-v97-engstrom19a,\n title = \t {Exploring the Landscape of Spatial Robustness},\n author = {Engstrom, Logan and Tran, Brandon and Tsipras, Dimitris and Schmidt, Ludwig and Madry, Aleksander},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1802--1811},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/engstrom19a/engstrom19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/engstrom19a.html},\n abstract = \t {The study of adversarial robustness has so far largely focused on perturbations bound in $\\ell_p$-norms. However, state-of-the-art models turn out to be also vulnerable to other, more natural classes of perturbations such as translations and rotations. In this work, we thoroughly investigate the vulnerability of neural network\u2013based classifiers to rotations and translations. While data augmentation offers relatively small robustness, we use ideas from robust optimization and test-time input aggregation to significantly improve robustness. Finally we find that, in contrast to the $\\ell_p$-norm case, first-order methods cannot reliably find worst-case perturbations. This highlights spatial robustness as a fundamentally different setting requiring additional study.}\n}", "pdf": "http://proceedings.mlr.press/v97/engstrom19a/engstrom19a.pdf", "supp": "", "pdf_size": 1919615, "gs_citation": 552, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15432085216309660699&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "EECS, MIT, Massachusetts, USA; EECS, MIT, Massachusetts, USA; EECS, MIT, Massachusetts, USA; EECS, MIT, Massachusetts, USA; EECS, MIT, Massachusetts, USA", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu", "email": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/engstrom19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Electrical Engineering and Computer Science", "aff_unique_url": "https://www.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Massachusetts", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Extrapolating Beyond Suboptimal Demonstrations via Inverse Reinforcement Learning from Observations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4186", "id": "4186", "author_site": "Daniel Brown, Wonjoon Goo, Prabhat Nagarajan, Scott Niekum", "author": "Daniel Brown; Wonjoon Goo; Prabhat Nagarajan; Scott Niekum", "abstract": "A critical flaw of existing inverse reinforcement learning (IRL) methods is their inability to significantly outperform the demonstrator. This is because IRL typically seeks a reward function that makes the demonstrator appear near-optimal, rather than inferring the underlying intentions of the demonstrator that may have been poorly executed in practice. 
In this paper, we introduce a novel reward-learning-from-observation algorithm, Trajectory-ranked Reward EXtrapolation (T-REX), that extrapolates beyond a set of (approximately) ranked demonstrations in order to infer high-quality reward functions from a set of potentially poor demonstrations. When combined with deep reinforcement learning, T-REX outperforms state-of-the-art imitation learning and IRL methods on multiple Atari and MuJoCo benchmark tasks and achieves performance that is often more than twice the performance of the best demonstration. We also demonstrate that T-REX is robust to ranking noise and can accurately extrapolate intention by simply watching a learner noisily improve at a task over time.", "bibtex": "@InProceedings{pmlr-v97-brown19a,\n title = \t {Extrapolating Beyond Suboptimal Demonstrations via Inverse Reinforcement Learning from Observations},\n author = {Brown, Daniel and Goo, Wonjoon and Nagarajan, Prabhat and Niekum, Scott},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {783--792},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/brown19a/brown19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/brown19a.html},\n abstract = \t {A critical flaw of existing inverse reinforcement learning (IRL) methods is their inability to significantly outperform the demonstrator. This is because IRL typically seeks a reward function that makes the demonstrator appear near-optimal, rather than inferring the underlying intentions of the demonstrator that may have been poorly executed in practice. In this paper, we introduce a novel reward-learning-from-observation algorithm, Trajectory-ranked Reward EXtrapolation (T-REX), that extrapolates beyond a set of (approximately) ranked demonstrations in order to infer high-quality reward functions from a set of potentially poor demonstrations. When combined with deep reinforcement learning, T-REX outperforms state-of-the-art imitation learning and IRL methods on multiple Atari and MuJoCo benchmark tasks and achieves performance that is often more than twice the performance of the best demonstration. 
We also demonstrate that T-REX is robust to ranking noise and can accurately extrapolate intention by simply watching a learner noisily improve at a task over time.}\n}", "pdf": "http://proceedings.mlr.press/v97/brown19a/brown19a.pdf", "supp": "", "pdf_size": 1349122, "gs_citation": 469, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14944046691955331663&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Computer Science, University of Texas at Austin, USA; Department of Computer Science, University of Texas at Austin, USA; Preferred Networks, Japan; Department of Computer Science, University of Texas at Austin, USA", "aff_domain": "cs.utexas.edu;cs.utexas.edu; ; ", "email": "cs.utexas.edu;cs.utexas.edu; ; ", "github": "https://github.com/hiwonjoon/ICML2019-TREX", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/brown19a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Texas at Austin;Preferred Networks", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.utexas.edu;https://www.preferred-networks.com", "aff_unique_abbr": "UT Austin;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Japan" }, { "title": "Fair Regression: Quantitative Definitions and Reduction-Based Algorithms", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4075", "id": "4075", "author_site": "Alekh Agarwal, Miroslav Dudik, Steven Wu", "author": "Alekh Agarwal; Miroslav Dudik; Zhiwei Steven Wu", "abstract": "In this paper, we study the prediction of a real-valued target, such as a risk score or recidivism rate, while guaranteeing a quantitative notion of fairness with respect to a protected attribute such as gender or race. We call this class of problems fair regression. We propose general schemes for fair regression under two notions of fairness: (1) statistical parity, which asks that the prediction be statistically independent of the protected attribute, and (2) bounded group loss, which asks that the prediction error restricted to any protected group remain below some pre-determined level. While we only study these two notions of fairness, our schemes are applicable to arbitrary Lipschitz-continuous losses, and so they encompass least-squares regression, logistic regression, quantile regression, and many other tasks. Our schemes only require access to standard risk minimization algorithms (such as standard classification or least-squares regression) while providing theoretical guarantees on the optimality and fairness of the obtained solutions. 
In addition to analyzing theoretical properties of our schemes, we empirically demonstrate their ability to uncover fairness\u2013accuracy frontiers on several standard datasets.", "bibtex": "@InProceedings{pmlr-v97-agarwal19d,\n title = \t {Fair Regression: Quantitative Definitions and Reduction-Based Algorithms},\n author = {Agarwal, Alekh and Dudik, Miroslav and Wu, Zhiwei Steven},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {120--129},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/agarwal19d/agarwal19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/agarwal19d.html},\n abstract = \t {In this paper, we study the prediction of a real-valued target, such as a risk score or recidivism rate, while guaranteeing a quantitative notion of fairness with respect to a protected attribute such as gender or race. We call this class of problems fair regression. We propose general schemes for fair regression under two notions of fairness: (1) statistical parity, which asks that the prediction be statistically independent of the protected attribute, and (2) bounded group loss, which asks that the prediction error restricted to any protected group remain below some pre-determined level. While we only study these two notions of fairness, our schemes are applicable to arbitrary Lipschitz-continuous losses, and so they encompass least-squares regression, logistic regression, quantile regression, and many other tasks. Our schemes only require access to standard risk minimization algorithms (such as standard classification or least-squares regression) while providing theoretical guarantees on the optimality and fairness of the obtained solutions. In addition to analyzing theoretical properties of our schemes, we empirically demonstrate their ability to uncover fairness\u2013accuracy frontiers on several standard datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/agarwal19d/agarwal19d.pdf", "supp": "", "pdf_size": 710086, "gs_citation": 366, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1215439280701490214&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Microsoft Research, Redmond, WA; Microsoft Research, New York, NY; University of Minnesota, Minneapolis, MN", "aff_domain": "microsoft.com;microsoft.com;umn.edu", "email": "microsoft.com;microsoft.com;umn.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/agarwal19d.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Microsoft;University of Minnesota", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www.minnesota.edu", "aff_unique_abbr": "MSR;UMN", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Redmond;New York;Minneapolis", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fair k-Center Clustering for Data Summarization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4242", "id": "4242", "author_site": "Matth\u00e4us Kleindessner, Pranjal Awasthi, Jamie Morgenstern", "author": "Matth\u00e4us Kleindessner; Pranjal Awasthi; Jamie Morgenstern", "abstract": "In data summarization we want to choose $k$ prototypes in order to summarize a data set. 
We study a setting where the data set comprises several demographic groups and we are restricted to choose $k_i$ prototypes belonging to group $i$. A common approach to the problem without the fairness constraint is to optimize a centroid-based clustering objective such as $k$-center. A natural extension then is to incorporate the fairness constraint into the clustering problem. Existing algorithms for doing so run in time super-quadratic in the size of the data set, which is in contrast to the standard $k$-center problem being approximable in linear time. In this paper, we resolve this gap by providing a simple approximation algorithm for the $k$-center problem under the fairness constraint with running time linear in the size of the data set and $k$. If the number of demographic groups is small, the approximation guarantee of our algorithm only incurs a constant-factor overhead.", "bibtex": "@InProceedings{pmlr-v97-kleindessner19a,\n title = \t {Fair k-Center Clustering for Data Summarization},\n author = {Kleindessner, Matth{\\\"a}us and Awasthi, Pranjal and Morgenstern, Jamie},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3448--3457},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kleindessner19a/kleindessner19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kleindessner19a.html},\n abstract = \t {In data summarization we want to choose $k$ prototypes in order to summarize a data set. We study a setting where the data set comprises several demographic groups and we are restricted to choose $k_i$ prototypes belonging to group $i$. A common approach to the problem without the fairness constraint is to optimize a centroid-based clustering objective such as $k$-center. A natural extension then is to incorporate the fairness constraint into the clustering problem. Existing algorithms for doing so run in time super-quadratic in the size of the data set, which is in contrast to the standard $k$-center problem being approximable in linear time. In this paper, we resolve this gap by providing a simple approximation algorithm for the $k$-center problem under the fairness constraint with running time linear in the size of the data set and $k$. 
If the number of demographic groups is small, the approximation guarantee of our algorithm only incurs a constant-factor overhead.}\n}", "pdf": "http://proceedings.mlr.press/v97/kleindessner19a/kleindessner19a.pdf", "supp": "", "pdf_size": 3064212, "gs_citation": 219, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10384783714256817355&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Rutgers University, NJ; Department of Computer Science, Rutgers University, NJ; College of Computing, Georgia Tech, GA", "aff_domain": "rutgers.edu;rutgers.edu;cs.gatech.edu", "email": "rutgers.edu;rutgers.edu;cs.gatech.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/kleindessner19a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Rutgers University;Georgia Institute of Technology", "aff_unique_dep": "Department of Computer Science;College of Computing", "aff_unique_url": "https://www.rutgers.edu;https://www.gatech.edu", "aff_unique_abbr": "Rutgers;Georgia Tech", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "New Brunswick;Georgia Tech", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fairness risk measures", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3728", "id": "3728", "author_site": "Robert C Williamson, Aditya Menon", "author": "Robert Williamson; Aditya Menon", "abstract": "Ensuring that classifiers are non-discriminatory or fair with respect to a sensitive feature (e.g., race or gender) is a topical problem. Progress in this task requires fixing a definition of fairness, and there have been several proposals in this regard over the past few years. Several of these, however, assume either binary sensitive features (thus precluding categorical or real-valued sensitive groups), or result in non-convex objectives (thus adversely affecting the optimisation landscape). In this paper, we propose a new definition of fairness that generalises some existing proposals, while allowing for generic sensitive features and resulting in a convex objective. The key idea is to enforce that the expected losses (or risks) across each subgroup induced by the sensitive feature are commensurate. We show how this relates to the rich literature on risk measures from mathematical finance. As a special case, this leads to a new convex fairness-aware objective based on minimising the conditional value at risk (CVaR).", "bibtex": "@InProceedings{pmlr-v97-williamson19a,\n title = \t {Fairness risk measures},\n author = {Williamson, Robert and Menon, Aditya},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6786--6797},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/williamson19a/williamson19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/williamson19a.html},\n abstract = \t {Ensuring that classifiers are non-discriminatory or fair with respect to a sensitive feature (e.g., race or gender) is a topical problem. Progress in this task requires fixing a definition of fairness, and there have been several proposals in this regard over the past few years. 
Several of these, however, assume either binary sensitive features (thus precluding categorical or real-valued sensitive groups), or result in non-convex objectives (thus adversely affecting the optimisation landscape). In this paper, we propose a new definition of fairness that generalises some existing proposals, while allowing for generic sensitive features and resulting in a convex objective. The key idea is to enforce that the expected losses (or risks) across each subgroup induced by the sensitive feature are commensurate. We show how this relates to the rich literature on risk measures from mathematical finance. As a special case, this leads to a new convex fairness-aware objective based on minimising the conditional value at risk (CVaR).}\n}", "pdf": "http://proceedings.mlr.press/v97/williamson19a/williamson19a.pdf", "supp": "", "pdf_size": 470439, "gs_citation": 177, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17814143916103119426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Australian National University; Google Research", "aff_domain": "anu.edu.au; ", "email": "anu.edu.au; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/williamson19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Australian National University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.anu.edu.au;https://research.google", "aff_unique_abbr": "ANU;Google Research", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1", "aff_country_unique": "Australia;United States" }, { "title": "Fairness without Harm: Decoupled Classifiers with Preference Guarantees", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3684", "id": "3684", "author_site": "Berk Ustun, Yang Liu, David Parkes", "author": "Berk Ustun; Yang Liu; David Parkes", "abstract": "In domains such as medicine, it can be acceptable for machine learning models to include", "bibtex": "@InProceedings{pmlr-v97-ustun19a,\n title = \t {Fairness without Harm: Decoupled Classifiers with Preference Guarantees},\n author = {Ustun, Berk and Liu, Yang and Parkes, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6373--6382},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ustun19a/ustun19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ustun19a.html},\n abstract = \t {In domains such as medicine, it can be acceptable for machine learning models to include", "pdf": "http://proceedings.mlr.press/v97/ustun19a/ustun19a.pdf", "supp": "", "pdf_size": 3920318, "gs_citation": 163, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1756589823727387061&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Harvard University; UC Santa Cruz; Harvard University", "aff_domain": "seas.harvard.edu; ; ", "email": "seas.harvard.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ustun19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Harvard University;University of California, Santa Cruz", "aff_unique_dep": ";", "aff_unique_url": "https://www.harvard.edu;https://www.ucsc.edu", "aff_unique_abbr": "Harvard;UCSC", "aff_campus_unique_index": "1", 
"aff_campus_unique": ";Santa Cruz", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Fairness-Aware Learning for Continuous Attributes and Treatments", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4130", "id": "4130", "author_site": "Jeremie Mary, Cl\u00e9ment Calauz\u00e8nes, Noureddine El Karoui", "author": "Jeremie Mary; Cl\u00e9ment Calauz\u00e8nes; Noureddine El Karoui", "abstract": "We address the problem of algorithmic fairness: ensuring that the outcome of a classifier is not biased towards certain values of sensitive variables such as age, race or gender. As common fairness metrics can be expressed as measures of (conditional) independence between variables, we propose to use the R\u00e9nyi maximum correlation coefficient to generalize fairness measurement to continuous variables. We exploit Witsenhausen\u2019s characterization of the R\u00e9nyi correlation coefficient to propose a differentiable implementation linked to $f$-divergences. This allows us to generalize fairness-aware learning to continuous variables by using a penalty that upper bounds this coefficient. Theses allows fairness to be extented to variables such as mixed ethnic groups or financial status without thresholds effects. This penalty can be estimated on mini-batches allowing to use deep nets. Experiments show favorable comparisons to state of the art on binary variables and prove the ability to protect continuous ones", "bibtex": "@InProceedings{pmlr-v97-mary19a,\n title = \t {Fairness-Aware Learning for Continuous Attributes and Treatments},\n author = {Mary, Jeremie and Calauz{\\`e}nes, Cl{\\'e}ment and Karoui, Noureddine El},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4382--4391},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mary19a/mary19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mary19a.html},\n abstract = \t {We address the problem of algorithmic fairness: ensuring that the outcome of a classifier is not biased towards certain values of sensitive variables such as age, race or gender. As common fairness metrics can be expressed as measures of (conditional) independence between variables, we propose to use the R\u00e9nyi maximum correlation coefficient to generalize fairness measurement to continuous variables. We exploit Witsenhausen\u2019s characterization of the R\u00e9nyi correlation coefficient to propose a differentiable implementation linked to $f$-divergences. This allows us to generalize fairness-aware learning to continuous variables by using a penalty that upper bounds this coefficient. Theses allows fairness to be extented to variables such as mixed ethnic groups or financial status without thresholds effects. This penalty can be estimated on mini-batches allowing to use deep nets. 
Experiments show favorable comparisons to state of the art on binary variables and prove the ability to protect continuous ones}\n}", "pdf": "http://proceedings.mlr.press/v97/mary19a/mary19a.pdf", "supp": "", "pdf_size": 2521077, "gs_citation": 163, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=968179841308142583&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Criteo AI Lab, Paris, France+University of California, Berkeley, USA; Criteo AI Lab, Paris, France+University of California, Berkeley, USA; Criteo AI Lab, Paris, France+University of California, Berkeley, USA", "aff_domain": "criteo.com;criteo.com;berkeley.edu", "email": "criteo.com;criteo.com;berkeley.edu", "github": "https://github.com/criteo-research/continuous-fairness", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mary19a.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "Criteo;University of California, Berkeley", "aff_unique_dep": "Criteo AI Lab;", "aff_unique_url": "https://www.criteo.com;https://www.berkeley.edu", "aff_unique_abbr": "Criteo;UC Berkeley", "aff_campus_unique_index": "0+1;0+1;0+1", "aff_campus_unique": "Paris;Berkeley", "aff_country_unique_index": "0+1;0+1;0+1", "aff_country_unique": "France;United States" }, { "title": "Fairwashing: the risk of rationalization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4244", "id": "4244", "author_site": "Ulrich AIVODJI, Hiromi Arai, Olivier Fortineau, S\u00e9bastien Gambs, Satoshi Hara, Alain Tapp", "author": "Ulrich Aivodji; Hiromi Arai; Olivier Fortineau; S\u00e9bastien Gambs; Satoshi Hara; Alain Tapp", "abstract": "Black-box explanation is the problem of explaining how a machine learning model \u2013 whose internal logic is hidden to the auditor and generally complex \u2013 produces its outcomes. Current approaches for solving this problem include model explanation, outcome explanation as well as model inspection. While these techniques can be beneficial by providing interpretability, they can be used in a negative manner to perform fairwashing, which we define as promoting the false perception that a machine learning model respects some ethical values. In particular, we demonstrate that it is possible to systematically rationalize decisions taken by an unfair black-box model using the model explanation as well as the outcome explanation approaches with a given fairness metric. Our solution, LaundryML, is based on a regularized rule list enumeration algorithm whose objective is to search for fair rule lists approximating an unfair black-box model. 
We empirically evaluate our rationalization technique on black-box models trained on real-world datasets and show that one can obtain rule lists with high fidelity to the black-box model while being considerably less unfair at the same time.", "bibtex": "@InProceedings{pmlr-v97-aivodji19a,\n title = \t {Fairwashing: the risk of rationalization},\n author = {Aivodji, Ulrich and Arai, Hiromi and Fortineau, Olivier and Gambs, S{\\'e}bastien and Hara, Satoshi and Tapp, Alain},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {161--170},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/aivodji19a/aivodji19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/aivodji19a.html},\n abstract = \t {Black-box explanation is the problem of explaining how a machine learning model \u2013 whose internal logic is hidden to the auditor and generally complex \u2013 produces its outcomes. Current approaches for solving this problem include model explanation, outcome explanation as well as model inspection. While these techniques can be beneficial by providing interpretability, they can be used in a negative manner to perform fairwashing, which we define as promoting the false perception that a machine learning model respects some ethical values. In particular, we demonstrate that it is possible to systematically rationalize decisions taken by an unfair black-box model using the model explanation as well as the outcome explanation approaches with a given fairness metric. Our solution, LaundryML, is based on a regularized rule list enumeration algorithm whose objective is to search for fair rule lists approximating an unfair black-box model. 
We empirically evaluate our rationalization technique on black-box models trained on real-world datasets and show that one can obtain rule lists with high fidelity to the black-box model while being considerably less unfair at the same time.}\n}", "pdf": "http://proceedings.mlr.press/v97/aivodji19a/aivodji19a.pdf", "supp": "", "pdf_size": 863056, "gs_citation": 212, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2523692918696533409&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Universit\u00e9 du Qu\u00e9bec \u00e0 Montr\u00e9al; RIKEN Center for Advanced Intelligence Project+JST PRESTO; ENSTA ParisTech; Universit\u00e9 du Qu\u00e9bec \u00e0 Montr\u00e9al; Osaka University; UdeM+MILA", "aff_domain": "courrier.uqam.ca; ; ; ; ; ", "email": "courrier.uqam.ca; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/aivodji19a.html", "aff_unique_index": "0;1+2;3;0;4;5+6", "aff_unique_norm": "Universit\u00e9 du Qu\u00e9bec \u00e0 Montr\u00e9al;RIKEN;Japan Science and Technology Agency;ENSTA ParisTech;Osaka University;Universit\u00e9 de Montr\u00e9al;Mila", "aff_unique_dep": ";Center for Advanced Intelligence Project;PRESTO;;;;", "aff_unique_url": "https://uqam.ca;https://www.riken.jp/en/;https://www.jst.go.jp;https://www.ensta.fr;https://www.osaka-u.ac.jp;https://www.umontreal.ca;https://mila.quebec", "aff_unique_abbr": "UQAM;RIKEN;JST;ENSTA;Osaka U;UdeM;MILA", "aff_campus_unique_index": "0;;0;", "aff_campus_unique": "Montr\u00e9al;", "aff_country_unique_index": "0;1+1;2;0;1;0+0", "aff_country_unique": "Canada;Japan;France" }, { "title": "Fast Algorithm for Generalized Multinomial Models with Ranking Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4007", "id": "4007", "author_site": "Jiaqi Gu, Guosheng Yin", "author": "Jiaqi Gu; Guosheng Yin", "abstract": "We develop a framework of generalized multinomial models, which includes both the popular Plackett\u2013Luce model and Bradley\u2013Terry model as special cases. 
From a theoretical perspective, we prove that the maximum likelihood estimator (MLE) under generalized multinomial models corresponds to the stationary distribution of an inhomogeneous Markov chain uniquely. Based on this property, we propose an iterative algorithm that is easy to implement and interpret, and is guaranteed to converge. Numerical experiments on synthetic data and real data demonstrate the advantages of our Markov chain based algorithm over existing ones. Our algorithm converges to the MLE with fewer iterations and at a faster convergence rate. The new algorithm is readily applicable to problems such as page ranking or sports ranking data.}\n}", "pdf": "http://proceedings.mlr.press/v97/gu19a/gu19a.pdf", "supp": "", "pdf_size": 360765, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7010532156897864145&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Statistics and Actuarial Science, University of Hong Kong, Hong Kong SAR, China; Department of Statistics and Actuarial Science, University of Hong Kong, Hong Kong SAR, China", "aff_domain": "hku.hk;hku.hk", "email": "hku.hk;hku.hk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/gu19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "Department of Statistics and Actuarial Science", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Fast Context Adaptation via Meta-Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4086", "id": "4086", "author_site": "Luisa Zintgraf, Kyriacos Shiarlis, Vitaly Kurin, Katja Hofmann, Shimon Whiteson", "author": "Luisa Zintgraf; Kyriacos Shiarli; Vitaly Kurin; Katja Hofmann; Shimon Whiteson", "abstract": "We propose CAVIA for meta-learning, a simple extension to MAML that is less prone to meta-overfitting, easier to parallelise, and more interpretable. CAVIA partitions the model parameters into two parts: context parameters that serve as additional input to the model and are adapted on individual tasks, and shared parameters that are meta-trained and shared across tasks. At test time, only the context parameters are updated, leading to a low-dimensional task representation. We show empirically that CAVIA outperforms MAML for regression, classification, and reinforcement learning. Our experiments also highlight weaknesses in current benchmarks, in that the amount of adaptation needed in some cases is small.", "bibtex": "@InProceedings{pmlr-v97-zintgraf19a,\n title = \t {Fast Context Adaptation via Meta-Learning},\n author = {Zintgraf, Luisa and Shiarli, Kyriacos and Kurin, Vitaly and Hofmann, Katja and Whiteson, Shimon},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7693--7702},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zintgraf19a/zintgraf19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zintgraf19a.html},\n abstract = \t {We propose CAVIA for meta-learning, a simple extension to MAML that is less prone to meta-overfitting, easier to parallelise, and more interpretable. 
CAVIA partitions the model parameters into two parts: context parameters that serve as additional input to the model and are adapted on individual tasks, and shared parameters that are meta-trained and shared across tasks. At test time, only the context parameters are updated, leading to a low-dimensional task representation. We show empirically that CAVIA outperforms MAML for regression, classification, and reinforcement learning. Our experiments also highlight weaknesses in current benchmarks, in that the amount of adaptation needed in some cases is small.}\n}", "pdf": "http://proceedings.mlr.press/v97/zintgraf19a/zintgraf19a.pdf", "supp": "", "pdf_size": 648947, "gs_citation": 480, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=731845317332872337&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of Oxford; University of Oxford+Latent Logic; University of Oxford+Latent Logic; Microsoft Research; University of Oxford+Latent Logic", "aff_domain": "cs.ox.ac.uk; ; ; ; ", "email": "cs.ox.ac.uk; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/zintgraf19a.html", "aff_unique_index": "0;0+1;0+1;2;0+1", "aff_unique_norm": "University of Oxford;Latent Logic;Microsoft", "aff_unique_dep": ";;Microsoft Research", "aff_unique_url": "https://www.ox.ac.uk;https://www.latentlogic.com;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Oxford;;MSR", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;0+0;1;0+0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Fast Direct Search in an Optimally Compressed Continuous Target Space for Efficient Multi-Label Active Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3596", "id": "3596", "author_site": "weishi shi, Qi Yu", "author": "Weishi Shi; Qi Yu", "abstract": "Active learning for multi-label classification poses fundamental challenges given the complex label correlations and a potentially large and sparse label space. We propose a novel CS-BPCA process that integrates compressed sensing and Bayesian principal component analysis to perform a two-level label transformation, resulting in an optimally compressed continuous target space. Besides leveraging correlation and sparsity of a large label space for effective compression, an optimal compressing rate and the relative importance of the resultant targets are automatically determined through Bayesian inference. Furthermore, the orthogonality of the transformed space completely decouples the correlations among targets, which significantly simplifies multi-label sampling in the target space. We define a novel sampling function that leverages a multi-output Gaussian Process (MOGP). Gradient-free optimization strategies are developed to achieve fast online hyper-parameter learning and model retraining for active learning. 
Experimental results over multiple real-world datasets and comparison with competitive multi-label active learning models demonstrate the effectiveness of the proposed framework.", "bibtex": "@InProceedings{pmlr-v97-shi19b,\n title = \t {Fast Direct Search in an Optimally Compressed Continuous Target Space for Efficient Multi-Label Active Learning},\n author = {Shi, Weishi and Yu, Qi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5769--5778},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shi19b/shi19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/shi19b.html},\n abstract = \t {Active learning for multi-label classification poses fundamental challenges given the complex label correlations and a potentially large and sparse label space. We propose a novel CS-BPCA process that integrates compressed sensing and Bayesian principal component analysis to perform a two-level label transformation, resulting in an optimally compressed continuous target space. Besides leveraging correlation and sparsity of a large label space for effective compression, an optimal compressing rate and the relative importance of the resultant targets are automatically determined through Bayesian inference. Furthermore, the orthogonality of the transformed space completely decouples the correlations among targets, which significantly simplifies multi-label sampling in the target space. We define a novel sampling function that leverages a multi-output Gaussian Process (MOGP). Gradient-free optimization strategies are developed to achieve fast online hyper-parameter learning and model retraining for active learning. 
Experimental results over multiple real-world datasets and comparison with competitive multi-label active learning models demonstrate the effectiveness of the proposed framework.}\n}", "pdf": "http://proceedings.mlr.press/v97/shi19b/shi19b.pdf", "supp": "", "pdf_size": 1372774, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7143185849096493482&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Golisano College of Computing and Information Sciences, Rochester Institute of Technology, Rochester, USA; Golisano College of Computing and Information Sciences, Rochester Institute of Technology, Rochester, USA", "aff_domain": "rit.edu;rit.edu", "email": "rit.edu;rit.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/shi19b.html", "aff_unique_index": "0;0", "aff_unique_norm": "Rochester Institute of Technology", "aff_unique_dep": "Golisano College of Computing and Information Sciences", "aff_unique_url": "https://www.rit.edu", "aff_unique_abbr": "RIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Rochester", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Fast Incremental von Neumann Graph Entropy Computation: Theory, Algorithm, and Applications", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3574", "id": "3574", "author_site": "Pin-Yu Chen, Lingfei Wu, Sijia Liu, Indika Rajapakse", "author": "Pin-Yu Chen; Lingfei Wu; Sijia Liu; Indika Rajapakse", "abstract": "The von Neumann graph entropy (VNGE) facilitates measurement of information divergence and distance between graphs in a graph sequence. It has been successfully applied to various learning tasks driven by network-based data. While effective, VNGE is computationally demanding as it requires the full eigenspectrum of the graph Laplacian matrix. In this paper, we propose a new computational framework, Fast Incremental von Neumann Graph EntRopy (FINGER), which approaches VNGE with a performance guarantee. FINGER reduces the cubic complexity of VNGE to linear complexity in the number of nodes and edges, and thus enables online computation based on incremental graph changes. We also show asymptotic equivalence of FINGER to the exact VNGE, and derive its approximation error bounds. Based on FINGER, we propose efficient algorithms for computing Jensen-Shannon distance between graphs. Our experimental results on different random graph models demonstrate the computational efficiency and the asymptotic equivalence of FINGER. 
In addition, we apply FINGER to two real-world applications and one synthesized anomaly detection dataset, and corroborate its superior performance over seven baseline graph similarity methods.", "bibtex": "@InProceedings{pmlr-v97-chen19j,\n title = \t {Fast Incremental von Neumann Graph Entropy Computation: Theory, Algorithm, and Applications},\n author = {Chen, Pin-Yu and Wu, Lingfei and Liu, Sijia and Rajapakse, Indika},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1091--1101},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19j/chen19j.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19j.html},\n abstract = \t {The von Neumann graph entropy (VNGE) facilitates measurement of information divergence and distance between graphs in a graph sequence. It has been successfully applied to various learning tasks driven by network-based data. While effective, VNGE is computationally demanding as it requires the full eigenspectrum of the graph Laplacian matrix. In this paper, we propose a new computational framework, Fast Incremental von Neumann Graph EntRopy (FINGER), which approaches VNGE with a performance guarantee. FINGER reduces the cubic complexity of VNGE to linear complexity in the number of nodes and edges, and thus enables online computation based on incremental graph changes. We also show asymptotic equivalence of FINGER to the exact VNGE, and derive its approximation error bounds. Based on FINGER, we propose efficient algorithms for computing Jensen-Shannon distance between graphs. Our experimental results on different random graph models demonstrate the computational efficiency and the asymptotic equivalence of FINGER. In addition, we apply FINGER to two real-world applications and one synthesized anomaly detection dataset, and corroborate its superior performance over seven baseline graph similarity methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19j/chen19j.pdf", "supp": "", "pdf_size": 533781, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15943782295657868941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "IBM Research; IBM Research; IBM Research; University of Michigan, Ann Arbor, USA", "aff_domain": "ibm.com; ; ; ", "email": "ibm.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/chen19j.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "IBM;University of Michigan", "aff_unique_dep": "IBM Research;", "aff_unique_url": "https://www.ibm.com/research;https://www.umich.edu", "aff_unique_abbr": "IBM;UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fast Rates for a kNN Classifier Robust to Unknown Asymmetric Label Noise", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3851", "id": "3851", "author_site": "Henry Reeve, Ata Kaban", "author": "Henry Reeve; Ata Kaban", "abstract": "We consider classification in the presence of class-dependent asymmetric label noise with unknown noise probabilities. 
In this setting, identifiability conditions are known, but additional assumptions were shown to be required for finite sample rates, and so far only the parametric rate has been obtained. Assuming these identifiability conditions, together with a measure-smoothness condition on the regression function and Tsybakov\u2019s margin condition, we show that the Robust kNN classifier of Gao et al. attains the mini-max optimal rates of the noise-free setting, up to a log factor, even when trained on data with unknown asymmetric label noise. Hence, our results provide a solid theoretical backing for this empirically successful algorithm. By contrast, the standard kNN is not even consistent in the setting of asymmetric label noise. A key idea in our analysis is a simple kNN based method for estimating the maximum of a function that requires far less assumptions than existing mode estimators do, and which may be of independent interest for noise proportion estimation and randomised optimisation problems.", "bibtex": "@InProceedings{pmlr-v97-reeve19a,\n title = \t {Fast Rates for a k{NN} Classifier Robust to Unknown Asymmetric Label Noise},\n author = {Reeve, Henry and Kaban, Ata},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5401--5409},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/reeve19a/reeve19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/reeve19a.html},\n abstract = \t {We consider classification in the presence of class-dependent asymmetric label noise with unknown noise probabilities. In this setting, identifiability conditions are known, but additional assumptions were shown to be required for finite sample rates, and so far only the parametric rate has been obtained. Assuming these identifiability conditions, together with a measure-smoothness condition on the regression function and Tsybakov\u2019s margin condition, we show that the Robust kNN classifier of Gao et al. attains the mini-max optimal rates of the noise-free setting, up to a log factor, even when trained on data with unknown asymmetric label noise. Hence, our results provide a solid theoretical backing for this empirically successful algorithm. By contrast, the standard kNN is not even consistent in the setting of asymmetric label noise. 
A key idea in our analysis is a simple kNN based method for estimating the maximum of a function that requires far less assumptions than existing mode estimators do, and which may be of independent interest for noise proportion estimation and randomised optimisation problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/reeve19a/reeve19a.pdf", "supp": "", "pdf_size": 376048, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13492563355363108032&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "School of Computer Science, University of Birmingham, UK; School of Computer Science, University of Birmingham, UK", "aff_domain": "gmail.com; ", "email": "gmail.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/reeve19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Birmingham", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "https://www.birmingham.ac.uk", "aff_unique_abbr": "UoB", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Birmingham", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Fast and Flexible Inference of Joint Distributions from their Marginals", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3600", "id": "3600", "author_site": "Charles Frogner, Tomaso Poggio", "author": "Charlie Frogner; Tomaso Poggio", "abstract": "Across the social sciences and elsewhere, practitioners frequently have to reason about relationships between random variables, despite lacking joint observations of the variables. This is sometimes called an \"ecological\" inference; given samples from the marginal distributions of the variables, one attempts to infer their joint distribution. The problem is inherently ill-posed, yet only a few models have been proposed for bringing prior information into the problem, often relying on restrictive or unrealistic assumptions and lacking a unified approach. In this paper, we treat the inference problem generally and propose a unified class of models that encompasses some of those previously proposed while including many new ones. Previous work has relied on either relaxation or approximate inference via MCMC, with the latter known to mix prohibitively slowly for this type of problem. Here we instead give a single exact inference algorithm that works for the entire model class via an efficient fixed point iteration called Dykstra\u2019s method. 
We investigate empirically both the computational cost of our algorithm and the accuracy of the new models on real datasets, showing favorable performance in both cases and illustrating the impact of increased flexibility in modeling enabled by this work.", "bibtex": "@InProceedings{pmlr-v97-frogner19a,\n title = \t {Fast and Flexible Inference of Joint Distributions from their Marginals},\n author = {Frogner, Charlie and Poggio, Tomaso},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2002--2011},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/frogner19a/frogner19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/frogner19a.html},\n abstract = \t {Across the social sciences and elsewhere, practitioners frequently have to reason about relationships between random variables, despite lacking joint observations of the variables. This is sometimes called an \"ecological\" inference; given samples from the marginal distributions of the variables, one attempts to infer their joint distribution. The problem is inherently ill-posed, yet only a few models have been proposed for bringing prior information into the problem, often relying on restrictive or unrealistic assumptions and lacking a unified approach. In this paper, we treat the inference problem generally and propose a unified class of models that encompasses some of those previously proposed while including many new ones. Previous work has relied on either relaxation or approximate inference via MCMC, with the latter known to mix prohibitively slowly for this type of problem. Here we instead give a single exact inference algorithm that works for the entire model class via an efficient fixed point iteration called Dykstra\u2019s method. 
We investigate empirically both the computational cost of our algorithm and the accuracy of the new models on real datasets, showing favorable performance in both cases and illustrating the impact of increased flexibility in modeling enabled by this work.}\n}", "pdf": "http://proceedings.mlr.press/v97/frogner19a/frogner19a.pdf", "supp": "", "pdf_size": 1212918, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10595659236851539491&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "CSAIL, Massachusetts Institute of Technology, Cambridge, Massachusetts; Center for Brains, Minds, and Machines, Massachusetts Institute of Technology, Cambridge, Massachusetts", "aff_domain": "mit.edu; ", "email": "mit.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/frogner19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.csail.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Fast and Simple Natural-Gradient Variational Inference with Mixture of Exponential-family Approximations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3799", "id": "3799", "author_site": "Wu Lin, Mohammad Emtiyaz Khan, Mark Schmidt", "author": "Wu Lin; Mohammad Emtiyaz Khan; Mark Schmidt", "abstract": "Natural-gradient methods enable fast and simple algorithms for variational inference, but due to computational difficulties, their use is mostly limited to minimal exponential-family (EF) approximations. In this paper, we extend their application to estimate structured approximations such as mixtures of EF distributions. Such approximations can fit complex, multimodal posterior distributions and are generally more accurate than unimodal EF approximations. By using a minimal conditional-EF representation of such approximations, we derive simple natural-gradient updates. Our empirical results demonstrate a faster convergence of our natural-gradient method compared to black-box gradient-based methods. Our work expands the scope of natural gradients for Bayesian inference and makes them more widely applicable than before.", "bibtex": "@InProceedings{pmlr-v97-lin19b,\n title = \t {Fast and Simple Natural-Gradient Variational Inference with Mixture of Exponential-family Approximations},\n author = {Lin, Wu and Khan, Mohammad Emtiyaz and Schmidt, Mark},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3992--4002},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lin19b/lin19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/lin19b.html},\n abstract = \t {Natural-gradient methods enable fast and simple algorithms for variational inference, but due to computational difficulties, their use is mostly limited to minimal exponential-family (EF) approximations. In this paper, we extend their application to estimate structured approximations such as mixtures of EF distributions. 
Such approximations can fit complex, multimodal posterior distributions and are generally more accurate than unimodal EF approximations. By using a minimal conditional-EF representation of such approximations, we derive simple natural-gradient updates. Our empirical results demonstrate a faster convergence of our natural-gradient method compared to black-box gradient-based methods. Our work expands the scope of natural gradients for Bayesian inference and makes them more widely applicable than before.}\n}", "pdf": "http://proceedings.mlr.press/v97/lin19b/lin19b.pdf", "supp": "", "pdf_size": 3134496, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9800018690635650774&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "University of British Columbia, Vancouver, Canada; RIKEN Center for Advanced Intelligence Project, Tokyo, Japan; University of British Columbia, Vancouver, Canada", "aff_domain": "cs.ubc.ca; ; ", "email": "cs.ubc.ca; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/lin19b.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of British Columbia;RIKEN Center for Advanced Intelligence Project", "aff_unique_dep": ";Center for Advanced Intelligence Project", "aff_unique_url": "https://www.ubc.ca;https://www.riken.jp/en/c-aip/", "aff_unique_abbr": "UBC;RIKEN C-AIP", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Vancouver;Tokyo", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Canada;Japan" }, { "title": "Fast and Stable Maximum Likelihood Estimation for Incomplete Multinomial Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4002", "id": "4002", "author_site": "Chenyang ZHANG, Guosheng Yin", "author": "Chenyang Zhang; Guosheng Yin", "abstract": "We propose a fixed-point iteration approach to the maximum likelihood estimation for the incomplete multinomial model, which provides a unified framework for ranking data analysis. Incomplete observations typically fall in a subset of categories, and thus cannot be distinguished as belonging to a unique category. We develop a minorization\u2013maximization (MM) type of algorithm, which requires relatively fewer iterations and shorter time to achieve convergence. Under such a general framework, incomplete multinomial models can be reformulated to include several well-known ranking models as special cases, such as the Bradley\u2013Terry, Plackett\u2013Luce models and their variants. The simple form of iteratively updating equations in our algorithm involves only basic matrix operations, which makes it efficient and easy to implement with large data. 
Experimental results show that our algorithm runs faster than existing methods on synthetic data and real data.", "bibtex": "@InProceedings{pmlr-v97-zhang19o,\n title = \t {Fast and Stable Maximum Likelihood Estimation for Incomplete Multinomial Models},\n author = {Zhang, Chenyang and Yin, Guosheng},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7463--7471},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19o/zhang19o.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19o.html},\n abstract = \t {We propose a fixed-point iteration approach to the maximum likelihood estimation for the incomplete multinomial model, which provides a unified framework for ranking data analysis. Incomplete observations typically fall in a subset of categories, and thus cannot be distinguished as belonging to a unique category. We develop a minorization\u2013maximization (MM) type of algorithm, which requires relatively fewer iterations and shorter time to achieve convergence. Under such a general framework, incomplete multinomial models can be reformulated to include several well-known ranking models as special cases, such as the Bradley\u2013Terry, Plackett\u2013Luce models and their variants. The simple form of iteratively updating equations in our algorithm involves only basic matrix operations, which makes it efficient and easy to implement with large data. Experimental results show that our algorithm runs faster than existing methods on synthetic data and real data.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19o/zhang19o.pdf", "supp": "", "pdf_size": 643361, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ez0BlZYe228J:scholar.google.com/&scioq=Fast+and+Stable+Maximum+Likelihood+Estimation+for+Incomplete+Multinomial+Models&hl=en&as_sdt=0,5", "gs_version_total": 8, "aff": "Department of Statistics and Actuarial Science, University of Hong Kong, Hong Kong; Department of Statistics and Actuarial Science, University of Hong Kong, Hong Kong", "aff_domain": "Chenyang Zhang;hku.hk", "email": "Chenyang Zhang;hku.hk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/zhang19o.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Hong Kong", "aff_unique_dep": "Department of Statistics and Actuarial Science", "aff_unique_url": "https://www.hku.hk", "aff_unique_abbr": "HKU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Faster Algorithms for Binary Matrix Factorization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4153", "id": "4153", "author_site": "Ravi Kumar, Rina Panigrahy, Ali Rahimi, David Woodruff", "author": "Ravi Kumar; Rina Panigrahy; Ali Rahimi; David Woodruff", "abstract": "We give faster approximation algorithms for well-studied variants of Binary Matrix Factorization (BMF), where we are given a binary $m \\times n$ matrix $A$ and would like to find binary rank-$k$ matrices $U, V$ to minimize the Frobenius norm of $U \\cdot V - A$. 
In the first setting, $U \\cdot V$ denotes multiplication over $\\mathbb{Z}$, and we give a constant-factor approximation algorithm that runs in $2^{O(k^2 \\log k)} \\textrm{poly}(mn)$ time, improving upon the previous $\\min(2^{2^k}, 2^n) \\textrm{poly}(mn)$ time. Our techniques generalize to minimizing $\\|U \\cdot V - A\\|_p$ for $p \\geq 1$, in $2^{O(k^{\\lceil p/2 \\rceil + 1}\\log k)} \\textrm{poly}(mn)$ time. For $p = 1$, this has a graph-theoretic consequence, namely, a $2^{O(k^2)} \\poly(mn)$-time algorithm to approximate a graph as a union of disjoint bicliques. In the second setting, $U \\cdot V$ is over $\\GF(2)$, and we give a bicriteria constant-factor approximation algorithm that runs in $2^{O(k^3)} \\poly(mn)$ time to find binary rank-$O(k \\log m)$ matrices $U$, $V$ whose cost is as good as the best rank-$k$ approximation, improving upon $\\min(2^{2^k}mn, \\min(m,n)^{k^{O(1)}} \\textrm{poly}(mn))$ time.", "bibtex": "@InProceedings{pmlr-v97-kumar19a,\n title = \t {Faster Algorithms for Binary Matrix Factorization},\n author = {Kumar, Ravi and Panigrahy, Rina and Rahimi, Ali and Woodruff, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3551--3559},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kumar19a/kumar19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kumar19a.html},\n abstract = \t {We give faster approximation algorithms for well-studied variants of Binary Matrix Factorization (BMF), where we are given a binary $m \\times n$ matrix $A$ and would like to find binary rank-$k$ matrices $U, V$ to minimize the Frobenius norm of $U \\cdot V - A$. In the first setting, $U \\cdot V$ denotes multiplication over $\\mathbb{Z}$, and we give a constant-factor approximation algorithm that runs in $2^{O(k^2 \\log k)} \\textrm{poly}(mn)$ time, improving upon the previous $\\min(2^{2^k}, 2^n) \\textrm{poly}(mn)$ time. Our techniques generalize to minimizing $\\|U \\cdot V - A\\|_p$ for $p \\geq 1$, in $2^{O(k^{\\lceil p/2 \\rceil + 1}\\log k)} \\textrm{poly}(mn)$ time. For $p = 1$, this has a graph-theoretic consequence, namely, a $2^{O(k^2)} \\poly(mn)$-time algorithm to approximate a graph as a union of disjoint bicliques. 
In the second setting, $U \\cdot V$ is over $\\GF(2)$, and we give a bicriteria constant-factor approximation algorithm that runs in $2^{O(k^3)} \\poly(mn)$ time to find binary rank-$O(k \\log m)$ matrices $U$, $V$ whose cost is as good as the best rank-$k$ approximation, improving upon $\\min(2^{2^k}mn, \\min(m,n)^{k^{O(1)}} \\textrm{poly}(mn))$ time.}\n}", "pdf": "http://proceedings.mlr.press/v97/kumar19a/kumar19a.pdf", "supp": "", "pdf_size": 370442, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11706854248671931704&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Google; Google; Google; CMU + Google + Simons Institute for the Theory of Computing", "aff_domain": "cs.cmu.edu; ; ; ", "email": "cs.cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/kumar19a.html", "aff_unique_index": "0;0;0;1+0+2", "aff_unique_norm": "Google;Carnegie Mellon University;Simons Institute for the Theory of Computing", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.cmu.edu;https://simons.berkeley.edu", "aff_unique_abbr": "Google;CMU;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0+0+0", "aff_country_unique": "United States" }, { "title": "Faster Attend-Infer-Repeat with Tractable Probabilistic Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4081", "id": "4081", "author_site": "Karl Stelzner, Robert Peharz, Kristian Kersting", "author": "Karl Stelzner; Robert Peharz; Kristian Kersting", "abstract": "The recent Attend-Infer-Repeat (AIR) framework marks a milestone in structured probabilistic modeling, as it tackles the challenging problem of unsupervised scene understanding via Bayesian inference. AIR expresses the composition of visual scenes from individual objects, and uses variational autoencoders to model the appearance of those objects. However, inference in the overall model is highly intractable, which hampers its learning speed and makes it prone to suboptimal solutions. In this paper, we show that the speed and robustness of learning in AIR can be considerably improved by replacing the intractable object representations with tractable probabilistic models. In particular, we opt for sum-product networks (SPNs), expressive deep probabilistic models with a rich set of tractable inference routines. 
The resulting model, called SuPAIR, learns an order of magnitude faster than AIR, treats object occlusions in a consistent manner, and allows for the inclusion of a background noise model, improving the robustness of Bayesian scene understanding.", "bibtex": "@InProceedings{pmlr-v97-stelzner19a,\n title = \t {Faster Attend-Infer-Repeat with Tractable Probabilistic Models},\n author = {Stelzner, Karl and Peharz, Robert and Kersting, Kristian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5966--5975},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/stelzner19a/stelzner19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/stelzner19a.html},\n abstract = \t {The recent Attend-Infer-Repeat (AIR) framework marks a milestone in structured probabilistic modeling, as it tackles the challenging problem of unsupervised scene understanding via Bayesian inference. AIR expresses the composition of visual scenes from individual objects, and uses variational autoencoders to model the appearance of those objects. However, inference in the overall model is highly intractable, which hampers its learning speed and makes it prone to suboptimal solutions. In this paper, we show that the speed and robustness of learning in AIR can be considerably improved by replacing the intractable object representations with tractable probabilistic models. In particular, we opt for sum-product networks (SPNs), expressive deep probabilistic models with a rich set of tractable inference routines. The resulting model, called SuPAIR, learns an order of magnitude faster than AIR, treats object occlusions in a consistent manner, and allows for the inclusion of a background noise model, improving the robustness of Bayesian scene understanding.}\n}", "pdf": "http://proceedings.mlr.press/v97/stelzner19a/stelzner19a.pdf", "supp": "", "pdf_size": 2417069, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5945379216326385822&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "CS Dept., TU Darmstadt, Darmstadt, Germany; Eng. Dept. 
(CBL), University of Cambridge, UK; Centre for Cognitive Science, TU Darmstadt, Darmstadt, Germany", "aff_domain": "cs.tu-darmstadt.de; ; ", "email": "cs.tu-darmstadt.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/stelzner19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt;University of Cambridge", "aff_unique_dep": "Computer Science Department;Engineering Department", "aff_unique_url": "https://www.tu-darmstadt.de;https://www.cam.ac.uk", "aff_unique_abbr": "TU Darmstadt;Cambridge", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Darmstadt;Cambridge", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Faster Stochastic Alternating Direction Method of Multipliers for Nonconvex Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3615", "id": "3615", "author_site": "Feihu Huang, Songcan Chen, Heng Huang", "author": "Feihu Huang; Songcan Chen; Heng Huang", "abstract": "In this paper, we propose a faster stochastic alternating direction method of multipliers (ADMM) for nonconvex optimization by using a new stochastic path-integrated differential estimator (SPIDER), called as SPIDER-ADMM. Moreover, we prove that the SPIDER-ADMM achieves a record-breaking incremental first-order oracle (IFO) complexity for finding an $\\epsilon$-approximate solution. As one of major contribution of this paper, we provide a new theoretical analysis framework for nonconvex stochastic ADMM methods with providing the optimal IFO complexity. Based on this new analysis framework, we study the unsolved optimal IFO complexity of the existing non-convex SVRG-ADMM and SAGA-ADMM methods, and prove their the optimal IFO complexity. Thus, the SPIDER-ADMM improves the existing stochastic ADMM methods. Moreover, we extend SPIDER-ADMM to the online setting, and propose a faster online SPIDER-ADMM. Our theoretical analysis also derives the IFO complexity of the online SPIDER-ADMM. Finally, the experimental results on benchmark datasets validate that the proposed algorithms have faster convergence rate than the existing ADMM algorithms for nonconvex optimization.", "bibtex": "@InProceedings{pmlr-v97-huang19a,\n title = \t {Faster Stochastic Alternating Direction Method of Multipliers for Nonconvex Optimization},\n author = {Huang, Feihu and Chen, Songcan and Huang, Heng},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2839--2848},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/huang19a/huang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/huang19a.html},\n abstract = \t {In this paper, we propose a faster stochastic alternating direction method of multipliers (ADMM) for nonconvex optimization by using a new stochastic path-integrated differential estimator (SPIDER), called as SPIDER-ADMM. Moreover, we prove that the SPIDER-ADMM achieves a record-breaking incremental first-order oracle (IFO) complexity for finding an $\\epsilon$-approximate solution. As one of major contribution of this paper, we provide a new theoretical analysis framework for nonconvex stochastic ADMM methods with providing the optimal IFO complexity. 
Based on this new analysis framework, we study the unsolved optimal IFO complexity of the existing non-convex SVRG-ADMM and SAGA-ADMM methods, and prove their the optimal IFO complexity. Thus, the SPIDER-ADMM improves the existing stochastic ADMM methods. Moreover, we extend SPIDER-ADMM to the online setting, and propose a faster online SPIDER-ADMM. Our theoretical analysis also derives the IFO complexity of the online SPIDER-ADMM. Finally, the experimental results on benchmark datasets validate that the proposed algorithms have faster convergence rate than the existing ADMM algorithms for nonconvex optimization.}\n}", "pdf": "http://proceedings.mlr.press/v97/huang19a/huang19a.pdf", "supp": "", "pdf_size": 2082103, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3761436661110647288&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Electrical & Computer Engineering, University of Pittsburgh, PA 15261, USA; College of Computer Science & Technology, Nanjing University of Aeronautics and Astronautics, Nanjing 211106, China + MIIT Key Laboratory of Pattern Analysis & Machine Intelligence; Department of Electrical & Computer Engineering, University of Pittsburgh, PA 15261, USA + JD Finance America Corporation", "aff_domain": "pitt.edu; ;pitt.edu", "email": "pitt.edu; ;pitt.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/huang19a.html", "aff_unique_index": "0;1+2;0+3", "aff_unique_norm": "University of Pittsburgh;Nanjing University of Aeronautics and Astronautics;MIIT;JD", "aff_unique_dep": "Department of Electrical & Computer Engineering;College of Computer Science & Technology;Key Laboratory of Pattern Analysis & Machine Intelligence;JD Finance America Corporation", "aff_unique_url": "https://www.pitt.edu;http://www.nuaa.edu.cn;http://www.mitt.com/;", "aff_unique_abbr": "Pitt;NUAA;MIIT;", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Pittsburgh;Nanjing;", "aff_country_unique_index": "0;1+1;0+0", "aff_country_unique": "United States;China" }, { "title": "Fault Tolerance in Iterative-Convergent Machine Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4193", "id": "4193", "author_site": "Aurick Qiao, Bryon Aragam, Bingjing Zhang, Eric Xing", "author": "Aurick Qiao; Bryon Aragam; Bingjing Zhang; Eric Xing", "abstract": "Machine learning (ML) training algorithms often possess an inherent self-correcting behavior due to their iterative- convergent nature. Recent systems exploit this property to achieve adaptability and efficiency in unreliable computing environments by relaxing the consistency of execution and allowing calculation errors to be self-corrected during training. However, the behavior of such systems are only well understood for specific types of calculation errors, such as those caused by staleness, reduced precision, or asynchronicity, and for specific algorithms, such as stochastic gradient descent. In this paper, we develop a general framework to quantify the effects of calculation errors on iterative-convergent algorithms. We then use this framework to derive a worst-case upper bound on the cost of arbitrary perturbations to model parameters during training and to design new strategies for checkpoint-based fault tolerance. 
Our system, SCAR, can reduce the cost of partial failures by 78%{\u2013}95% when compared with traditional checkpoint-based fault tolerance across a variety of ML models and training algorithms, providing near-optimal performance in recovering from failures.", "bibtex": "@InProceedings{pmlr-v97-qiao19a,\n title = \t {Fault Tolerance in Iterative-Convergent Machine Learning},\n author = {Qiao, Aurick and Aragam, Bryon and Zhang, Bingjing and Xing, Eric},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5220--5230},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/qiao19a/qiao19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/qiao19a.html},\n abstract = \t {Machine learning (ML) training algorithms often possess an inherent self-correcting behavior due to their iterative- convergent nature. Recent systems exploit this property to achieve adaptability and efficiency in unreliable computing environments by relaxing the consistency of execution and allowing calculation errors to be self-corrected during training. However, the behavior of such systems are only well understood for specific types of calculation errors, such as those caused by staleness, reduced precision, or asynchronicity, and for specific algorithms, such as stochastic gradient descent. In this paper, we develop a general framework to quantify the effects of calculation errors on iterative-convergent algorithms. We then use this framework to derive a worst-case upper bound on the cost of arbitrary perturbations to model parameters during training and to design new strategies for checkpoint-based fault tolerance. 
Our system, SCAR, can reduce the cost of partial failures by 78%{\u2013}95% when compared with traditional checkpoint-based fault tolerance across a variety of ML models and training algorithms, providing near-optimal performance in recovering from failures.}\n}", "pdf": "http://proceedings.mlr.press/v97/qiao19a/qiao19a.pdf", "supp": "", "pdf_size": 1091920, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6527154743755385468&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Petuum, Inc., Pittsburgh, Pennsylvania, USA+Computer Science Department, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA+Machine Learning Department, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA; Machine Learning Department, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA; Computer Science Department, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA; Computer Science Department, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA+Machine Learning Department, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA", "aff_domain": "petuum.com; ; ; ", "email": "petuum.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/qiao19a.html", "aff_unique_index": "0+1+1;1;1;1+1", "aff_unique_norm": "Petuum, Inc.;Carnegie Mellon University", "aff_unique_dep": ";Computer Science Department", "aff_unique_url": "https://www.petuum.com;https://www.cmu.edu", "aff_unique_abbr": ";CMU", "aff_campus_unique_index": "1+1;1;1;1+1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0+0+0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Feature Grouping as a Stochastic Regularizer for High-Dimensional Structured Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4127", "id": "4127", "author_site": "Sergul Aydore, Thirion Bertrand, Gael Varoquaux", "author": "Sergul Aydore; Bertrand Thirion; Gael Varoquaux", "abstract": "In many applications where collecting data is expensive, for example neuroscience or medical imaging, the sample size is typically small compared to the feature dimension. These datasets call for intelligent regularization that exploits known structure, such as correlations between the features arising from the measurement device. However, existing structured regularizers need specially crafted solvers, which are difficult to apply to complex models. We propose a new regularizer specifically designed to leverage structure in the data in a way that can be applied efficiently to complex models. Our approach relies on feature grouping, using a fast clustering algorithm inside a stochastic gradient descent loop: given a family of feature groupings that capture feature covariations, we randomly select these groups at each iteration. 
Experiments on two real-world datasets demonstrate that the proposed approach produces models that generalize better than those trained with conventional regularizers, and also improves convergence speed, and has a linear computational cost.", "bibtex": "@InProceedings{pmlr-v97-aydore19a,\n title = \t {Feature Grouping as a Stochastic Regularizer for High-Dimensional Structured Data},\n author = {Aydore, Sergul and Thirion, Bertrand and Varoquaux, Gael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {385--394},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/aydore19a/aydore19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/aydore19a.html},\n abstract = \t {In many applications where collecting data is expensive, for example neuroscience or medical imaging, the sample size is typically small compared to the feature dimension. These datasets call for intelligent regularization that exploits known structure, such as correlations between the features arising from the measurement device. However, existing structured regularizers need specially crafted solvers, which are difficult to apply to complex models. We propose a new regularizer specifically designed to leverage structure in the data in a way that can be applied efficiently to complex models. Our approach relies on feature grouping, using a fast clustering algorithm inside a stochastic gradient descent loop: given a family of feature groupings that capture feature covariations, we randomly select these groups at each iteration. Experiments on two real-world datasets demonstrate that the proposed approach produces models that generalize better than those trained with conventional regularizers, and also improves convergence speed, and has a linear computational cost.}\n}", "pdf": "http://proceedings.mlr.press/v97/aydore19a/aydore19a.pdf", "supp": "", "pdf_size": 1911923, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11613171711375782355&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Stevens Institute of Technology, New Jersey, USA; Inria Saclay, Palaiseau, France; Inria Saclay, Palaiseau, France", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/aydore19a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Stevens Institute of Technology;INRIA", "aff_unique_dep": ";", "aff_unique_url": "https://www.stevens.edu;https://www.inria.fr", "aff_unique_abbr": "SIT;Inria", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "New Jersey;Saclay", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;France" }, { "title": "Feature-Critic Networks for Heterogeneous Domain Generalization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4052", "id": "4052", "author_site": "Yiying Li, Yongxin Yang, Wei Zhou, Timothy Hospedales", "author": "Yiying Li; Yongxin Yang; Wei Zhou; Timothy Hospedales", "abstract": "The well known domain shift issue causes model performance to degrade when deployed to a new target domain with different statistics to training. Domain adaptation techniques alleviate this, but need some instances from the target domain to drive adaptation. 
Domain generalisation is the recently topical problem of learning a model that generalises to unseen domains out of the box, and various approaches aim to train a domain-invariant feature extractor, typically by adding some manually designed losses. In this work, we propose a learning to learn approach, where the auxiliary loss that helps generalisation is itself learned. Beyond conventional domain generalisation, we consider a more challenging setting of heterogeneous domain generalisation, where the unseen domains do not share label space with the seen ones, and the goal is to train a feature representation that is useful off-the-shelf for novel data and novel categories. Experimental evaluation demonstrates that our method outperforms state-of-the-art solutions in both settings.", "bibtex": "@InProceedings{pmlr-v97-li19l,\n title = \t {Feature-Critic Networks for Heterogeneous Domain Generalization},\n author = {Li, Yiying and Yang, Yongxin and Zhou, Wei and Hospedales, Timothy},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3915--3924},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19l/li19l.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19l.html},\n abstract = \t {The well known domain shift issue causes model performance to degrade when deployed to a new target domain with different statistics to training. Domain adaptation techniques alleviate this, but need some instances from the target domain to drive adaptation. Domain generalisation is the recently topical problem of learning a model that generalises to unseen domains out of the box, and various approaches aim to train a domain-invariant feature extractor, typically by adding some manually designed losses. In this work, we propose a learning to learn approach, where the auxiliary loss that helps generalisation is itself learned. Beyond conventional domain generalisation, we consider a more challenging setting of heterogeneous domain generalisation, where the unseen domains do not share label space with the seen ones, and the goal is to train a feature representation that is useful off-the-shelf for novel data and novel categories. 
Experimental evaluation demonstrates that our method outperforms state-of-the-art solutions in both settings.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19l/li19l.pdf", "supp": "", "pdf_size": 507735, "gs_citation": 326, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15160705294700481017&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "College of Computer, National University of Defense Technology, Hunan, China+School of Informatics, The University of Edinburgh, Edinburgh, UK; School of Informatics, The University of Edinburgh, Edinburgh, UK; College of Computer, National University of Defense Technology, Hunan, China; School of Informatics, The University of Edinburgh, Edinburgh, UK+Samsung AI Centre, Cambridge, UK", "aff_domain": "nudt.edu.cn;ed.ac.uk;nudt.edu.cn;ed.ac.uk", "email": "nudt.edu.cn;ed.ac.uk;nudt.edu.cn;ed.ac.uk", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/li19l.html", "aff_unique_index": "0+1;1;0;1+2", "aff_unique_norm": "National University of Defense Technology;University of Edinburgh;Samsung", "aff_unique_dep": "College of Computer;School of Informatics;AI Centre", "aff_unique_url": ";https://www.ed.ac.uk;https://www.samsung.com/uk/", "aff_unique_abbr": ";Edinburgh;", "aff_campus_unique_index": "1;1;1+2", "aff_campus_unique": ";Edinburgh;Cambridge", "aff_country_unique_index": "0+1;1;0;1+1", "aff_country_unique": "China;United Kingdom" }, { "title": "Finding Mixed Nash Equilibria of Generative Adversarial Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4102", "id": "4102", "author_site": "Ya-Ping Hsieh, Chen Liu, Volkan Cevher", "author": "Ya-Ping Hsieh; Chen Liu; Volkan Cevher", "abstract": "Generative adversarial networks (GANs) are known to achieve the state-of-the-art performance on various generative tasks, but these results come at the expense of a notoriously difficult training phase. Current training strategies typically draw a connection to optimization theory, whose scope is restricted to local convergence due to the presence of non-convexity. In this work, we tackle the training of GANs by rethinking the problem formulation from the mixed Nash Equilibria (NE) perspective. Via a classical lifting trick, we show that essentially all existing GAN objectives can be relaxed into their mixed strategy forms, whose global optima can be solved via sampling, in contrast to the exclusive use of optimization framework in previous work. We further propose a mean-approximation sampling scheme, which allows to systematically exploit methods for bi-affine games to delineate novel, practical training algorithms of GANs. 
Finally, we provide experimental evidence that our approach yields comparable or superior results to contemporary training algorithms, and outperforms classical methods such as SGD, Adam, and RMSProp.", "bibtex": "@InProceedings{pmlr-v97-hsieh19b,\n title = \t {Finding Mixed {N}ash Equilibria of Generative Adversarial Networks},\n author = {Hsieh, Ya-Ping and Liu, Chen and Cevher, Volkan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2810--2819},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hsieh19b/hsieh19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/hsieh19b.html},\n abstract = \t {Generative adversarial networks (GANs) are known to achieve the state-of-the-art performance on various generative tasks, but these results come at the expense of a notoriously difficult training phase. Current training strategies typically draw a connection to optimization theory, whose scope is restricted to local convergence due to the presence of non-convexity. In this work, we tackle the training of GANs by rethinking the problem formulation from the mixed Nash Equilibria (NE) perspective. Via a classical lifting trick, we show that essentially all existing GAN objectives can be relaxed into their mixed strategy forms, whose global optima can be solved via sampling, in contrast to the exclusive use of optimization framework in previous work. We further propose a mean-approximation sampling scheme, which allows to systematically exploit methods for bi-affine games to delineate novel, practical training algorithms of GANs. Finally, we provide experimental evidence that our approach yields comparable or superior results to contemporary training algorithms, and outperforms classical methods such as SGD, Adam, and RMSProp.}\n}", "pdf": "http://proceedings.mlr.press/v97/hsieh19b/hsieh19b.pdf", "supp": "", "pdf_size": 3153098, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14920056197615352388&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16, "aff": "LIONS, EPFL, Switzerland; LIONS, EPFL, Switzerland; LIONS, EPFL, Switzerland", "aff_domain": "epfl.ch; ; ", "email": "epfl.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/hsieh19b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "LIONS", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Finding Options that Minimize Planning Time", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4275", "id": "4275", "author_site": "Yuu Jinnai, David Abel, David Hershkowitz, Michael L. Littman, George Konidaris", "author": "Yuu Jinnai; David Abel; David Hershkowitz; Michael Littman; George Konidaris", "abstract": "We formalize the problem of selecting the optimal set of options for planning as that of computing the smallest set of options so that planning converges in less than a given maximum of value-iteration passes. We first show that the problem is $\\NP$-hard, even if the task is constrained to be deterministic\u2014the first such complexity result for option discovery. 
We then present the first polynomial-time boundedly suboptimal approximation algorithm for this setting, and empirically evaluate it against both the optimal options and a representative collection of heuristic approaches in simple grid-based domains.", "bibtex": "@InProceedings{pmlr-v97-jinnai19a,\n title = \t {Finding Options that Minimize Planning Time},\n author = {Jinnai, Yuu and Abel, David and Hershkowitz, David and Littman, Michael and Konidaris, George},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3120--3129},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jinnai19a/jinnai19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jinnai19a.html},\n abstract = \t {We formalize the problem of selecting the optimal set of options for planning as that of computing the smallest set of options so that planning converges in less than a given maximum of value-iteration passes. We first show that the problem is $\\NP$-hard, even if the task is constrained to be deterministic\u2014the first such complexity result for option discovery. We then present the first polynomial-time boundedly suboptimal approximation algorithm for this setting, and empirically evaluate it against both the optimal options and a representative collection of heuristic approaches in simple grid-based domains.}\n}", "pdf": "http://proceedings.mlr.press/v97/jinnai19a/jinnai19a.pdf", "supp": "", "pdf_size": 2030890, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3808430920273074074&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 17, "aff": "Brown University; Brown University; Carnegie Mellon University; Brown University; Brown University", "aff_domain": "brown.edu; ; ; ; ", "email": "brown.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/jinnai19a.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Brown University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.brown.edu;https://www.cmu.edu", "aff_unique_abbr": "Brown;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fine-Grained Analysis of Optimization and Generalization for Overparameterized Two-Layer Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3996", "id": "3996", "author_site": "Sanjeev Arora, Simon Du, Wei Hu, Zhiyuan Li, Ruosong Wang", "author": "Sanjeev Arora; Simon Du; Wei Hu; Zhiyuan Li; Ruosong Wang", "abstract": "Recent works have cast some light on the mystery of why deep nets fit any data and generalize despite being very overparametrized. This paper analyzes training and generalization for a simple 2-layer ReLU net with random initialization, and provides the following improvements over recent works: (i) Using a tighter characterization of training speed than recent papers, an explanation for why training a neural net with random labels leads to slower training, as originally observed in [Zhang et al. ICLR\u201917]. (ii) Generalization bound independent of network size, using a data-dependent complexity measure. 
Our measure distinguishes clearly between random labels and true labels on MNIST and CIFAR, as shown by experiments. Moreover, recent papers require sample complexity to increase (slowly) with the size, while our sample complexity is completely independent of the network size. (iii) Learnability of a broad class of smooth functions by 2-layer ReLU nets trained via gradient descent. The key idea is to track dynamics of training and generalization via properties of a related kernel.", "bibtex": "@InProceedings{pmlr-v97-arora19a,\n title = \t {Fine-Grained Analysis of Optimization and Generalization for Overparameterized Two-Layer Neural Networks},\n author = {Arora, Sanjeev and Du, Simon and Hu, Wei and Li, Zhiyuan and Wang, Ruosong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {322--332},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/arora19a/arora19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/arora19a.html},\n abstract = \t {Recent works have cast some light on the mystery of why deep nets fit any data and generalize despite being very overparametrized. This paper analyzes training and generalization for a simple 2-layer ReLU net with random initialization, and provides the following improvements over recent works: (i) Using a tighter characterization of training speed than recent papers, an explanation for why training a neural net with random labels leads to slower training, as originally observed in [Zhang et al. ICLR\u201917]. (ii) Generalization bound independent of network size, using a data-dependent complexity measure. Our measure distinguishes clearly between random labels and true labels on MNIST and CIFAR, as shown by experiments. Moreover, recent papers require sample complexity to increase (slowly) with the size, while our sample complexity is completely independent of the network size. (iii) Learnability of a broad class of smooth functions by 2-layer ReLU nets trained via gradient descent. 
The key idea is to track dynamics of training and generalization via properties of a related kernel.}\n}", "pdf": "http://proceedings.mlr.press/v97/arora19a/arora19a.pdf", "supp": "", "pdf_size": 1164176, "gs_citation": 1183, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14197482372306728106&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Princeton University, Princeton, NJ, USA+Institute for Advanced Study, Princeton, NJ, USA; Carnegie Mellon University, Pittsburgh, PA, USA; Princeton University, Princeton, NJ, USA; Princeton University, Princeton, NJ, USA; Carnegie Mellon University, Pittsburgh, PA, USA", "aff_domain": "cs.princeton.edu;cs.cmu.edu;cs.princeton.edu;cs.princeton.edu;cs.cmu.edu", "email": "cs.princeton.edu;cs.cmu.edu;cs.princeton.edu;cs.princeton.edu;cs.cmu.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/arora19a.html", "aff_unique_index": "0+1;2;0;0;2", "aff_unique_norm": "Princeton University;Institute for Advanced Study;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.princeton.edu;https://www.ias.edu;https://www.cmu.edu", "aff_unique_abbr": "Princeton;IAS;CMU", "aff_campus_unique_index": "0+0;1;0;0;1", "aff_campus_unique": "Princeton;Pittsburgh", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Fingerprint Policy Optimisation for Robust Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3743", "id": "3743", "author_site": "Supratik Paul, Michael A Osborne, Shimon Whiteson", "author": "Supratik Paul; Michael A. Osborne; Shimon Whiteson", "abstract": "Policy gradient methods ignore the potential value of adjusting environment variables: unobservable state features that are randomly determined by the environment in a physical setting, but are controllable in a simulator. This can lead to slow learning, or convergence to suboptimal policies, if the environment variable has a large impact on the transition dynamics. In this paper, we present fingerprint policy optimisation (FPO), which finds a policy that is optimal in expectation across the distribution of environment variables. The central idea is to use Bayesian optimisation (BO) to actively select the distribution of the environment variable that maximises the improvement generated by each iteration of the policy gradient method. To make this BO practical, we contribute two easy-to-compute low-dimensional fingerprints of the current policy. Our experiments show that FPO can efficiently learn policies that are robust to significant rare events, which are unlikely to be observable under random sampling, but are key to learning good policies.", "bibtex": "@InProceedings{pmlr-v97-paul19a,\n title = \t {Fingerprint Policy Optimisation for Robust Reinforcement Learning},\n author = {Paul, Supratik and Osborne, Michael A. 
and Whiteson, Shimon},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5082--5091},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/paul19a/paul19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/paul19a.html},\n abstract = \t {Policy gradient methods ignore the potential value of adjusting environment variables: unobservable state features that are randomly determined by the environment in a physical setting, but are controllable in a simulator. This can lead to slow learning, or convergence to suboptimal policies, if the environment variable has a large impact on the transition dynamics. In this paper, we present fingerprint policy optimisation (FPO), which finds a policy that is optimal in expectation across the distribution of environment variables. The central idea is to use Bayesian optimisation (BO) to actively select the distribution of the environment variable that maximises the improvement generated by each iteration of the policy gradient method. To make this BO practical, we contribute two easy-to-compute low-dimensional fingerprints of the current policy. Our experiments show that FPO can efficiently learn policies that are robust to significant rare events, which are unlikely to be observable under random sampling, but are key to learning good policies.}\n}", "pdf": "http://proceedings.mlr.press/v97/paul19a/paul19a.pdf", "supp": "", "pdf_size": 587816, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8223030144810823403&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "Department of Computer Science, University of Oxford, UK; Department of Engineering Science, University of Oxford, UK; Department of Computer Science, University of Oxford, UK", "aff_domain": "cs.ox.ac.uk; ; ", "email": "cs.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/paul19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Finite-Time Analysis of Distributed TD(0) with Linear Function Approximation on Multi-Agent Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3727", "id": "3727", "author_site": "Thinh Doan, Siva Maguluri, Justin Romberg", "author": "Thinh Doan; Siva Maguluri; Justin Romberg", "abstract": "We study the policy evaluation problem in multi-agent reinforcement learning. In this problem, a group of agents works cooperatively to evaluate the value function for the global discounted accumulative reward problem, which is composed of local rewards observed by the agents. Over a series of time steps, the agents act, get rewarded, update their local estimate of the value function, then communicate with their neighbors. The local update at each agent can be interpreted as a distributed consensus-based variant of the popular temporal difference learning algorithm TD(0). 
While distributed reinforcement learning algorithms have been presented in the literature, almost nothing is known about their convergence rate. Our main contribution is providing a finite-time analysis for the convergence of the distributed TD(0) algorithm. We do this when the communication network between the agents is time-varying in general. We obtain an explicit upper bound on the rate of convergence of this algorithm as a function of the network topology and the discount factor. Our results mirror what we would expect from using distributed stochastic gradient descent for solving convex optimization problems.", "bibtex": "@InProceedings{pmlr-v97-doan19a,\n title = \t {Finite-Time Analysis of Distributed {TD}(0) with Linear Function Approximation on Multi-Agent Reinforcement Learning},\n author = {Doan, Thinh and Maguluri, Siva and Romberg, Justin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1626--1635},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/doan19a/doan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/doan19a.html},\n abstract = \t {We study the policy evaluation problem in multi-agent reinforcement learning. In this problem, a group of agents works cooperatively to evaluate the value function for the global discounted accumulative reward problem, which is composed of local rewards observed by the agents. Over a series of time steps, the agents act, get rewarded, update their local estimate of the value function, then communicate with their neighbors. The local update at each agent can be interpreted as a distributed consensus-based variant of the popular temporal difference learning algorithm TD(0). While distributed reinforcement learning algorithms have been presented in the literature, almost nothing is known about their convergence rate. Our main contribution is providing a finite-time analysis for the convergence of the distributed TD(0) algorithm. We do this when the communication network between the agents is time-varying in general. We obtain an explicit upper bound on the rate of convergence of this algorithm as a function of the network topology and the discount factor. 
Our results mirror what we would expect from using distributed stochastic gradient descent for solving convex optimization problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/doan19a/doan19a.pdf", "supp": "", "pdf_size": 352255, "gs_citation": 170, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7772389988885351971&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Industrial and Systems Engineering + School of Electrical and Computer Engineering, Georgia Institute of Technology, GA, 30332, USA; School of Industrial and Systems Engineering, Georgia Institute of Technology, GA, 30332, USA; School of Electrical and Computer Engineering, Georgia Institute of Technology, GA, 30332, USA", "aff_domain": "gatech.edu; ; ", "email": "gatech.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/doan19a.html", "aff_unique_index": "0+0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "School of Industrial and Systems Engineering", "aff_unique_url": "https://www.isye.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Georgia Tech", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "First-Order Adversarial Vulnerability of Neural Networks and Input Dimension", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4017", "id": "4017", "author_site": "Carl-Johann Simon-Gabriel, Yann Ollivier, Leon Bottou, Bernhard Sch\u00f6lkopf, David Lopez-Paz", "author": "Carl-Johann Simon-Gabriel; Yann Ollivier; Leon Bottou; Bernhard Sch\u00f6lkopf; David Lopez-Paz", "abstract": "Over the past few years, neural networks were proven vulnerable to adversarial images: targeted but imperceptible image perturbations lead to drastically different predictions. We show that adversarial vulnerability increases with the gradients of the training objective when viewed as a function of the inputs. Surprisingly, vulnerability does not depend on network topology: for many standard network architectures, we prove that at initialization, the L1-norm of these gradients grows as the square root of the input dimension, leaving the networks increasingly vulnerable with growing image size. We empirically show that this dimension-dependence persists after either usual or robust training, but gets attenuated with higher regularization.", "bibtex": "@InProceedings{pmlr-v97-simon-gabriel19a,\n title = \t {First-Order Adversarial Vulnerability of Neural Networks and Input Dimension},\n author = {Simon-Gabriel, Carl-Johann and Ollivier, Yann and Bottou, Leon and Sch{\\\"o}lkopf, Bernhard and Lopez-Paz, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5809--5817},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/simon-gabriel19a/simon-gabriel19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/simon-gabriel19a.html},\n abstract = \t {Over the past few years, neural networks were proven vulnerable to adversarial images: targeted but imperceptible image perturbations lead to drastically different predictions. We show that adversarial vulnerability increases with the gradients of the training objective when viewed as a function of the inputs. 
Surprisingly, vulnerability does not depend on network topology: for many standard network architectures, we prove that at initialization, the L1-norm of these gradients grows as the square root of the input dimension, leaving the networks increasingly vulnerable with growing image size. We empirically show that this dimension-dependence persists after either usual or robust training, but gets attenuated with higher regularization.}\n}", "pdf": "http://proceedings.mlr.press/v97/simon-gabriel19a/simon-gabriel19a.pdf", "supp": "", "pdf_size": 503973, "gs_citation": 130, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=577929050796401765&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Empirical Inference Department, Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany+Facebook AI Research, Paris/New York; Facebook AI Research, Paris/New York; Empirical Inference Department, Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany+Facebook AI Research, Paris/New York; Facebook AI Research, Paris/New York; Facebook AI Research, Paris/New York", "aff_domain": "tue.mpg.de; ; ; ; ", "email": "tue.mpg.de; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/simon-gabriel19a.html", "aff_unique_index": "0+1;1;0+1;1;1", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Meta", "aff_unique_dep": "Empirical Inference Department;Facebook AI Research", "aff_unique_url": "https://www.mpituebingen.mpg.de;https://research.facebook.com", "aff_unique_abbr": "MPI-IS;FAIR", "aff_campus_unique_index": "0+1;1;0+1;1;1", "aff_campus_unique": "T\u00fcbingen;Paris", "aff_country_unique_index": "0+1;1;0+1;1;1", "aff_country_unique": "Germany;France" }, { "title": "First-Order Algorithms Converge Faster than $O(1/k)$ on Convex Problems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3677", "id": "3677", "author_site": "Ching-pei Lee, Stephen Wright", "author": "Ching-Pei Lee; Stephen Wright", "abstract": "It is well known that both gradient descent and stochastic coordinate descent achieve a global convergence rate of $O(1/k)$ in the objective value, when applied to a scheme for minimizing a Lipschitz-continuously differentiable, unconstrained convex function. In this work, we improve this rate to $o(1/k)$. We extend the result to proximal gradient and proximal coordinate descent on regularized problems to show similar $o(1/k)$ convergence rates. 
The result is tight in the sense that a rate of $O(1/k^{1+\\epsilon})$ is not generally attainable for any $\\epsilon>0$, for any of these methods.", "bibtex": "@InProceedings{pmlr-v97-lee19e,\n title = \t {First-Order Algorithms Converge Faster than $O(1/k)$ on Convex Problems},\n author = {Lee, Ching-Pei and Wright, Stephen},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3754--3762},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lee19e/lee19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/lee19e.html},\n abstract = \t {It is well known that both gradient descent and stochastic coordinate descent achieve a global convergence rate of $O(1/k)$ in the objective value, when applied to a scheme for minimizing a Lipschitz-continuously differentiable, unconstrained convex function. In this work, we improve this rate to $o(1/k)$. We extend the result to proximal gradient and proximal coordinate descent on regularized problems to show similar $o(1/k)$ convergence rates. The result is tight in the sense that a rate of $O(1/k^{1+\\epsilon})$ is not generally attainable for any $\\epsilon>0$, for any of these methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/lee19e/lee19e.pdf", "supp": "", "pdf_size": 317637, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kW_1SqlzggcJ:scholar.google.com/&scioq=First-Order+Algorithms+Converge+Faster+than+%24O(1/k)%24+on+Convex+Problems&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff": "Department of Computer Sciences and Wisconsin Institute for Discovery, University of Wisconsin-Madison, Madison, Wisconsin, USA; Department of Computer Sciences and Wisconsin Institute for Discovery, University of Wisconsin-Madison, Madison, Wisconsin, USA", "aff_domain": "cs.wisc.edu;cs.wisc.edu", "email": "cs.wisc.edu;cs.wisc.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/lee19e.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Wisconsin-Madison", "aff_unique_dep": "Department of Computer Sciences", "aff_unique_url": "https://www.wisc.edu", "aff_unique_abbr": "UW-Madison", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Madison", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Flat Metric Minimization with Applications in Generative Modeling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3598", "id": "3598", "author_site": "Thomas M\u00f6llenhoff, Daniel Cremers", "author": "Thomas M\u00f6llenhoff; Daniel Cremers", "abstract": "We take the novel perspective to view data not as a probability distribution but rather as a current. Primarily studied in the field of geometric measure theory, k-currents are continuous linear functionals acting on compactly supported smooth differential forms and can be understood as a generalized notion of oriented k-dimensional manifold. By moving from distributions (which are 0-currents) to k-currents, we can explicitly orient the data by attaching a k-dimensional tangent plane to each sample point. Based on the flat metric which is a fundamental distance between currents, we derive FlatGAN, a formulation in the spirit of generative adversarial networks but generalized to k-currents. 
In our theoretical contribution we prove that the flat metric between a parametrized current and a reference current is Lipschitz continuous in the parameters. In experiments, we show that the proposed shift to k>0 leads to interpretable and disentangled latent representations which behave equivariantly to the specified oriented tangent planes.", "bibtex": "@InProceedings{pmlr-v97-mollenhoff19a,\n title = \t {Flat Metric Minimization with Applications in Generative Modeling},\n author = {M{\\\"o}llenhoff, Thomas and Cremers, Daniel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4626--4635},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mollenhoff19a/mollenhoff19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mollenhoff19a.html},\n abstract = \t {We take the novel perspective to view data not as a probability distribution but rather as a current. Primarily studied in the field of geometric measure theory, k-currents are continuous linear functionals acting on compactly supported smooth differential forms and can be understood as a generalized notion of oriented k-dimensional manifold. By moving from distributions (which are 0-currents) to k-currents, we can explicitly orient the data by attaching a k-dimensional tangent plane to each sample point. Based on the flat metric which is a fundamental distance between currents, we derive FlatGAN, a formulation in the spirit of generative adversarial networks but generalized to k-currents. In our theoretical contribution we prove that the flat metric between a parametrized current and a reference current is Lipschitz continuous in the parameters. 
In experiments, we show that the proposed shift to k>0 leads to interpretable and disentangled latent representations which behave equivariantly to the specified oriented tangent planes.}\n}", "pdf": "http://proceedings.mlr.press/v97/mollenhoff19a/mollenhoff19a.pdf", "supp": "", "pdf_size": 1092951, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16621113036066180234&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Informatics, Technical University of Munich, Garching, Germany; Department of Informatics, Technical University of Munich, Garching, Germany", "aff_domain": "tum.de; ", "email": "tum.de; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/mollenhoff19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Technical University of Munich", "aff_unique_dep": "Department of Informatics", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Garching", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Flexibly Fair Representation Learning by Disentanglement", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4254", "id": "4254", "author_site": "Elliot Creager, David Madras, Joern-Henrik Jacobsen, Marissa Weis, Kevin Swersky, Toniann Pitassi, Richard Zemel", "author": "Elliot Creager; David Madras; Joern-Henrik Jacobsen; Marissa Weis; Kevin Swersky; Toniann Pitassi; Richard Zemel", "abstract": "We consider the problem of learning representations that achieve group and subgroup fairness with respect to multiple sensitive attributes. Taking inspiration from the disentangled representation learning literature, we propose an algorithm for learning compact representations of datasets that are useful for reconstruction and prediction, but are also", "bibtex": "@InProceedings{pmlr-v97-creager19a,\n title = \t {Flexibly Fair Representation Learning by Disentanglement},\n author = {Creager, Elliot and Madras, David and Jacobsen, Joern-Henrik and Weis, Marissa and Swersky, Kevin and Pitassi, Toniann and Zemel, Richard},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1436--1445},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/creager19a/creager19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/creager19a.html},\n abstract = \t {We consider the problem of learning representations that achieve group and subgroup fairness with respect to multiple sensitive attributes. 
Taking inspiration from the disentangled representation learning literature, we propose an algorithm for learning compact representations of datasets that are useful for reconstruction and prediction, but are also", "pdf": "http://proceedings.mlr.press/v97/creager19a/creager19a.pdf", "supp": "", "pdf_size": 1272604, "gs_citation": 430, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15712582973159746533&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "University of Toronto+Vector Institute; University of Toronto+Vector Institute; Vector Institute; University of T\u00fcbingen+Vector Institute; Google Research; University of Toronto+Vector Institute; University of Toronto+Vector Institute", "aff_domain": "cs.toronto.edu; ; ; ; ; ; ", "email": "cs.toronto.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/creager19a.html", "aff_unique_index": "0+1;0+1;1;2+1;3;0+1;0+1", "aff_unique_norm": "University of Toronto;Vector Institute;University of T\u00fcbingen;Google", "aff_unique_dep": ";;;Google Research", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://www.uni-tuebingen.de/;https://research.google", "aff_unique_abbr": "U of T;Vector Institute;Uni T\u00fcbingen;Google Research", "aff_campus_unique_index": ";;;1;;", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0+0;0;1+0;2;0+0;0+0", "aff_country_unique": "Canada;Germany;United States" }, { "title": "FloWaveNet : A Generative Flow for Raw Audio", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4060", "id": "4060", "author_site": "Sungwon Kim, Sang-gil Lee, Jongyoon Song, Jaehyeon Kim, Sungroh Yoon", "author": "Sungwon Kim; Sang-Gil Lee; Jongyoon Song; Jaehyeon Kim; Sungroh Yoon", "abstract": "Most modern text-to-speech architectures use a WaveNet vocoder for synthesizing high-fidelity waveform audio, but there have been limitations, such as high inference time, in practical applications due to its ancestral sampling scheme. The recently suggested Parallel WaveNet and ClariNet has achieved real-time audio synthesis capability by incorporating inverse autoregressive flow (IAF) for parallel sampling. However, these approaches require a two-stage training pipeline with a well-trained teacher network and can only produce natural sound by using probability distillation along with heavily-engineered auxiliary loss terms. We propose FloWaveNet, a flow-based generative model for raw audio synthesis. FloWaveNet requires only a single-stage training procedure and a single maximum likelihood loss, without any additional auxiliary terms, and it is inherently parallel due to the characteristics of generative flow. The model can efficiently sample raw audio in real-time, with clarity comparable to previous two-stage parallel models. 
The code and samples for all models, including our FloWaveNet, are available on GitHub.", "bibtex": "@InProceedings{pmlr-v97-kim19b,\n title = \t {{F}lo{W}ave{N}et : A Generative Flow for Raw Audio},\n author = {Kim, Sungwon and Lee, Sang-Gil and Song, Jongyoon and Kim, Jaehyeon and Yoon, Sungroh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3370--3378},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kim19b/kim19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/kim19b.html},\n abstract = \t {Most modern text-to-speech architectures use a WaveNet vocoder for synthesizing high-fidelity waveform audio, but there have been limitations, such as high inference time, in practical applications due to its ancestral sampling scheme. The recently suggested Parallel WaveNet and ClariNet has achieved real-time audio synthesis capability by incorporating inverse autoregressive flow (IAF) for parallel sampling. However, these approaches require a two-stage training pipeline with a well-trained teacher network and can only produce natural sound by using probability distillation along with heavily-engineered auxiliary loss terms. We propose FloWaveNet, a flow-based generative model for raw audio synthesis. FloWaveNet requires only a single-stage training procedure and a single maximum likelihood loss, without any additional auxiliary terms, and it is inherently parallel due to the characteristics of generative flow. The model can efficiently sample raw audio in real-time, with clarity comparable to previous two-stage parallel models. 
The code and samples for all models, including our FloWaveNet, are available on GitHub.}\n}", "pdf": "http://proceedings.mlr.press/v97/kim19b/kim19b.pdf", "supp": "", "pdf_size": 8754242, "gs_citation": 221, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6708907651291228140&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Electrical and Computer Engineering, Seoul National University, Seoul, Korea; Electrical and Computer Engineering, Seoul National University, Seoul, Korea; Electrical and Computer Engineering, Seoul National University, Seoul, Korea; Kakao Corporation; ASRI, INMC, Institute of Engineering Research, Seoul National University, Seoul, Korea", "aff_domain": "snu.ac.kr; ; ; ;snu.ac.kr", "email": "snu.ac.kr; ; ; ;snu.ac.kr", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/kim19b.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Seoul National University;Kakao Corporation", "aff_unique_dep": "Electrical and Computer Engineering;", "aff_unique_url": "https://www.snu.ac.kr;https://www.kakao.com", "aff_unique_abbr": "SNU;Kakao", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Flow++: Improving Flow-Based Generative Models with Variational Dequantization and Architecture Design", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3759", "id": "3759", "author_site": "Jonathan Ho, Peter Chen, Aravind Srinivas, Rocky Duan, Pieter Abbeel", "author": "Jonathan Ho; Xi Chen; Aravind Srinivas; Yan Duan; Pieter Abbeel", "abstract": "Flow-based generative models are powerful exact likelihood models with efficient sampling and inference. Despite their computational efficiency, flow-based models generally have much worse density modeling performance compared to state-of-the-art autoregressive models. In this paper, we investigate and improve upon three limiting design choices employed by flow-based models in prior work: the use of uniform noise for dequantization, the use of inexpressive affine flows, and the use of purely convolutional conditioning networks in coupling layers. Based on our findings, we propose Flow++, a new flow-based model that is now the state-of-the-art non-autoregressive model for unconditional density estimation on standard image benchmarks. Our work has begun to close the significant performance gap that has so far existed between autoregressive models and flow-based models.", "bibtex": "@InProceedings{pmlr-v97-ho19a,\n title = \t {Flow++: Improving Flow-Based Generative Models with Variational Dequantization and Architecture Design},\n author = {Ho, Jonathan and Chen, Xi and Srinivas, Aravind and Duan, Yan and Abbeel, Pieter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2722--2730},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ho19a/ho19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ho19a.html},\n abstract = \t {Flow-based generative models are powerful exact likelihood models with efficient sampling and inference. Despite their computational efficiency, flow-based models generally have much worse density modeling performance compared to state-of-the-art autoregressive models. 
In this paper, we investigate and improve upon three limiting design choices employed by flow-based models in prior work: the use of uniform noise for dequantization, the use of inexpressive affine flows, and the use of purely convolutional conditioning networks in coupling layers. Based on our findings, we propose Flow++, a new flow-based model that is now the state-of-the-art non-autoregressive model for unconditional density estimation on standard image benchmarks. Our work has begun to close the significant performance gap that has so far existed between autoregressive models and flow-based models.}\n}", "pdf": "http://proceedings.mlr.press/v97/ho19a/ho19a.pdf", "supp": "", "pdf_size": 6160852, "gs_citation": 553, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7151289546461544772&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "UC Berkeley, Department of Electrical Engineering and Computer Science+covariant.ai; UC Berkeley, Department of Electrical Engineering and Computer Science+covariant.ai; UC Berkeley, Department of Electrical Engineering and Computer Science; covariant.ai; UC Berkeley, Department of Electrical Engineering and Computer Science+covariant.ai", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "email": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "github": "https://github.com/aravindsrinivas/flowpp", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/ho19a.html", "aff_unique_index": "0+1;0+1;0;1;0+1", "aff_unique_norm": "University of California, Berkeley;Covariant AI", "aff_unique_dep": "Department of Electrical Engineering and Computer Science;", "aff_unique_url": "https://www.berkeley.edu;https://www.covariant.ai", "aff_unique_abbr": "UC Berkeley;Covariant AI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0+0;0+0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Formal Privacy for Functional Data with Gaussian Perturbations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4269", "id": "4269", "author_site": "Ardalan Mirshani, Matthew Reimherr, Aleksandra Slavkovi\u0107", "author": "Ardalan Mirshani; Matthew Reimherr; Aleksandra Slavkovi\u0107", "abstract": "Motivated by the rapid rise in statistical tools in", "bibtex": "@InProceedings{pmlr-v97-mirshani19a,\n title = \t {Formal Privacy for Functional Data with {G}aussian Perturbations},\n author = {Mirshani, Ardalan and Reimherr, Matthew and Slavkovi{\\'c}, Aleksandra},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4595--4604},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mirshani19a/mirshani19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mirshani19a.html},\n abstract = \t {Motivated by the rapid rise in statistical tools in", "pdf": "http://proceedings.mlr.press/v97/mirshani19a/mirshani19a.pdf", "supp": "", "pdf_size": 2295779, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12248311817571270138&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Statistics, Pennsylvania State University, State College, PA, USA; Department of Statistics, Pennsylvania State University, State College, 
PA, USA; Department of Statistics, Pennsylvania State University, State College, PA, USA", "aff_domain": "psu.edu; ; ", "email": "psu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mirshani19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Pennsylvania State University", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.psu.edu", "aff_unique_abbr": "PSU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "State College", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Functional Transparency for Structured Data: a Game-Theoretic Approach", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4259", "id": "4259", "author_site": "Guang-He Lee, Wengong Jin, David Alvarez-Melis, Tommi Jaakkola", "author": "Guang-He Lee; Wengong Jin; David Alvarez-Melis; Tommi Jaakkola", "abstract": "We provide a new approach to training neural models to exhibit transparency in a well-defined, functional manner. Our approach naturally operates over structured data and tailors the predictor, functionally, towards a chosen family of (local) witnesses. The estimation problem is setup as a co-operative game between an unrestricted", "bibtex": "@InProceedings{pmlr-v97-lee19b,\n title = \t {Functional Transparency for Structured Data: a Game-Theoretic Approach},\n author = {Lee, Guang-He and Jin, Wengong and Alvarez-Melis, David and Jaakkola, Tommi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3723--3733},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lee19b/lee19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/lee19b.html},\n abstract = \t {We provide a new approach to training neural models to exhibit transparency in a well-defined, functional manner. Our approach naturally operates over structured data and tailors the predictor, functionally, towards a chosen family of (local) witnesses. 
The estimation problem is setup as a co-operative game between an unrestricted", "pdf": "http://proceedings.mlr.press/v97/lee19b/lee19b.pdf", "supp": "", "pdf_size": 1267059, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8825263024474993124&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "MIT Computer Science and Artificial Intelligence Laboratory; MIT Computer Science and Artificial Intelligence Laboratory; MIT Computer Science and Artificial Intelligence Laboratory; MIT Computer Science and Artificial Intelligence Laboratory", "aff_domain": "csail.mit.edu; ; ; ", "email": "csail.mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/lee19b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.csail.mit.edu", "aff_unique_abbr": "MIT CSAIL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "GDPP: Learning Diverse Generations using Determinantal Point Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3621", "id": "3621", "author_site": "Mohamed Elfeki, Camille Couprie, Morgane Riviere, Mohamed Elhoseiny", "author": "Mohamed Elfeki; Camille Couprie; Morgane Riviere; Mohamed Elhoseiny", "abstract": "Generative models have proven to be an outstanding tool for representing high-dimensional probability distributions and generating realistic looking images. An essential characteristic of generative models is their ability to produce multi-modal outputs. However, while training, they are often susceptible to mode collapse, that is models are limited in mapping input noise to only a few modes of the true data distribution. In this work, we draw inspiration from Determinantal Point Process (DPP) to propose an unsupervised penalty loss that alleviates mode collapse while producing higher quality samples. DPP is an elegant probabilistic measure used to model negative correlations within a subset and hence quantify its diversity. We use DPP kernel to model the diversity in real data as well as in synthetic data. Then, we devise an objective term that encourages generator to synthesize data with a similar diversity to real data. In contrast to previous state-of-the-art generative models that tend to use additional trainable parameters or complex training paradigms, our method does not change the original training scheme. 
Embedded in an adversarial training and variational autoencoder, our Generative DPP approach shows a consistent resistance to mode-collapse on a wide-variety of synthetic data and natural image datasets including MNIST, CIFAR10, and CelebA, while outperforming state-of-the-art methods for data-efficiency, generation quality, and convergence-time whereas being 5.8x faster than its closest competitor.", "bibtex": "@InProceedings{pmlr-v97-elfeki19a,\n title = \t {{GDPP}: Learning Diverse Generations using Determinantal Point Processes},\n author = {Elfeki, Mohamed and Couprie, Camille and Riviere, Morgane and Elhoseiny, Mohamed},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1774--1783},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/elfeki19a/elfeki19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/elfeki19a.html},\n abstract = \t {Generative models have proven to be an outstanding tool for representing high-dimensional probability distributions and generating realistic looking images. An essential characteristic of generative models is their ability to produce multi-modal outputs. However, while training, they are often susceptible to mode collapse, that is models are limited in mapping input noise to only a few modes of the true data distribution. In this work, we draw inspiration from Determinantal Point Process (DPP) to propose an unsupervised penalty loss that alleviates mode collapse while producing higher quality samples. DPP is an elegant probabilistic measure used to model negative correlations within a subset and hence quantify its diversity. We use DPP kernel to model the diversity in real data as well as in synthetic data. Then, we devise an objective term that encourages generator to synthesize data with a similar diversity to real data. In contrast to previous state-of-the-art generative models that tend to use additional trainable parameters or complex training paradigms, our method does not change the original training scheme. Embedded in an adversarial training and variational autoencoder, our Generative DPP approach shows a consistent resistance to mode-collapse on a wide-variety of synthetic data and natural image datasets including MNIST, CIFAR10, and CelebA, while outperforming state-of-the-art methods for data-efficiency, generation quality, and convergence-time whereas being 5.8x faster than its closest competitor.}\n}", "pdf": "http://proceedings.mlr.press/v97/elfeki19a/elfeki19a.pdf", "supp": "", "pdf_size": 1167726, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6076531250010322466&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "https://github.com/M-Elfeki/GDPP", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/elfeki19a.html" }, { "title": "GEOMetrics: Exploiting Geometric Structure for Graph-Encoded Objects", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3656", "id": "3656", "author_site": "Edward Smith, Scott Fujimoto, Adriana Romero Soriano, David Meger", "author": "Edward Smith; Scott Fujimoto; Adriana Romero; David Meger", "abstract": "Mesh models are a promising approach for encoding the structure of 3D objects. 
Current mesh reconstruction systems predict uniformly distributed vertex locations of a predetermined graph through a series of graph convolutions, leading to compromises with respect to performance or resolution. In this paper, we argue that the graph representation of geometric objects allows for additional structure, which should be leveraged for enhanced reconstruction. Thus, we propose a system which properly benefits from the advantages of the geometric structure of graph-encoded objects by introducing (1) a graph convolutional update preserving vertex information; (2) an adaptive splitting heuristic allowing detail to emerge; and (3) a training objective operating both on the local surfaces defined by vertices as well as the global structure defined by the mesh. Our proposed method is evaluated on the task of 3D object reconstruction from images with the ShapeNet dataset, where we demonstrate state of the art performance, both visually and numerically, while having far smaller space requirements by generating adaptive meshes.", "bibtex": "@InProceedings{pmlr-v97-smith19a,\n title = \t {{GEOM}etrics: Exploiting Geometric Structure for Graph-Encoded Objects},\n author = {Smith, Edward and Fujimoto, Scott and Romero, Adriana and Meger, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5866--5876},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/smith19a/smith19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/smith19a.html},\n abstract = \t {Mesh models are a promising approach for encoding the structure of 3D objects. Current mesh reconstruction systems predict uniformly distributed vertex locations of a predetermined graph through a series of graph convolutions, leading to compromises with respect to performance or resolution. In this paper, we argue that the graph representation of geometric objects allows for additional structure, which should be leveraged for enhanced reconstruction. Thus, we propose a system which properly benefits from the advantages of the geometric structure of graph-encoded objects by introducing (1) a graph convolutional update preserving vertex information; (2) an adaptive splitting heuristic allowing detail to emerge; and (3) a training objective operating both on the local surfaces defined by vertices as well as the global structure defined by the mesh. 
Our proposed method is evaluated on the task of 3D object reconstruction from images with the ShapeNet dataset, where we demonstrate state of the art performance, both visually and numerically, while having far smaller space requirements by generating adaptive meshes.}\n}", "pdf": "http://proceedings.mlr.press/v97/smith19a/smith19a.pdf", "supp": "", "pdf_size": 7567783, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15300382945837912303&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, McGill University; Department of Computer Science, McGill University + Mila Qu\u00e9bec AI Institute; Facebook AI Research; Department of Computer Science, McGill University", "aff_domain": "mail.mcgill.ca; ; ; ", "email": "mail.mcgill.ca; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/smith19a.html", "aff_unique_index": "0;0+1;2;0", "aff_unique_norm": "McGill University;Mila Qu\u00e9bec AI Institute;Meta", "aff_unique_dep": "Department of Computer Science;AI Institute;Facebook AI Research", "aff_unique_url": "https://www.mcgill.ca;https://mila.quebec;https://research.facebook.com", "aff_unique_abbr": "McGill;Mila;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+0;1;0", "aff_country_unique": "Canada;United States" }, { "title": "GMNN: Graph Markov Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4121", "id": "4121", "author_site": "Meng Qu, Yoshua Bengio, Jian Tang", "author": "Meng Qu; Yoshua Bengio; Jian Tang", "abstract": "This paper studies semi-supervised object classification in relational data, which is a fundamental problem in relational data modeling. The problem has been extensively studied in the literature of both statistical relational learning (e.g. relational Markov networks) and graph neural networks (e.g. graph convolutional networks). Statistical relational learning methods can effectively model the dependency of object labels through conditional random fields for collective classification, whereas graph neural networks learn effective object representations for classification through end-to-end training. In this paper, we propose the Graph Markov Neural Network (GMNN) that combines the advantages of both worlds. A GMNN models the joint distribution of object labels with a conditional random field, which can be effectively trained with the variational EM algorithm. In the E-step, one graph neural network learns effective object representations for approximating the posterior distributions of object labels. In the M-step, another graph neural network is used to model the local label dependency. 
Experiments on object classification, link classification, and unsupervised node representation learning show that GMNN achieves state-of-the-art results.", "bibtex": "@InProceedings{pmlr-v97-qu19a,\n title = \t {{GMNN}: Graph {M}arkov Neural Networks},\n author = {Qu, Meng and Bengio, Yoshua and Tang, Jian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5241--5250},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/qu19a/qu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/qu19a.html},\n abstract = \t {This paper studies semi-supervised object classification in relational data, which is a fundamental problem in relational data modeling. The problem has been extensively studied in the literature of both statistical relational learning (e.g. relational Markov networks) and graph neural networks (e.g. graph convolutional networks). Statistical relational learning methods can effectively model the dependency of object labels through conditional random fields for collective classification, whereas graph neural networks learn effective object representations for classification through end-to-end training. In this paper, we propose the Graph Markov Neural Network (GMNN) that combines the advantages of both worlds. A GMNN models the joint distribution of object labels with a conditional random field, which can be effectively trained with the variational EM algorithm. In the E-step, one graph neural network learns effective object representations for approximating the posterior distributions of object labels. In the M-step, another graph neural network is used to model the local label dependency. 
Experiments on object classification, link classification, and unsupervised node representation learning show that GMNN achieves state-of-the-art results.}\n}", "pdf": "http://proceedings.mlr.press/v97/qu19a/qu19a.pdf", "supp": "", "pdf_size": 1096277, "gs_citation": 350, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3245098325331892267&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Montr\u00e9al Institute for Learning Algorithms (MILA) + University of Montr\u00e9al + Canadian Institute for Advanced Research (CIFAR); Montr\u00e9al Institute for Learning Algorithms (MILA) + University of Montr\u00e9al + Canadian Institute for Advanced Research (CIFAR) + HEC Montr\u00e9al; Montr\u00e9al Institute for Learning Algorithms (MILA) + University of Montr\u00e9al + Canadian Institute for Advanced Research (CIFAR) + HEC Montr\u00e9al", "aff_domain": "umontreal.ca; ;hec.ca", "email": "umontreal.ca; ;hec.ca", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/qu19a.html", "aff_unique_index": "0+1+2;0+1+2+3;0+1+2+3", "aff_unique_norm": "Montr\u00e9al Institute for Learning Algorithms;University of Montreal;Canadian Institute for Advanced Research;HEC Montr\u00e9al", "aff_unique_dep": ";;;", "aff_unique_url": "https://mila.quebec;https://www.mcgill.ca;https://www.cifar.ca;https://www.hec.ca", "aff_unique_abbr": "MILA;U Montreal;CIFAR;HEC", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0+0;0+0+0+0", "aff_country_unique": "Canada" }, { "title": "GOODE: A Gaussian Off-The-Shelf Ordinary Differential Equation Solver", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4213", "id": "4213", "author_site": "David John, Vincent Heuveline, Michael Schober", "author": "David John; Vincent Heuveline; Michael Schober", "abstract": "There are two types of ordinary differential equations (ODEs): initial value problems (IVPs) and boundary value problems (BVPs). While many probabilistic numerical methods for the solution of IVPs have been presented to-date, there exists no efficient probabilistic general-purpose solver for nonlinear BVPs. Our method based on iterated Gaussian process (GP) regression returns a GP posterior over the solution of nonlinear ODEs, which provides a meaningful error estimation via its predictive posterior standard deviation. Our solver is fast (typically of quadratic convergence rate) and the theory of convergence can be transferred from prior non-probabilistic work. Our method performs on par with standard codes for an established benchmark of test problems.", "bibtex": "@InProceedings{pmlr-v97-john19a,\n title = \t {{GOODE}: A {G}aussian Off-The-Shelf Ordinary Differential Equation Solver},\n author = {John, David and Heuveline, Vincent and Schober, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3152--3162},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/john19a/john19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/john19a.html},\n abstract = \t {There are two types of ordinary differential equations (ODEs): initial value problems (IVPs) and boundary value problems (BVPs). 
While many probabilistic numerical methods for the solution of IVPs have been presented to-date, there exists no efficient probabilistic general-purpose solver for nonlinear BVPs. Our method based on iterated Gaussian process (GP) regression returns a GP posterior over the solution of nonlinear ODEs, which provides a meaningful error estimation via its predictive posterior standard deviation. Our solver is fast (typically of quadratic convergence rate) and the theory of convergence can be transferred from prior non-probabilistic work. Our method performs on par with standard codes for an established benchmark of test problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/john19a/john19a.pdf", "supp": "", "pdf_size": 460991, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15548404780039128496&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Corporate Research, Robert Bosch GmbH, Renningen, Germany+Engineering Mathematics and Computing Lab, Interdisciplinary Center for Scientific Computing, Heidelberg University, Germany; Engineering Mathematics and Computing Lab, Interdisciplinary Center for Scientific Computing, Heidelberg University, Germany; Bosch Center for Artificial Intelligence, Renningen, Germany", "aff_domain": "de.bosch.com; ; ", "email": "de.bosch.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/john19a.html", "aff_unique_index": "0+1;1;2", "aff_unique_norm": "Robert Bosch GmbH;Heidelberg University;Bosch Center for Artificial Intelligence", "aff_unique_dep": "Corporate Research;Engineering Mathematics and Computing Lab, Interdisciplinary Center for Scientific Computing;Artificial Intelligence", "aff_unique_url": "https://www.bosch.com;https://www.uni-heidelberg.de;https://www.bosch-ai.com", "aff_unique_abbr": "Bosch;Uni Heidelberg;BCAI", "aff_campus_unique_index": "0+1;1;0", "aff_campus_unique": "Renningen;Heidelberg", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "Germany" }, { "title": "Gaining Free or Low-Cost Interpretability with Interpretable Partial Substitute", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3773", "id": "3773", "author": "Tong Wang", "abstract": "This work addresses the situation where a black-box model with good predictive performance is chosen over its interpretable competitors, and we show interpretability is still achievable in this case. Our solution is to find an interpretable substitute on a subset of data where the black-box model is", "bibtex": "@InProceedings{pmlr-v97-wang19a,\n title = \t {Gaining Free or Low-Cost Interpretability with Interpretable Partial Substitute},\n author = {Wang, Tong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6505--6514},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19a/wang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19a.html},\n abstract = \t {This work addresses the situation where a black-box model with good predictive performance is chosen over its interpretable competitors, and we show interpretability is still achievable in this case. 
Our solution is to find an interpretable substitute on a subset of data where the black-box model is", "pdf": "http://proceedings.mlr.press/v97/wang19a/wang19a.pdf", "supp": "", "pdf_size": 634121, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4874352534347125425&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Department of Business Analytics, University of Iowa, Iowa, USA", "aff_domain": "uiowa.edu", "email": "uiowa.edu", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/wang19a.html", "aff_unique_index": "0", "aff_unique_norm": "University of Iowa", "aff_unique_dep": "Department of Business Analytics", "aff_unique_url": "https://www.uiowa.edu", "aff_unique_abbr": "UIowa", "aff_campus_unique_index": "0", "aff_campus_unique": "Iowa", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Game Theoretic Optimization via Gradient-based Nikaido-Isoda Function", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4156", "id": "4156", "author_site": "Arvind Raghunathan, Anoop Cherian, Devesh Jha", "author": "Arvind Raghunathan; Anoop Cherian; Devesh Jha", "abstract": "Computing Nash equilibrium (NE) of multi-player games has witnessed renewed interest due to recent advances in generative adversarial networks. However, computing equilibrium efficiently is challenging. To this end, we introduce the Gradient-based Nikaido-Isoda (GNI) function which serves: (i) as a merit function, vanishing only at the first-order stationary points of each player\u2019s optimization problem, and (ii) provides error bounds to a stationary Nash point. Gradient descent is shown to converge sublinearly to a first-order stationary point of the GNI function. For the particular case of bilinear min-max games and multi-player quadratic games, the GNI function is convex. Hence, the application of gradient descent in this case yields linear convergence to an NE (when one exists). In our numerical experiments, we observe that the GNI formulation always converges to the first-order stationary point of each player\u2019s optimization problem.", "bibtex": "@InProceedings{pmlr-v97-raghunathan19a,\n title = \t {Game Theoretic Optimization via Gradient-based Nikaido-Isoda Function},\n author = {Raghunathan, Arvind and Cherian, Anoop and Jha, Devesh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5291--5300},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/raghunathan19a/raghunathan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/raghunathan19a.html},\n abstract = \t {Computing Nash equilibrium (NE) of multi-player games has witnessed renewed interest due to recent advances in generative adversarial networks. However, computing equilibrium efficiently is challenging. To this end, we introduce the Gradient-based Nikaido-Isoda (GNI) function which serves: (i) as a merit function, vanishing only at the first-order stationary points of each player\u2019s optimization problem, and (ii) provides error bounds to a stationary Nash point. Gradient descent is shown to converge sublinearly to a first-order stationary point of the GNI function. 
For the particular case of bilinear min-max games and multi-player quadratic games, the GNI function is convex. Hence, the application of gradient descent in this case yields linear convergence to an NE (when one exists). In our numerical experiments, we observe that the GNI formulation always converges to the first-order stationary point of each player\u2019s optimization problem.}\n}", "pdf": "http://proceedings.mlr.press/v97/raghunathan19a/raghunathan19a.pdf", "supp": "", "pdf_size": 1187448, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=454070071243419198&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Mitsubishi Electric Research Labs (MERL), Cambridge, MA; Mitsubishi Electric Research Labs (MERL), Cambridge, MA; Mitsubishi Electric Research Labs (MERL), Cambridge, MA", "aff_domain": "merl.com;merl.com;merl.com", "email": "merl.com;merl.com;merl.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/raghunathan19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Mitsubishi Electric Research Labs", "aff_unique_dep": "", "aff_unique_url": "https://www.merl.com", "aff_unique_abbr": "MERL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Garbage In, Reward Out: Bootstrapping Exploration in Multi-Armed Bandits", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3694", "id": "3694", "author_site": "Branislav Kveton, Csaba Szepesvari, Sharan Vaswani, Zheng Wen, Tor Lattimore, Mohammad Ghavamzadeh", "author": "Branislav Kveton; Csaba Szepesvari; Sharan Vaswani; Zheng Wen; Tor Lattimore; Mohammad Ghavamzadeh", "abstract": "We propose a bandit algorithm that explores by randomizing its history of rewards. Specifically, it pulls the arm with the highest mean reward in a non-parametric bootstrap sample of its history with pseudo rewards. We design the pseudo rewards such that the bootstrap mean is optimistic with a sufficiently high probability. We call our algorithm Giro, which stands for garbage in, reward out. We analyze Giro in a Bernoulli bandit and derive a $O(K \\Delta^{-1} \\log n)$ bound on its $n$-round regret, where $\\Delta$ is the difference in the expected rewards of the optimal and the best suboptimal arms, and $K$ is the number of arms. The main advantage of our exploration design is that it easily generalizes to structured problems. To show this, we propose contextual Giro with an arbitrary reward generalization model. 
We evaluate Giro and its contextual variant on multiple synthetic and real-world problems, and observe that it performs well.", "bibtex": "@InProceedings{pmlr-v97-kveton19a,\n title = \t {Garbage In, Reward Out: Bootstrapping Exploration in Multi-Armed Bandits},\n author = {Kveton, Branislav and Szepesvari, Csaba and Vaswani, Sharan and Wen, Zheng and Lattimore, Tor and Ghavamzadeh, Mohammad},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3601--3610},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kveton19a/kveton19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kveton19a.html},\n abstract = \t {We propose a bandit algorithm that explores by randomizing its history of rewards. Specifically, it pulls the arm with the highest mean reward in a non-parametric bootstrap sample of its history with pseudo rewards. We design the pseudo rewards such that the bootstrap mean is optimistic with a sufficiently high probability. We call our algorithm Giro, which stands for garbage in, reward out. We analyze Giro in a Bernoulli bandit and derive a $O(K \\Delta^{-1} \\log n)$ bound on its $n$-round regret, where $\\Delta$ is the difference in the expected rewards of the optimal and the best suboptimal arms, and $K$ is the number of arms. The main advantage of our exploration design is that it easily generalizes to structured problems. To show this, we propose contextual Giro with an arbitrary reward generalization model. We evaluate Giro and its contextual variant on multiple synthetic and real-world problems, and observe that it performs well.}\n}", "pdf": "http://proceedings.mlr.press/v97/kveton19a/kveton19a.pdf", "supp": "", "pdf_size": 818126, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10191962397719348875&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Google Research; DeepMind + University of Alberta; Mila, University of Montreal; Adobe Research; Facebook AI Research; DeepMind", "aff_domain": "google.com; ; ; ; ; ", "email": "google.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/kveton19a.html", "aff_unique_index": "0;1+2;3;4;5;1", "aff_unique_norm": "Google;DeepMind;University of Alberta;University of Montreal;Adobe;Meta", "aff_unique_dep": "Google Research;;;Mila;Adobe Research;Facebook AI Research", "aff_unique_url": "https://research.google;https://deepmind.com;https://www.ualberta.ca;https://www.mila.quebec;https://research.adobe.com;https://research.facebook.com", "aff_unique_abbr": "Google Research;DeepMind;UAlberta;Mila;Adobe;FAIR", "aff_campus_unique_index": "0;;2", "aff_campus_unique": "Mountain View;;Montreal", "aff_country_unique_index": "0;1+2;2;0;0;1", "aff_country_unique": "United States;United Kingdom;Canada" }, { "title": "Gauge Equivariant Convolutional Networks and the Icosahedral CNN", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3872", "id": "3872", "author_site": "Taco Cohen, Maurice Weiler, Berkay Kicanaoglu, Max Welling", "author": "Taco Cohen; Maurice Weiler; Berkay Kicanaoglu; Max Welling", "abstract": "The principle of equivariance to symmetry transformations enables a theoretically grounded approach to neural network architecture design. 
Equivariant networks have shown excellent performance and data efficiency on vision and medical imaging problems that exhibit symmetries. Here we show how this principle can be extended beyond global symmetries to local gauge transformations. This enables the development of a very general class of convolutional neural networks on manifolds that depend only on the intrinsic geometry, and which includes many popular methods from equivariant and geometric deep learning. We implement gauge equivariant CNNs for signals defined on the surface of the icosahedron, which provides a reasonable approximation of the sphere. By choosing to work with this very regular manifold, we are able to implement the gauge equivariant convolution using a single conv2d call, making it a highly scalable and practical alternative to Spherical CNNs. Using this method, we demonstrate substantial improvements over previous methods on the task of segmenting omnidirectional images and global climate patterns.", "bibtex": "@InProceedings{pmlr-v97-cohen19d,\n title = \t {Gauge Equivariant Convolutional Networks and the Icosahedral {CNN}},\n author = {Cohen, Taco and Weiler, Maurice and Kicanaoglu, Berkay and Welling, Max},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1321--1330},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cohen19d/cohen19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/cohen19d.html},\n abstract = \t {The principle of equivariance to symmetry transformations enables a theoretically grounded approach to neural network architecture design. Equivariant networks have shown excellent performance and data efficiency on vision and medical imaging problems that exhibit symmetries. Here we show how this principle can be extended beyond global symmetries to local gauge transformations. This enables the development of a very general class of convolutional neural networks on manifolds that depend only on the intrinsic geometry, and which includes many popular methods from equivariant and geometric deep learning. We implement gauge equivariant CNNs for signals defined on the surface of the icosahedron, which provides a reasonable approximation of the sphere. By choosing to work with this very regular manifold, we are able to implement the gauge equivariant convolution using a single conv2d call, making it a highly scalable and practical alternative to Spherical CNNs. 
Using this method, we demonstrate substantial improvements over previous methods on the task of segmenting omnidirectional images and global climate patterns.}\n}", "pdf": "http://proceedings.mlr.press/v97/cohen19d/cohen19d.pdf", "supp": "", "pdf_size": 1358728, "gs_citation": 512, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2560631989966493031&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Qualcomm AI Research, Amsterdam, NL; Qualcomm-University of Amsterdam (QUVA) Lab; Qualcomm-University of Amsterdam (QUVA) Lab; Qualcomm AI Research, Amsterdam, NL", "aff_domain": "gmail.com;uva.nl; ; ", "email": "gmail.com;uva.nl; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/cohen19d.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "Qualcomm AI Research;University of Amsterdam", "aff_unique_dep": "AI Research;Qualcomm-University of Amsterdam (QUVA) Lab", "aff_unique_url": "https://www.qualcomm.com/research;https://www.uva.nl", "aff_unique_abbr": "QAI;UvA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Amsterdam;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Netherlands" }, { "title": "Generalized Approximate Survey Propagation for High-Dimensional Estimation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4031", "id": "4031", "author_site": "Carlo Lucibello, Luca Saglietti, Yue Lu", "author": "Carlo Lucibello; Luca Saglietti; Yue Lu", "abstract": "In Generalized Linear Estimation (GLE) problems, we seek to estimate a signal that is observed through a linear transform followed by a component-wise, possibly nonlinear and noisy, channel. 
In the Bayesian optimal setting, Generalized Approximate Message Passing (GAMP) is known to achieve optimal performance for GLE. However, its performance can significantly deteriorate whenever there is a mismatch between the assumed and the true generative model, a situation frequently encountered in practice. In this paper, we propose a new algorithm, named Generalized Approximate Survey Propagation (GASP), for solving GLE in the presence of prior or model misspecifications. As a prototypical example, we consider the phase retrieval problem, where we show that GASP outperforms the corresponding GAMP, reducing the reconstruction threshold and, for certain choices of its parameters, approaching Bayesian optimal performance. Furthermore, we present a set of state evolution equations that can precisely characterize the performance of GASP in the high-dimensional limit.}\n}", "pdf": "http://proceedings.mlr.press/v97/lucibello19a/lucibello19a.pdf", "supp": "", "pdf_size": 2378918, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12124949741026936439&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "aff": "Microsoft Research New England, Cambridge, MA 02142, USA+Italian Institute for Genomic Medicine, Turin, Italy; John A. Paulson School of Engineering and Applied Sciences, Harvard University, Cambridge, MA 02138, USA; Bocconi Institute for Data Science and Analytics, Bocconi University, Milan, Italy", "aff_domain": "gmail.com; ;unibocconi.it", "email": "gmail.com; ;unibocconi.it", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/lucibello19a.html", "aff_unique_index": "0+1;2;3", "aff_unique_norm": "Microsoft;Italian Institute for Genomic Medicine;Harvard University;Bocconi University", "aff_unique_dep": "Microsoft Research New England;;John A. Paulson School of Engineering and Applied Sciences;Bocconi Institute for Data Science and Analytics", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/new-england;;https://www.harvard.edu;https://www.bocconi.edu", "aff_unique_abbr": "MSR NE;;Harvard;Bocconi", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Cambridge;;Milan", "aff_country_unique_index": "0+1;0;1", "aff_country_unique": "United States;Italy" }, { "title": "Generalized Linear Rule Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3933", "id": "3933", "author_site": "Dennis Wei, Sanjeeb Dash, Tian Gao, Oktay Gunluk", "author": "Dennis Wei; Sanjeeb Dash; Tian Gao; Oktay Gunluk", "abstract": "This paper considers generalized linear models using rule-based features, also referred to as rule ensembles, for regression and probabilistic classification. Rules facilitate model interpretation while also capturing nonlinear dependences and interactions. Our problem formulation accordingly trades off rule set complexity and prediction accuracy. Column generation is used to optimize over an exponentially large space of rules without pre-generating a large subset of candidates or greedily boosting rules one by one. The column generation subproblem is solved using either integer programming or a heuristic optimizing the same objective. In experiments involving logistic and linear regression, the proposed methods obtain better accuracy-complexity trade-offs than existing rule ensemble algorithms. 
At one end of the trade-off, the methods are competitive with less interpretable benchmark models.", "bibtex": "@InProceedings{pmlr-v97-wei19a,\n title = \t {Generalized Linear Rule Models},\n author = {Wei, Dennis and Dash, Sanjeeb and Gao, Tian and Gunluk, Oktay},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6687--6696},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wei19a/wei19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/wei19a.html},\n abstract = \t {This paper considers generalized linear models using rule-based features, also referred to as rule ensembles, for regression and probabilistic classification. Rules facilitate model interpretation while also capturing nonlinear dependences and interactions. Our problem formulation accordingly trades off rule set complexity and prediction accuracy. Column generation is used to optimize over an exponentially large space of rules without pre-generating a large subset of candidates or greedily boosting rules one by one. The column generation subproblem is solved using either integer programming or a heuristic optimizing the same objective. In experiments involving logistic and linear regression, the proposed methods obtain better accuracy-complexity trade-offs than existing rule ensemble algorithms. At one end of the trade-off, the methods are competitive with less interpretable benchmark models.}\n}", "pdf": "http://proceedings.mlr.press/v97/wei19a/wei19a.pdf", "supp": "", "pdf_size": 3004448, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10451117585817236159&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "IBM Research, Yorktown Heights, NY, USA; IBM Research, Yorktown Heights, NY, USA; IBM Research, Yorktown Heights, NY, USA; IBM Research, Yorktown Heights, NY, USA", "aff_domain": "us.ibm.com; ; ; ", "email": "us.ibm.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/wei19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "IBM", "aff_unique_dep": "IBM Research", "aff_unique_url": "https://www.ibm.com/research", "aff_unique_abbr": "IBM", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Yorktown Heights", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Generalized Majorization-Minimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3690", "id": "3690", "author_site": "Sobhan Naderi Parizi, Kun He, Reza Aghajani, Stan Sclaroff, Pedro Felzenszwalb", "author": "Sobhan Naderi Parizi; Kun He; Reza Aghajani; Stan Sclaroff; Pedro Felzenszwalb", "abstract": "Non-convex optimization is ubiquitous in machine learning. Majorization-Minimization (MM) is a powerful iterative procedure for optimizing non-convex functions that works by optimizing a sequence of bounds on the function. In MM, the bound at each iteration is required to touch the objective function at the optimizer of the previous bound. We show that this touching constraint is unnecessary and overly restrictive. We generalize MM by relaxing this constraint, and propose a new optimization framework, named Generalized Majorization-Minimization (G-MM), that is more flexible. 
For instance, G-MM can incorporate application-specific biases into the optimization procedure without changing the objective function. We derive G-MM algorithms for several latent variable models and show empirically that they consistently outperform their MM counterparts in optimizing non-convex objectives. In particular, G-MM algorithms appear to be less sensitive to initialization.", "bibtex": "@InProceedings{pmlr-v97-parizi19a,\n title = \t {Generalized Majorization-Minimization},\n author = {Parizi, Sobhan Naderi and He, Kun and Aghajani, Reza and Sclaroff, Stan and Felzenszwalb, Pedro},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5022--5031},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/parizi19a/parizi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/parizi19a.html},\n abstract = \t {Non-convex optimization is ubiquitous in machine learning. Majorization-Minimization (MM) is a powerful iterative procedure for optimizing non-convex functions that works by optimizing a sequence of bounds on the function. In MM, the bound at each iteration is required to touch the objective function at the optimizer of the previous bound. We show that this touching constraint is unnecessary and overly restrictive. We generalize MM by relaxing this constraint, and propose a new optimization framework, named Generalized Majorization-Minimization (G-MM), that is more flexible. For instance, G-MM can incorporate application-specific biases into the optimization procedure without changing the objective function. We derive G-MM algorithms for several latent variable models and show empirically that they consistently outperform their MM counterparts in optimizing non-convex objectives. In particular, G-MM algorithms appear to be less sensitive to initialization.}\n}", "pdf": "http://proceedings.mlr.press/v97/parizi19a/parizi19a.pdf", "supp": "", "pdf_size": 387641, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15878014999150378372&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Google Research; Facebook Reality Labs; University of California San Diego; Boston University; Brown University", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/parizi19a.html", "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "Google;Meta;University of California, San Diego;Boston University;Brown University", "aff_unique_dep": "Google Research;Facebook Reality Labs;;;", "aff_unique_url": "https://research.google;https://www.facebook.com/realitylabs;https://ucsd.edu;https://www.bu.edu;https://www.brown.edu", "aff_unique_abbr": "Google Research;FRL;UCSD;BU;Brown", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Mountain View;;San Diego", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Generalized No Free Lunch Theorem for Adversarial Robustness", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4225", "id": "4225", "author": "Elvis Dohmatob", "abstract": "This manuscript presents some new impossibility results on adversarial robustness in machine learning, a very important yet largely open problem. 
We show that if conditioned on a class label the data distribution satisfies the $W_2$ Talagrand transportation-cost inequality (for example, this condition is satisfied if the conditional distribution has density which is log-concave; or is the uniform measure on a compact Riemannian manifold with positive Ricci curvature), any classifier can be adversarially fooled with high probability once the perturbations are slightly greater than the natural noise level in the problem. We call this result The Strong \"No Free Lunch\" Theorem as some recent results (Tsipras et al. 2018, Fawzi et al. 2018, etc.) on the subject can be immediately recovered as very particular cases. Our theoretical bounds are demonstrated on both simulated and real data (MNIST). We conclude the manuscript with some speculation on possible future research directions.", "bibtex": "@InProceedings{pmlr-v97-dohmatob19a,\n title = \t {Generalized No Free Lunch Theorem for Adversarial Robustness},\n author = {Dohmatob, Elvis},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1646--1654},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/dohmatob19a/dohmatob19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/dohmatob19a.html},\n abstract = \t {This manuscript presents some new impossibility results on adversarial robustness in machine learning, a very important yet largely open problem. We show that if conditioned on a class label the data distribution satisfies the $W_2$ Talagrand transportation-cost inequality (for example, this condition is satisfied if the conditional distribution has density which is log-concave; or is the uniform measure on a compact Riemannian manifold with positive Ricci curvature), any classifier can be adversarially fooled with high probability once the perturbations are slightly greater than the natural noise level in the problem. We call this result The Strong \"No Free Lunch\" Theorem as some recent results (Tsipras et al. 2018, Fawzi et al. 2018, etc.) on the subject can be immediately recovered as very particular cases. Our theoretical bounds are demonstrated on both simulated and real data (MNIST). 
We conclude the manuscript with some speculation on possible future research directions.}\n}", "pdf": "http://proceedings.mlr.press/v97/dohmatob19a/dohmatob19a.pdf", "supp": "", "pdf_size": 432878, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3459617799171539317&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Criteo, Paris, France", "aff_domain": "criteo.com", "email": "criteo.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/dohmatob19a.html", "aff_unique_index": "0", "aff_unique_norm": "Criteo", "aff_unique_dep": "", "aff_unique_url": "https://www.criteo.com", "aff_unique_abbr": "Criteo", "aff_campus_unique_index": "0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0", "aff_country_unique": "France" }, { "title": "Generative Adversarial User Model for Reinforcement Learning Based Recommendation System", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4202", "id": "4202", "author_site": "Xinshi Chen, Shuang Li, Hui Li, Shaohua Jiang, Yuan Qi, Le Song", "author": "Xinshi Chen; Shuang Li; Hui Li; Shaohua Jiang; Yuan Qi; Le Song", "abstract": "There are great interests as well as many challenges in applying reinforcement learning (RL) to recommendation systems. In this setting, an online user is the environment; neither the reward function nor the environment dynamics are clearly defined, making the application of RL challenging. In this paper, we propose a novel model-based reinforcement learning framework for recommendation systems, where we develop a generative adversarial network to imitate user behavior dynamics and learn her reward function. Using this user model as the simulation environment, we develop a novel Cascading DQN algorithm to obtain a combinatorial recommendation policy which can handle a large number of candidate items efficiently. In our experiments with real data, we show this generative adversarial user model can better explain user behavior than alternatives, and the RL policy based on this model can lead to a better long-term reward for the user and higher click rate for the system.", "bibtex": "@InProceedings{pmlr-v97-chen19f,\n title = \t {Generative Adversarial User Model for Reinforcement Learning Based Recommendation System},\n author = {Chen, Xinshi and Li, Shuang and Li, Hui and Jiang, Shaohua and Qi, Yuan and Song, Le},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1052--1061},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19f/chen19f.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19f.html},\n abstract = \t {There are great interests as well as many challenges in applying reinforcement learning (RL) to recommendation systems. In this setting, an online user is the environment; neither the reward function nor the environment dynamics are clearly defined, making the application of RL challenging. In this paper, we propose a novel model-based reinforcement learning framework for recommendation systems, where we develop a generative adversarial network to imitate user behavior dynamics and learn her reward function. 
Using this user model as the simulation environment, we develop a novel Cascading DQN algorithm to obtain a combinatorial recommendation policy which can handle a large number of candidate items efficiently. In our experiments with real data, we show this generative adversarial user model can better explain user behavior than alternatives, and the RL policy based on this model can lead to a better long-term reward for the user and higher click rate for the system.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19f/chen19f.pdf", "supp": "", "pdf_size": 1532696, "gs_citation": 280, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18416272509453441398&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "School of Mathematics, Georgia Institute of Technology, Atlanta, Georgia, USA + Ant Financial, Hangzhou, China; School of Industrial and Systems Engineering, Georgia Institute of Technology, Atlanta, Georgia, USA; Ant Financial, Hangzhou, China; Ant Financial, Hangzhou, China; Ant Financial, Hangzhou, China; School of Computational Science and Engineering, Georgia Institute of Technology, Atlanta, Georgia, USA + Ant Financial, Hangzhou, China", "aff_domain": "gatech.edu; ; ; ; ; ", "email": "gatech.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/chen19f.html", "aff_unique_index": "0+1;0;1;1;1;0+1", "aff_unique_norm": "Georgia Institute of Technology;Ant Financial", "aff_unique_dep": "School of Mathematics;", "aff_unique_url": "https://www.gatech.edu;https://www.antgroup.com", "aff_unique_abbr": "Georgia Tech;Ant Financial", "aff_campus_unique_index": "0+1;0;1;1;1;0+1", "aff_campus_unique": "Atlanta;Hangzhou", "aff_country_unique_index": "0+1;0;1;1;1;0+1", "aff_country_unique": "United States;China" }, { "title": "Generative Modeling of Infinite Occluded Objects for Compositional Scene Representation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4032", "id": "4032", "author_site": "Jinyang Yuan, Bin Li, Xiangyang Xue", "author": "Jinyang Yuan; Bin Li; Xiangyang Xue", "abstract": "We present a deep generative model which explicitly models object occlusions for compositional scene representation. Latent representations of objects are disentangled into location, size, shape, and appearance, and the visual scene can be generated compositionally by integrating these representations and an infinite-dimensional binary vector indicating presences of objects in the scene. By training the model to learn spatial dependences of pixels in the unsupervised setting, the number of objects, pixel-level segregation of objects, and presences of objects in overlapping regions can be estimated through inference of latent variables. 
Extensive experiments conducted on a series of specially designed datasets demonstrate that the proposed method outperforms two state-of-the-art methods when object occlusions exist.", "bibtex": "@InProceedings{pmlr-v97-yuan19b,\n title = \t {Generative Modeling of Infinite Occluded Objects for Compositional Scene Representation},\n author = {Yuan, Jinyang and Li, Bin and Xue, Xiangyang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7222--7231},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yuan19b/yuan19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/yuan19b.html},\n abstract = \t {We present a deep generative model which explicitly models object occlusions for compositional scene representation. Latent representations of objects are disentangled into location, size, shape, and appearance, and the visual scene can be generated compositionally by integrating these representations and an infinite-dimensional binary vector indicating presences of objects in the scene. By training the model to learn spatial dependences of pixels in the unsupervised setting, the number of objects, pixel-level segregation of objects, and presences of objects in overlapping regions can be estimated through inference of latent variables. Extensive experiments conducted on a series of specially designed datasets demonstrate that the proposed method outperforms two state-of-the-art methods when object occlusions exist.}\n}", "pdf": "http://proceedings.mlr.press/v97/yuan19b/yuan19b.pdf", "supp": "", "pdf_size": 1086191, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14343968903734213192&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;", "aff_domain": ";;", "email": ";;", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/yuan19b.html" }, { "title": "Geometric Losses for Distributional Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4133", "id": "4133", "author_site": "Arthur Mensch, Mathieu Blondel, Gabriel Peyr\u00e9", "author": "Arthur Mensch; Mathieu Blondel; Gabriel Peyr\u00e9", "abstract": "Building upon recent advances in entropy-regularized optimal transport, and upon Fenchel duality between measures and continuous functions, we propose a generalization of the logistic loss that incorporates a metric or cost between classes. Unlike previous attempts to use optimal transport distances for learning, our loss results in unconstrained convex objective functions, supports infinite (or very large) class spaces, and naturally defines a geometric generalization of the softmax operator. The geometric properties of this loss make it suitable for predicting sparse and singular distributions, for instance supported on curves or hyper-surfaces. 
We study the theoretical properties of our loss and showcase its effectiveness on two applications: ordinal regression and drawing generation.", "bibtex": "@InProceedings{pmlr-v97-mensch19a,\n title = \t {Geometric Losses for Distributional Learning},\n author = {Mensch, Arthur and Blondel, Mathieu and Peyr{\\'e}, Gabriel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4516--4525},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mensch19a/mensch19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mensch19a.html},\n abstract = \t {Building upon recent advances in entropy-regularized optimal transport, and upon Fenchel duality between measures and continuous functions, we propose a generalization of the logistic loss that incorporates a metric or cost between classes. Unlike previous attempts to use optimal transport distances for learning, our loss results in unconstrained convex objective functions, supports infinite (or very large) class spaces, and naturally defines a geometric generalization of the softmax operator. The geometric properties of this loss make it suitable for predicting sparse and singular distributions, for instance supported on curves or hyper-surfaces. We study the theoretical properties of our loss and showcase its effectiveness on two applications: ordinal regression and drawing generation.}\n}", "pdf": "http://proceedings.mlr.press/v97/mensch19a/mensch19a.pdf", "supp": "", "pdf_size": 394663, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13194208145166918034&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "1\u00b4Ecole Normale Sup\u00e9rieure, DMA, Paris, France + 2CNRS, France; 3NTT Communication Science Laboratories, Kyoto, Japan; 1\u00b4Ecole Normale Sup\u00e9rieure, DMA, Paris, France + 2CNRS, France", "aff_domain": "m4x.org;mblondel.org;ens.fr", "email": "m4x.org;mblondel.org;ens.fr", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mensch19a.html", "aff_unique_index": "0+1;2;0+1", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure;Centre National de la Recherche Scientifique;NTT Communication Science Laboratories", "aff_unique_dep": "DMA;;", "aff_unique_url": "https://www.ens.fr;https://www.cnrs.fr;https://www.ntt-csl.com", "aff_unique_abbr": "ENS;CNRS;NTT CSL", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Paris;;Kyoto", "aff_country_unique_index": "0+0;1;0+0", "aff_country_unique": "France;Japan" }, { "title": "Geometric Scattering for Graph Data Analysis", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3844", "id": "3844", "author_site": "Feng Gao, Guy Wolf, Matthew Hirn", "author": "Feng Gao; Guy Wolf; Matthew Hirn", "abstract": "We explore the generalization of scattering transforms from traditional (e.g., image or audio) signals to graph data, analogous to the generalization of ConvNets in geometric deep learning, and the utility of extracted graph features in graph data analysis. 
In particular, we focus on the capacity of these features to retain informative variability and relations in the data (e.g., between individual graphs, or in aggregate), while relating our construction to previous theoretical results that establish the stability of similar transforms to families of graph deformations. We demonstrate the application of our geometric scattering features in graph classification of social network data, and in data exploration of biochemistry data.", "bibtex": "@InProceedings{pmlr-v97-gao19e,\n title = \t {Geometric Scattering for Graph Data Analysis},\n author = {Gao, Feng and Wolf, Guy and Hirn, Matthew},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2122--2131},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gao19e/gao19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/gao19e.html},\n abstract = \t {We explore the generalization of scattering transforms from traditional (e.g., image or audio) signals to graph data, analogous to the generalization of ConvNets in geometric deep learning, and the utility of extracted graph features in graph data analysis. In particular, we focus on the capacity of these features to retain informative variability and relations in the data (e.g., between individual graphs, or in aggregate), while relating our construction to previous theoretical results that establish the stability of similar transforms to families of graph deformations. We demonstrate the application of our geometric scattering features in graph classification of social network data, and in data exploration of biochemistry data.}\n}", "pdf": "http://proceedings.mlr.press/v97/gao19e/gao19e.pdf", "supp": "", "pdf_size": 1313341, "gs_citation": 151, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14220295300060665558&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computational Math., Science and Engineering, Michigan State University, East Lansing, MI, USA + Department of Plant, Soil & Microbial Sciences, Michigan State University, East Lansing, MI, USA; Department of Mathematics and Statistics, Universit \u00b4e de Montr \u00b4eal, Montreal, QC, Canada; Department of Mathematics, Michigan State University, East Lansing, MI, USA", "aff_domain": "msu.edu;umontreal.ca;msu.edu", "email": "msu.edu;umontreal.ca;msu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/gao19e.html", "aff_unique_index": "0+0;1;0", "aff_unique_norm": "Michigan State University;Universit\u00e9 de Montr\u00e9al", "aff_unique_dep": "Department of Computational Math., Science and Engineering;Department of Mathematics and Statistics", "aff_unique_url": "https://www.msu.edu;https://www.mcgill.ca", "aff_unique_abbr": "MSU;McGill", "aff_campus_unique_index": "0+0;1;0", "aff_campus_unique": "East Lansing;Montreal", "aff_country_unique_index": "0+0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Geometry Aware Convolutional Filters for Omnidirectional Images Representation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3612", "id": "3612", "author_site": "Renata Khasanova, Pascal Frossard", "author": "Renata Khasanova; Pascal Frossard", "abstract": "Due to their wide field of view, omnidirectional 
cameras are frequently used by autonomous vehicles, drones and robots for navigation and other computer vision tasks. The images captured by such cameras are often analyzed and classified with techniques designed for planar images that unfortunately fail to properly handle the native geometry of such images and therefore result in suboptimal performance. In this paper we aim at improving popular deep convolutional neural networks so that they can properly take into account the specific properties of omnidirectional data. In particular we propose an algorithm that adapts convolutional layers, which often serve as a core building block of a CNN, to the properties of omnidirectional images. Thus, our filters have a shape and size that adapt to the location on the omnidirectional image. We show that our method is not limited to spherical surfaces and is able to incorporate the knowledge about any kind of projective geometry inside the deep learning network. As depicted by our experiments, our method outperforms the existing deep neural network techniques for omnidirectional image classification and compression tasks.", "bibtex": "@InProceedings{pmlr-v97-khasanova19a,\n title = \t {Geometry Aware Convolutional Filters for Omnidirectional Images Representation},\n author = {Khasanova, Renata and Frossard, Pascal},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3351--3359},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/khasanova19a/khasanova19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/khasanova19a.html},\n abstract = \t {Due to their wide field of view, omnidirectional cameras are frequently used by autonomous vehicles, drones and robots for navigation and other computer vision tasks. The images captured by such cameras are often analyzed and classified with techniques designed for planar images that unfortunately fail to properly handle the native geometry of such images and therefore result in suboptimal performance. In this paper we aim at improving popular deep convolutional neural networks so that they can properly take into account the specific properties of omnidirectional data. In particular we propose an algorithm that adapts convolutional layers, which often serve as a core building block of a CNN, to the properties of omnidirectional images. Thus, our filters have a shape and size that adapt to the location on the omnidirectional image. We show that our method is not limited to spherical surfaces and is able to incorporate the knowledge about any kind of projective geometry inside the deep learning network. 
As depicted by our experiments, our method outperforms the existing deep neural network techniques for omnidirectional image classification and compression tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/khasanova19a/khasanova19a.pdf", "supp": "", "pdf_size": 2345052, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6219932625056515178&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical Engineering, EPFL, Lausanne, Switzerland; Department of Electrical Engineering, EPFL, Lausanne, Switzerland", "aff_domain": "epfl.ch; ", "email": "epfl.ch; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/khasanova19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Geometry and Symmetry in Short-and-Sparse Deconvolution", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4207", "id": "4207", "author_site": "Han-Wen Kuo, Yenson Lau, Yuqian Zhang, John Wright", "author": "Han-Wen Kuo; Yenson Lau; Yuqian Zhang; John Wright", "abstract": "We study the Short-and-Sparse (SaS) deconvolution problem of recovering a short signal a0 and a sparse signal x0 from their convolution. We propose a method based on nonconvex optimization, which under certain conditions recovers the target short and sparse signals, up to a signed shift symmetry which is intrinsic to this model. This symmetry plays a central role in shaping the optimization landscape for deconvolution. We give a regional analysis, which characterizes this landscape geometrically, on a union of subspaces. Our geometric characterization holds when the length-p0 short signal a0 has shift coherence {\\textmu}, and x0 follows a random sparsity model with sparsity rate $\\theta$ $\\in$ [c1/p0, c2/(p0\\sqrt{\\mu}+\\sqrt{p0})] / (log^2(p0)) . Based on this geometry, we give a provable method that successfully solves SaS deconvolution with high probability.", "bibtex": "@InProceedings{pmlr-v97-kuo19a,\n title = \t {Geometry and Symmetry in Short-and-Sparse Deconvolution},\n author = {Kuo, Han-Wen and Lau, Yenson and Zhang, Yuqian and Wright, John},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3570--3580},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kuo19a/kuo19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kuo19a.html},\n abstract = \t {We study the Short-and-Sparse (SaS) deconvolution problem of recovering a short signal a0 and a sparse signal x0 from their convolution. We propose a method based on nonconvex optimization, which under certain conditions recovers the target short and sparse signals, up to a signed shift symmetry which is intrinsic to this model. This symmetry plays a central role in shaping the optimization landscape for deconvolution. We give a regional analysis, which characterizes this landscape geometrically, on a union of subspaces. 
Our geometric characterization holds when the length-p0 short signal a0 has shift coherence {\\textmu}, and x0 follows a random sparsity model with sparsity rate $\\theta$ $\\in$ [c1/p0, c2/(p0\\sqrt{\\mu}+\\sqrt{p0})] / (log^2(p0)) . Based on this geometry, we give a provable method that successfully solves SaS deconvolution with high probability.}\n}", "pdf": "http://proceedings.mlr.press/v97/kuo19a/kuo19a.pdf", "supp": "", "pdf_size": 1312000, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9150977488508255788&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "aff": "Data Science Institute, Columbia University, New York City, NY, USA+Department of Electrical Engineering, Columbia University, New York City, NY, USA+Department of Applied Math and Applied Physics, Columbia University, New York City, NY, USA; Department of Computer Science, Cornell University, Ithaca, NY, USA; Data Science Institute, Columbia University, New York City, NY, USA+Department of Electrical Engineering, Columbia University, New York City, NY, USA+Department of Applied Math and Applied Physics, Columbia University, New York City, NY, USA; Data Science Institute, Columbia University, New York City, NY, USA+Department of Electrical Engineering, Columbia University, New York City, NY, USA+Department of Applied Math and Applied Physics, Columbia University, New York City, NY, USA", "aff_domain": "columbia.edu; ; ; ", "email": "columbia.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/kuo19a.html", "aff_unique_index": "0+0+0;1;0+0+0;0+0+0", "aff_unique_norm": "Columbia University;Cornell University", "aff_unique_dep": "Data Science Institute;Department of Computer Science", "aff_unique_url": "https://www.columbia.edu;https://www.cornell.edu", "aff_unique_abbr": "Columbia;Cornell", "aff_campus_unique_index": "0+0+0;1;0+0+0;0+0+0", "aff_campus_unique": "New York City;Ithaca", "aff_country_unique_index": "0+0+0;0;0+0+0;0+0+0", "aff_country_unique": "United States" }, { "title": "Global Convergence of Block Coordinate Descent in Deep Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3715", "id": "3715", "author_site": "Jinshan ZENG, Tim Tsz-Kit Lau, Shaobo Lin, Yuan Yao", "author": "Jinshan Zeng; Tim Tsz-Kit Lau; Shaobo Lin; Yuan Yao", "abstract": "Deep learning has aroused extensive attention due to its great empirical success. The efficiency of the block coordinate descent (BCD) methods has been recently demonstrated in deep neural network (DNN) training. However, theoretical studies on their convergence properties are limited due to the highly nonconvex nature of DNN training. In this paper, we aim at providing a general methodology for provable convergence guarantees for this type of methods. In particular, for most of the commonly used DNN training models involving both two- and three-splitting schemes, we establish the global convergence to a critical point at a rate of ${\\cal O}(1/k)$, where $k$ is the number of iterations. The results extend to general loss functions which have Lipschitz continuous gradients and deep residual networks (ResNets). 
Our key development adds several new elements to the Kurdyka-Lojasiewicz inequality framework that enables us to carry out the global convergence analysis of BCD in the general scenario of deep learning.", "bibtex": "@InProceedings{pmlr-v97-zeng19a,\n title = \t {Global Convergence of Block Coordinate Descent in Deep Learning},\n author = {Zeng, Jinshan and Lau, Tim Tsz-Kit and Lin, Shaobo and Yao, Yuan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7313--7323},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zeng19a/zeng19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zeng19a.html},\n abstract = \t {Deep learning has aroused extensive attention due to its great empirical success. The efficiency of the block coordinate descent (BCD) methods has been recently demonstrated in deep neural network (DNN) training. However, theoretical studies on their convergence properties are limited due to the highly nonconvex nature of DNN training. In this paper, we aim at providing a general methodology for provable convergence guarantees for this type of methods. In particular, for most of the commonly used DNN training models involving both two- and three-splitting schemes, we establish the global convergence to a critical point at a rate of ${\\cal O}(1/k)$, where $k$ is the number of iterations. The results extend to general loss functions which have Lipschitz continuous gradients and deep residual networks (ResNets). Our key development adds several new elements to the Kurdyka-Lojasiewicz inequality framework that enables us to carry out the global convergence analysis of BCD in the general scenario of deep learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/zeng19a/zeng19a.pdf", "supp": "", "pdf_size": 566406, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=417585534195879661&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": "School of Computer and Information Engineering, Jiangxi Normal University, Nanchang 330022, Jiangxi, China+Department of Mathematics, The Hong Kong University of Science and Technology, Clear Water Bay, Kowloon, Hong Kong; Department of Statistics, Northwestern University, Evanston, IL 60208, USA; Department of Mathematics, City University of Hong Kong, Kowloon, Hong Kong; Department of Mathematics, The Hong Kong University of Science and Technology, Clear Water Bay, Kowloon, Hong Kong", "aff_domain": "ust.hk; ; ;ust.hk", "email": "ust.hk; ; ;ust.hk", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/zeng19a.html", "aff_unique_index": "0+1;2;3;1", "aff_unique_norm": "Jiangxi Normal University;Hong Kong University of Science and Technology;Northwestern University;City University of Hong Kong", "aff_unique_dep": "School of Computer and Information Engineering;Department of Mathematics;Department of Statistics;Department of Mathematics", "aff_unique_url": ";https://www.ust.hk;https://www.northwestern.edu;https://www.cityu.edu.hk", "aff_unique_abbr": ";HKUST;NU;CityU", "aff_campus_unique_index": "0+1;2;1;1", "aff_campus_unique": "Nanchang;Hong Kong SAR;Evanston", "aff_country_unique_index": "0+0;1;0;0", "aff_country_unique": "China;United States" }, { "title": "Good Initializations of Variational Bayes for Deep Models", 
"status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3847", "id": "3847", "author_site": "Simone Rossi, Pietro Michiardi, Maurizio Filippone", "author": "Simone Rossi; Pietro Michiardi; Maurizio Filippone", "abstract": "Stochastic variational inference is an established way to carry out approximate Bayesian inference for deep models flexibly and at scale. While there have been effective proposals for good initializations for loss minimization in deep learning, far less attention has been devoted to the issue of initialization of stochastic variational inference. We address this by proposing a novel layer-wise initialization strategy based on Bayesian linear models. The proposed method is extensively validated on regression and classification tasks, including Bayesian Deep Nets and Conv Nets, showing faster and better convergence compared to alternatives inspired by the literature on initializations for loss minimization.", "bibtex": "@InProceedings{pmlr-v97-rossi19a,\n title = \t {Good Initializations of Variational {B}ayes for Deep Models},\n author = {Rossi, Simone and Michiardi, Pietro and Filippone, Maurizio},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5487--5497},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rossi19a/rossi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rossi19a.html},\n abstract = \t {Stochastic variational inference is an established way to carry out approximate Bayesian inference for deep models flexibly and at scale. While there have been effective proposals for good initializations for loss minimization in deep learning, far less attention has been devoted to the issue of initialization of stochastic variational inference. We address this by proposing a novel layer-wise initialization strategy based on Bayesian linear models. 
The proposed method is extensively validated on regression and classification tasks, including Bayesian Deep Nets and Conv Nets, showing faster and better convergence compared to alternatives inspired by the literature on initializations for loss minimization.}\n}", "pdf": "http://proceedings.mlr.press/v97/rossi19a/rossi19a.pdf", "supp": "", "pdf_size": 1502323, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10486038050045562123&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Department of Data Science, EURECOM, France; Department of Data Science, EURECOM, France; Department of Data Science, EURECOM, France", "aff_domain": "eurecom.fr; ; ", "email": "eurecom.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/rossi19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "EURECOM", "aff_unique_dep": "Department of Data Science", "aff_unique_url": "https://www.eurecom.fr", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Gradient Descent Finds Global Minima of Deep Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3786", "id": "3786", "author_site": "Simon Du, Jason Lee, Haochuan Li, Liwei Wang, Xiyu Zhai", "author": "Simon Du; Jason Lee; Haochuan Li; Liwei Wang; Xiyu Zhai", "abstract": "Gradient descent finds a global minimum in training deep neural networks despite the objective function being non-convex. The current paper proves gradient descent achieves zero training loss in polynomial time for a deep over-parameterized neural network with residual connections (ResNet). Our analysis relies on the particular structure of the Gram matrix induced by the neural network architecture. This structure allows us to show the Gram matrix is stable throughout the training process and this stability implies the global optimality of the gradient descent algorithm. We further extend our analysis to deep residual convolutional neural networks and obtain a similar convergence result.", "bibtex": "@InProceedings{pmlr-v97-du19c,\n title = \t {Gradient Descent Finds Global Minima of Deep Neural Networks},\n author = {Du, Simon and Lee, Jason and Li, Haochuan and Wang, Liwei and Zhai, Xiyu},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1675--1685},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/du19c/du19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/du19c.html},\n abstract = \t {Gradient descent finds a global minimum in training deep neural networks despite the objective function being non-convex. The current paper proves gradient descent achieves zero training loss in polynomial time for a deep over-parameterized neural network with residual connections (ResNet). Our analysis relies on the particular structure of the Gram matrix induced by the neural network architecture. This structure allows us to show the Gram matrix is stable throughout the training process and this stability implies the global optimality of the gradient descent algorithm. 
We further extend our analysis to deep residual convolutional neural networks and obtain a similar convergence result.}\n}", "pdf": "http://proceedings.mlr.press/v97/du19c/du19c.pdf", "supp": "", "pdf_size": 419208, "gs_citation": 1501, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14337035371775932423&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Machine Learning Department, Carnegie Mellon University; Data Science and Operations Department, University of Southern California; School of Physics, Peking University + Center for Data Science, Peking University, Beijing Institute of Big Data Research; Key Laboratory of Machine Perception, MOE, School of EECS, Peking University; Department of EECS, Massachusetts Institute of Technology", "aff_domain": "cs.cmu.edu; ; ; ; ", "email": "cs.cmu.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/du19c.html", "aff_unique_index": "0;1;2+2;2;3", "aff_unique_norm": "Carnegie Mellon University;University of Southern California;Peking University;Massachusetts Institute of Technology", "aff_unique_dep": "Machine Learning Department;Data Science and Operations Department;School of Physics;Department of Electrical Engineering and Computer Science", "aff_unique_url": "https://www.cmu.edu;https://www.usc.edu;http://www.pku.edu.cn;https://web.mit.edu", "aff_unique_abbr": "CMU;USC;PKU;MIT", "aff_campus_unique_index": "1;2+2;3", "aff_campus_unique": ";Los Angeles;Beijing;Cambridge", "aff_country_unique_index": "0;0;1+1;1;0", "aff_country_unique": "United States;China" }, { "title": "Graph Convolutional Gaussian Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4019", "id": "4019", "author_site": "Ian Walker, Ben Glocker", "author": "Ian Walker; Ben Glocker", "abstract": "We propose a novel Bayesian nonparametric method to learn translation-invariant relationships on non-Euclidean domains. The resulting graph convolutional Gaussian processes can be applied to problems in machine learning for which the input observations are functions with domains on general graphs. The structure of these models allows for high dimensional inputs while retaining expressibility, as is the case with convolutional neural networks. We present applications of graph convolutional Gaussian processes to images and triangular meshes, demonstrating their versatility and effectiveness, comparing favorably to existing methods, despite being relatively simple models.", "bibtex": "@InProceedings{pmlr-v97-walker19a,\n title = \t {Graph Convolutional {G}aussian Processes},\n author = {Walker, Ian and Glocker, Ben},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6495--6504},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/walker19a/walker19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/walker19a.html},\n abstract = \t {We propose a novel Bayesian nonparametric method to learn translation-invariant relationships on non-Euclidean domains. The resulting graph convolutional Gaussian processes can be applied to problems in machine learning for which the input observations are functions with domains on general graphs. 
The structure of these models allows for high dimensional inputs while retaining expressibility, as is the case with convolutional neural networks. We present applications of graph convolutional Gaussian processes to images and triangular meshes, demonstrating their versatility and effectiveness, comparing favorably to existing methods, despite being relatively simple models.}\n}", "pdf": "http://proceedings.mlr.press/v97/walker19a/walker19a.pdf", "supp": "", "pdf_size": 1691843, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16979380734609382680&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computing, Imperial College London, United Kingdom; Department of Computing, Imperial College London, United Kingdom", "aff_domain": "imperial.ac.uk; ", "email": "imperial.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/walker19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "Department of Computing", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "Imperial", "aff_campus_unique_index": "0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Graph Element Networks: adaptive, structured computation and memory", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3979", "id": "3979", "author_site": "Ferran Alet, Adarsh Keshav Jeewajee, Maria Bauza Villalonga, Alberto Rodriguez, Tomas Lozano-Perez, Leslie Kaelbling", "author": "Ferran Alet; Adarsh Keshav Jeewajee; Maria Bauza Villalonga; Alberto Rodriguez; Tomas Lozano-Perez; Leslie Kaelbling", "abstract": "We explore the use of graph neural networks (GNNs) to model spatial processes in which there is no a priori graphical structure. Similar to finite element analysis, we assign nodes of a GNN to spatial locations and use a computational process defined on the graph to model the relationship between an initial function defined over a space and a resulting function in the same space. We use GNNs as a computational substrate, and show that the locations of the nodes in space as well as their connectivity can be optimized to focus on the most complex parts of the space. Moreover, this representational strategy allows the learned input-output relationship to generalize over the size of the underlying space and run the same model at different levels of precision, trading computation for accuracy. 
We demonstrate this method on a traditional PDE problem, a physical prediction problem from robotics, and learning to predict scene images from novel viewpoints.", "bibtex": "@InProceedings{pmlr-v97-alet19a,\n title = \t {Graph Element Networks: adaptive, structured computation and memory},\n author = {Alet, Ferran and Jeewajee, Adarsh Keshav and Villalonga, Maria Bauza and Rodriguez, Alberto and Lozano-Perez, Tomas and Kaelbling, Leslie},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {212--222},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/alet19a/alet19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/alet19a.html},\n abstract = \t {We explore the use of graph neural networks (GNNs) to model spatial processes in which there is no a priori graphical structure. Similar to finite element analysis, we assign nodes of a GNN to spatial locations and use a computational process defined on the graph to model the relationship between an initial function defined over a space and a resulting function in the same space. We use GNNs as a computational substrate, and show that the locations of the nodes in space as well as their connectivity can be optimized to focus on the most complex parts of the space. Moreover, this representational strategy allows the learned input-output relationship to generalize over the size of the underlying space and run the same model at different levels of precision, trading computation for accuracy. We demonstrate this method on a traditional PDE problem, a physical prediction problem from robotics, and learning to predict scene images from novel viewpoints.}\n}", "pdf": "http://proceedings.mlr.press/v97/alet19a/alet19a.pdf", "supp": "", "pdf_size": 3543618, "gs_citation": 121, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15635052566391015915&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "CSAIL - MIT, Cambridge, MA, USA; CSAIL - MIT, Cambridge, MA, USA; Mechanical Engineering - MIT, Cambridge, MA, USA; Mechanical Engineering - MIT, Cambridge, MA, USA; CSAIL - MIT, Cambridge, MA, USA; CSAIL - MIT, Cambridge, MA, USA", "aff_domain": "mit.edu; ; ; ; ; ", "email": "mit.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/alet19a.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.csail.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Graph Matching Networks for Learning the Similarity of Graph Structured Objects", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4018", "id": "4018", "author_site": "Yujia Li, Chenjie Gu, Thomas Dullien, Oriol Vinyals, Pushmeet Kohli", "author": "Yujia Li; Chenjie Gu; Thomas Dullien; Oriol Vinyals; Pushmeet Kohli", "abstract": "This paper addresses the challenging problem of retrieval and matching of graph structured objects, and makes two key contributions. 
First, we demonstrate how Graph Neural Networks (GNN), which have emerged as an effective model for various supervised prediction problems defined on structured data, can be trained to produce embedding of graphs in vector spaces that enables efficient similarity reasoning. Second, we propose a novel Graph Matching Network model that, given a pair of graphs as input, computes a similarity score between them by jointly reasoning on the pair through a new cross-graph attention-based matching mechanism. We demonstrate the effectiveness of our models on different domains including the challenging problem of control-flow graph based function similarity search that plays an important role in the detection of vulnerabilities in software systems. The experimental analysis demonstrates that our models are not only able to exploit structure in the context of similarity learning but they can also outperform domain specific baseline systems that have been carefully hand-engineered for these problems.", "bibtex": "@InProceedings{pmlr-v97-li19d,\n title = \t {Graph Matching Networks for Learning the Similarity of Graph Structured Objects},\n author = {Li, Yujia and Gu, Chenjie and Dullien, Thomas and Vinyals, Oriol and Kohli, Pushmeet},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3835--3845},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19d/li19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19d.html},\n abstract = \t {This paper addresses the challenging problem of retrieval and matching of graph structured objects, and makes two key contributions. First, we demonstrate how Graph Neural Networks (GNN), which have emerged as an effective model for various supervised prediction problems defined on structured data, can be trained to produce embedding of graphs in vector spaces that enables efficient similarity reasoning. Second, we propose a novel Graph Matching Network model that, given a pair of graphs as input, computes a similarity score between them by jointly reasoning on the pair through a new cross-graph attention-based matching mechanism. We demonstrate the effectiveness of our models on different domains including the challenging problem of control-flow graph based function similarity search that plays an important role in the detection of vulnerabilities in software systems. 
The experimental analysis demonstrates that our models are not only able to exploit structure in the context of similarity learning but they can also outperform domain specific baseline systems that have been carefully hand-engineered for these problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19d/li19d.pdf", "supp": "", "pdf_size": 923510, "gs_citation": 779, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13607991845696425216&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "DeepMind; DeepMind; Google; DeepMind; DeepMind", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/li19d.html", "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "DeepMind;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://deepmind.com;https://www.google.com", "aff_unique_abbr": "DeepMind;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Graph Neural Network for Music Score Data and Modeling Expressive Piano Performance", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4004", "id": "4004", "author_site": "Dasaem Jeong, Taegyun Kwon, Yoojin Kim, Juhan Nam", "author": "Dasaem Jeong; Taegyun Kwon; Yoojin Kim; Juhan Nam", "abstract": "Music score is often handled as one-dimensional sequential data. Unlike words in a text document, notes in music score can be played simultaneously by the polyphonic nature and each of them has its own duration. In this paper, we represent the unique form of musical score using graph neural network and apply it for rendering expressive piano performance from the music score. Specifically, we design the model using note-level gated graph neural network and measure-level hierarchical attention network with bidirectional long short-term memory with an iterative feedback method. In addition, to model different styles of performance for a given input score, we employ a variational auto-encoder. The result of the listening test shows that our proposed model generated more human-like performances compared to a baseline model and a hierarchical attention network model that handles music score as a word-like sequence.", "bibtex": "@InProceedings{pmlr-v97-jeong19a,\n title = \t {Graph Neural Network for Music Score Data and Modeling Expressive Piano Performance},\n author = {Jeong, Dasaem and Kwon, Taegyun and Kim, Yoojin and Nam, Juhan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3060--3070},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jeong19a/jeong19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jeong19a.html},\n abstract = \t {Music score is often handled as one-dimensional sequential data. Unlike words in a text document, notes in music score can be played simultaneously by the polyphonic nature and each of them has its own duration. In this paper, we represent the unique form of musical score using graph neural network and apply it for rendering expressive piano performance from the music score. 
Specifically, we design the model using note-level gated graph neural network and measure-level hierarchical attention network with bidirectional long short-term memory with an iterative feedback method. In addition, to model different styles of performance for a given input score, we employ a variational auto-encoder. The result of the listening test shows that our proposed model generated more human-like performances compared to a baseline model and a hierarchical attention network model that handles music score as a word-like sequence.}\n}", "pdf": "http://proceedings.mlr.press/v97/jeong19a/jeong19a.pdf", "supp": "", "pdf_size": 675477, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9986010523028767752&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Graduate School of Culture Technology, Korea Advanced Institute of Science and Technology (KAIST), Dajeon, South Korea; Graduate School of Culture Technology, Korea Advanced Institute of Science and Technology (KAIST), Dajeon, South Korea; Graduate School of Culture Technology, Korea Advanced Institute of Science and Technology (KAIST), Dajeon, South Korea; Graduate School of Culture Technology, Korea Advanced Institute of Science and Technology (KAIST), Dajeon, South Korea", "aff_domain": "kaist.ac.kr; ; ;kaist.ac.kr", "email": "kaist.ac.kr; ; ;kaist.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/jeong19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "Graduate School of Culture Technology", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Dajeon", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Graph Resistance and Learning from Pairwise Comparisons", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4120", "id": "4120", "author_site": "Julien Hendrickx, Alex Olshevsky, Venkatesh Saligrama", "author": "Julien Hendrickx; Alexander Olshevsky; Venkatesh Saligrama", "abstract": "We consider the problem of learning the qualities of a collection of items by performing noisy comparisons among them. Following the standard paradigm, we assume there is a fixed \u201ccomparison graph\u201d and every neighboring pair of items in this graph is compared k times according to the Bradley-Terry-Luce model (where the probability than an item wins a comparison is proportional the item quality). We are interested in how the relative error in quality estimation scales with the comparison graph in the regime where k is large. We show that, asymptotically, the relevant graph-theoretic quantity is the square root of the resistance of the comparison graph. Specifically, we provide an algorithm with relative error decay that scales with the square root of the graph resistance, and provide a lower bound showing that (up to log factors) a better scaling is impossible. 
The performance guarantee of our algorithm, both in terms of the graph and the skewness of the item quality distribution, significantly outperforms earlier results.", "bibtex": "@InProceedings{pmlr-v97-hendrickx19a,\n title = \t {Graph Resistance and Learning from Pairwise Comparisons},\n author = {Hendrickx, Julien and Olshevsky, Alexander and Saligrama, Venkatesh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2702--2711},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hendrickx19a/hendrickx19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hendrickx19a.html},\n abstract = \t {We consider the problem of learning the qualities of a collection of items by performing noisy comparisons among them. Following the standard paradigm, we assume there is a fixed \u201ccomparison graph\u201d and every neighboring pair of items in this graph is compared k times according to the Bradley-Terry-Luce model (where the probability that an item wins a comparison is proportional to the item quality). We are interested in how the relative error in quality estimation scales with the comparison graph in the regime where k is large. We show that, asymptotically, the relevant graph-theoretic quantity is the square root of the resistance of the comparison graph. Specifically, we provide an algorithm with relative error decay that scales with the square root of the graph resistance, and provide a lower bound showing that (up to log factors) a better scaling is impossible. The performance guarantee of our algorithm, both in terms of the graph and the skewness of the item quality distribution, significantly outperforms earlier results.}\n}", "pdf": "http://proceedings.mlr.press/v97/hendrickx19a/hendrickx19a.pdf", "supp": "", "pdf_size": 504658, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8892088326040780783&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Mathematical Engineering, ICTEAM, UCLouvain, Belgium+Department of Electrical and Computer Engineering, Boston University, USA; Department of Electrical and Computer Engineering, Boston University, USA; Department of Electrical and Computer Engineering, Boston University, USA", "aff_domain": "uclouvain.be;bu.edu;bu.edu", "email": "uclouvain.be;bu.edu;bu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/hendrickx19a.html", "aff_unique_index": "0+1;1;1", "aff_unique_norm": "UCLouvain;Boston University", "aff_unique_dep": "Department of Mathematical Engineering;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.uclouvain.be;https://www.bu.edu", "aff_unique_abbr": ";BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1", "aff_country_unique": "Belgium;United States" }, { "title": "Graph U-Nets", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3676", "id": "3676", "author_site": "Hongyang Gao, Shuiwang Ji", "author": "Hongyang Gao; Shuiwang Ji", "abstract": "We consider the problem of representation learning for graph data. Convolutional neural networks can naturally operate on images, but have significant challenges in dealing with graph data. 
Given images are special cases of graphs with nodes lie on 2D lattices, graph embedding tasks have a natural correspondence with image pixel-wise prediction tasks such as segmentation. While encoder-decoder architectures like U-Nets have been successfully applied on many image pixel-wise prediction tasks, similar methods are lacking for graph data. This is due to the fact that pooling and up-sampling operations are not natural on graph data. To address these challenges, we propose novel graph pooling (gPool) and unpooling (gUnpool) operations in this work. The gPool layer adaptively selects some nodes to form a smaller graph based on their scalar projection values on a trainable projection vector. We further propose the gUnpool layer as the inverse operation of the gPool layer. The gUnpool layer restores the graph into its original structure using the position information of nodes selected in the corresponding gPool layer. Based on our proposed gPool and gUnpool layers, we develop an encoder-decoder model on graph, known as the graph U-Nets. Our experimental results on node classification and graph classification tasks demonstrate that our methods achieve consistently better performance than previous models.", "bibtex": "@InProceedings{pmlr-v97-gao19a,\n title = \t {Graph U-Nets},\n author = {Gao, Hongyang and Ji, Shuiwang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2083--2092},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gao19a/gao19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gao19a.html},\n abstract = \t {We consider the problem of representation learning for graph data. Convolutional neural networks can naturally operate on images, but have significant challenges in dealing with graph data. Given images are special cases of graphs with nodes lie on 2D lattices, graph embedding tasks have a natural correspondence with image pixel-wise prediction tasks such as segmentation. While encoder-decoder architectures like U-Nets have been successfully applied on many image pixel-wise prediction tasks, similar methods are lacking for graph data. This is due to the fact that pooling and up-sampling operations are not natural on graph data. To address these challenges, we propose novel graph pooling (gPool) and unpooling (gUnpool) operations in this work. The gPool layer adaptively selects some nodes to form a smaller graph based on their scalar projection values on a trainable projection vector. We further propose the gUnpool layer as the inverse operation of the gPool layer. The gUnpool layer restores the graph into its original structure using the position information of nodes selected in the corresponding gPool layer. Based on our proposed gPool and gUnpool layers, we develop an encoder-decoder model on graph, known as the graph U-Nets. 
Our experimental results on node classification and graph classification tasks demonstrate that our methods achieve consistently better performance than previous models.}\n}", "pdf": "http://proceedings.mlr.press/v97/gao19a/gao19a.pdf", "supp": "", "pdf_size": 375680, "gs_citation": 1567, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2250116536319373587&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 18, "aff": "Department of Computer Science & Engineering, Texas A&M University, TX, USA; Department of Computer Science & Engineering, Texas A&M University, TX, USA", "aff_domain": "tamu.edu;tamu.edu", "email": "tamu.edu;tamu.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/gao19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "Department of Computer Science & Engineering", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "TX", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Graphical-model based estimation and inference for differential privacy", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3639", "id": "3639", "author_site": "Ryan McKenna, Daniel Sheldon, Gerome Miklau", "author": "Ryan Mckenna; Daniel Sheldon; Gerome Miklau", "abstract": "Many privacy mechanisms reveal high-level information about a data distribution through noisy measurements. It is common to use this information to estimate the answers to new queries. In this work, we provide an approach to solve this estimation problem efficiently using graphical models, which is particularly effective when the distribution is high-dimensional but the measurements are over low-dimensional marginals. We show that our approach is far more efficient than existing estimation techniques from the privacy literature and that it can improve the accuracy and scalability of many state-of-the-art mechanisms.", "bibtex": "@InProceedings{pmlr-v97-mckenna19a,\n title = \t {Graphical-model based estimation and inference for differential privacy},\n author = {Mckenna, Ryan and Sheldon, Daniel and Miklau, Gerome},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4435--4444},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mckenna19a/mckenna19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mckenna19a.html},\n abstract = \t {Many privacy mechanisms reveal high-level information about a data distribution through noisy measurements. It is common to use this information to estimate the answers to new queries. In this work, we provide an approach to solve this estimation problem efficiently using graphical models, which is particularly effective when the distribution is high-dimensional but the measurements are over low-dimensional marginals. 
We show that our approach is far more efficient than existing estimation techniques from the privacy literature and that it can improve the accuracy and scalability of many state-of-the-art mechanisms.}\n}", "pdf": "http://proceedings.mlr.press/v97/mckenna19a/mckenna19a.pdf", "supp": "", "pdf_size": 484711, "gs_citation": 192, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8177848662572356462&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "aff": "University of Massachusetts, Amherst; University of Massachusetts, Amherst + Mount Holyoke College; University of Massachusetts, Amherst", "aff_domain": "cs.umass.edu; ; ", "email": "cs.umass.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mckenna19a.html", "aff_unique_index": "0;0+1;0", "aff_unique_norm": "University of Massachusetts Amherst;Mount Holyoke College", "aff_unique_dep": ";", "aff_unique_url": "https://www.umass.edu;https://www.mtholyoke.edu", "aff_unique_abbr": "UMass Amherst;MHC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;0+0;0", "aff_country_unique": "United States" }, { "title": "Graphite: Iterative Generative Modeling of Graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4008", "id": "4008", "author_site": "Aditya Grover, Aaron Zweig, Stefano Ermon", "author": "Aditya Grover; Aaron Zweig; Stefano Ermon", "abstract": "Graphs are a fundamental abstraction for modeling relational data. However, graphs are discrete and combinatorial in nature, and learning representations suitable for machine learning tasks poses statistical and computational challenges. In this work, we propose Graphite, an algorithmic framework for unsupervised learning of representations over nodes in large graphs using deep latent variable generative models. Our model parameterizes variational autoencoders (VAE) with graph neural networks, and uses a novel iterative graph refinement strategy inspired by low-rank approximations for decoding. On a wide variety of synthetic and benchmark datasets, Graphite outperforms competing approaches for the tasks of density estimation, link prediction, and node classification. Finally, we derive a theoretical connection between message passing in graph neural networks and mean-field variational inference.", "bibtex": "@InProceedings{pmlr-v97-grover19a,\n title = \t {Graphite: Iterative Generative Modeling of Graphs},\n author = {Grover, Aditya and Zweig, Aaron and Ermon, Stefano},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2434--2444},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/grover19a/grover19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/grover19a.html},\n abstract = \t {Graphs are a fundamental abstraction for modeling relational data. However, graphs are discrete and combinatorial in nature, and learning representations suitable for machine learning tasks poses statistical and computational challenges. In this work, we propose Graphite, an algorithmic framework for unsupervised learning of representations over nodes in large graphs using deep latent variable generative models. 
Our model parameterizes variational autoencoders (VAE) with graph neural networks, and uses a novel iterative graph refinement strategy inspired by low-rank approximations for decoding. On a wide variety of synthetic and benchmark datasets, Graphite outperforms competing approaches for the tasks of density estimation, link prediction, and node classification. Finally, we derive a theoretical connection between message passing in graph neural networks and mean-field variational inference.}\n}", "pdf": "http://proceedings.mlr.press/v97/grover19a/grover19a.pdf", "supp": "", "pdf_size": 948989, "gs_citation": 367, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10899553557620073991&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "aff": "Department of Computer Science, Stanford University, USA; Department of Computer Science, Stanford University, USA; Department of Computer Science, Stanford University, USA", "aff_domain": "cs.stanford.edu; ; ", "email": "cs.stanford.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/grover19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Greedy Layerwise Learning Can Scale To ImageNet", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3588", "id": "3588", "author_site": "Eugene Belilovsky, Michael Eickenberg, Edouard Oyallon", "author": "Eugene Belilovsky; Michael Eickenberg; Edouard Oyallon", "abstract": "Shallow supervised 1-hidden layer neural networks have a number of favorable properties that make them easier to interpret, analyze, and optimize than their deep counterparts, but lack their representational power. Here we use 1-hidden layer learning problems to sequentially build deep networks layer by layer, which can inherit properties from shallow networks. Contrary to previous approaches using shallow networks, we focus on problems where deep learning is reported as critical for success. We thus study CNNs on image classification tasks using the large-scale ImageNet dataset and the CIFAR-10 dataset. Using a simple set of ideas for architecture and training we find that solving sequential 1-hidden-layer auxiliary problems lead to a CNN that exceeds AlexNet performance on ImageNet. Extending this training methodology to construct individual layers by solving 2-and-3-hidden layer auxiliary problems, we obtain an 11-layer network that exceeds several members of the VGG model family on ImageNet, and can train a VGG-11 model to the same accuracy as end-to-end learning. To our knowledge, this is the first competitive alternative to end-to-end training of CNNs that can scale to ImageNet. 
We illustrate several interesting properties of these models and conduct a range of experiments to study the properties this training induces on the intermediate layers.", "bibtex": "@InProceedings{pmlr-v97-belilovsky19a,\n title = \t {Greedy Layerwise Learning Can Scale To {I}mage{N}et},\n author = {Belilovsky, Eugene and Eickenberg, Michael and Oyallon, Edouard},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {583--593},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/belilovsky19a/belilovsky19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/belilovsky19a.html},\n abstract = \t {Shallow supervised 1-hidden layer neural networks have a number of favorable properties that make them easier to interpret, analyze, and optimize than their deep counterparts, but lack their representational power. Here we use 1-hidden layer learning problems to sequentially build deep networks layer by layer, which can inherit properties from shallow networks. Contrary to previous approaches using shallow networks, we focus on problems where deep learning is reported as critical for success. We thus study CNNs on image classification tasks using the large-scale ImageNet dataset and the CIFAR-10 dataset. Using a simple set of ideas for architecture and training we find that solving sequential 1-hidden-layer auxiliary problems lead to a CNN that exceeds AlexNet performance on ImageNet. Extending this training methodology to construct individual layers by solving 2-and-3-hidden layer auxiliary problems, we obtain an 11-layer network that exceeds several members of the VGG model family on ImageNet, and can train a VGG-11 model to the same accuracy as end-to-end learning. To our knowledge, this is the first competitive alternative to end-to-end training of CNNs that can scale to ImageNet. 
We illustrate several interesting properties of these models and conduct a range of experiments to study the properties this training induces on the intermediate layers.}\n}", "pdf": "http://proceedings.mlr.press/v97/belilovsky19a/belilovsky19a.pdf", "supp": "", "pdf_size": 576632, "gs_citation": 227, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17442726017389288785&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Mila, University of Montreal; University of California, Berkeley; CentraleSupelec, University of Paris-Saclay / INRIA Saclay", "aff_domain": "umontreal.ca; ; ", "email": "umontreal.ca; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/belilovsky19a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Montreal;University of California, Berkeley;CentraleSupelec", "aff_unique_dep": "Mila;;", "aff_unique_url": "https://www.mila.quebec;https://www.berkeley.edu;https://www.centralesupelec.fr", "aff_unique_abbr": "Mila;UC Berkeley;CS", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Montreal;Berkeley;University of Paris-Saclay", "aff_country_unique_index": "0;1;2", "aff_country_unique": "Canada;United States;France" }, { "title": "Greedy Orthogonal Pivoting Algorithm for Non-Negative Matrix Factorization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3663", "id": "3663", "author_site": "Kai Zhang, Sheng Zhang, Jun Liu, Jun Wang, Jie Zhang", "author": "Kai Zhang; Sheng Zhang; Jun Liu; Jun Wang; Jie Zhang", "abstract": "Non-negative matrix factorization is a powerful tool for learning useful representations in the data and has been widely applied in many problems such as data mining and signal processing. Orthogonal NMF, which can improve the locality of decomposition, has drawn considerable interest in solving clustering problems in recent years. However, imposing simultaneous non-negative and orthogonal structure can be quite difficult, and so existing algorithms can only solve it approximately. To address this challenge, we propose an innovative procedure called Greedy Orthogonal Pivoting Algorithm (GOPA). The GOPA algorithm fully exploits the sparsity of non-negative orthogonal solutions to break the global problem into a series of local optimizations, in which an adaptive subset of coordinates are updated in a greedy, closed-form manner. The biggest advantage of GOPA is that it promotes exact orthogonality and provides solid empirical evidence that stronger orthogonality does contribute favorably to better clustering performance. 
On the other hand, we further design randomized and parallel version of GOPA, which can further reduce the computational cost and improve accuracy, making it suitable for large data.", "bibtex": "@InProceedings{pmlr-v97-zhang19r,\n title = \t {Greedy Orthogonal Pivoting Algorithm for Non-Negative Matrix Factorization},\n author = {Zhang, Kai and Zhang, Sheng and Liu, Jun and Wang, Jun and Zhang, Jie},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7493--7501},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19r/zhang19r.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19r.html},\n abstract = \t {Non-negative matrix factorization is a powerful tool for learning useful representations in the data and has been widely applied in many problems such as data mining and signal processing. Orthogonal NMF, which can improve the locality of decomposition, has drawn considerable interest in solving clustering problems in recent years. However, imposing simultaneous non-negative and orthogonal structure can be quite difficult, and so existing algorithms can only solve it approximately. To address this challenge, we propose an innovative procedure called Greedy Orthogonal Pivoting Algorithm (GOPA). The GOPA algorithm fully exploits the sparsity of non-negative orthogonal solutions to break the global problem into a series of local optimizations, in which an adaptive subset of coordinates are updated in a greedy, closed-form manner. The biggest advantage of GOPA is that it promotes exact orthogonality and provides solid empirical evidence that stronger orthogonality does contribute favorably to better clustering performance. On the other hand, we further design randomized and parallel version of GOPA, which can further reduce the computational cost and improve accuracy, making it suitable for large data.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19r/zhang19r.pdf", "supp": "", "pdf_size": 454768, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2352134364221074279&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/zhang19r.html" }, { "title": "Grid-Wise Control for Multi-Agent Reinforcement Learning in Video Game AI", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3838", "id": "3838", "author_site": "Lei Han, Peng Sun, Yali Du, Jiechao Xiong, Qing Wang, Xinghai Sun, Han Liu, Tong Zhang", "author": "Lei Han; Peng Sun; Yali Du; Jiechao Xiong; Qing Wang; Xinghai Sun; Han Liu; Tong Zhang", "abstract": "We consider the problem of multi-agent reinforcement learning (MARL) in video game AI, where the agents are located in a spatial grid-world environment and the number of agents varies both within and across episodes. The challenge is to flexibly control an arbitrary number of agents while achieving effective collaboration. Existing MARL methods usually suffer from the trade-off between these two considerations. To address the issue, we propose a novel architecture that learns a spatial joint representation of all the agents and outputs grid-wise actions. 
Each agent will be controlled independently by taking the action from the grid it occupies. By viewing the state information as a grid feature map, we employ a convolutional encoder-decoder as the policy network. This architecture naturally promotes agent communication because of the large receptive field provided by the stacked convolutional layers. Moreover, the spatially shared convolutional parameters enable fast parallel exploration that the experiences discovered by one agent can be immediately transferred to others. The proposed method can be conveniently integrated with general reinforcement learning algorithms, e.g., PPO and Q-learning. We demonstrate the effectiveness of the proposed method in extensive challenging multi-agent tasks in StarCraft II.", "bibtex": "@InProceedings{pmlr-v97-han19a,\n title = \t {Grid-Wise Control for Multi-Agent Reinforcement Learning in Video Game {AI}},\n author = {Han, Lei and Sun, Peng and Du, Yali and Xiong, Jiechao and Wang, Qing and Sun, Xinghai and Liu, Han and Zhang, Tong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2576--2585},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/han19a/han19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/han19a.html},\n abstract = \t {We consider the problem of multi-agent reinforcement learning (MARL) in video game AI, where the agents are located in a spatial grid-world environment and the number of agents varies both within and across episodes. The challenge is to flexibly control an arbitrary number of agents while achieving effective collaboration. Existing MARL methods usually suffer from the trade-off between these two considerations. To address the issue, we propose a novel architecture that learns a spatial joint representation of all the agents and outputs grid-wise actions. Each agent will be controlled independently by taking the action from the grid it occupies. By viewing the state information as a grid feature map, we employ a convolutional encoder-decoder as the policy network. This architecture naturally promotes agent communication because of the large receptive field provided by the stacked convolutional layers. Moreover, the spatially shared convolutional parameters enable fast parallel exploration that the experiences discovered by one agent can be immediately transferred to others. The proposed method can be conveniently integrated with general reinforcement learning algorithms, e.g., PPO and Q-learning. We demonstrate the effectiveness of the proposed method in extensive challenging multi-agent tasks in StarCraft II.}\n}", "pdf": "http://proceedings.mlr.press/v97/han19a/han19a.pdf", "supp": "", "pdf_size": 6474824, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16001186120941254392&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Tencent AI Lab, Shenzhen, China; Tencent AI Lab, Shenzhen, China; University of Technology Sydney, Australia+UBTECH SAIC, Univ. 
of Sydney, Australia; Tencent AI Lab, Shenzhen, China; Tencent AI Lab, Shenzhen, China; Tencent AI Lab, Shenzhen, China; Northwestern University, IL, USA; Hong Kong University of Science and Technology, Hong Kong, China", "aff_domain": "gmail.com; ; ; ; ; ; ; ", "email": "gmail.com; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/han19a.html", "aff_unique_index": "0;0;1+2;0;0;0;3;4", "aff_unique_norm": "Tencent;University of Technology Sydney;University of Sydney;Northwestern University;Hong Kong University of Science and Technology", "aff_unique_dep": "AI Lab;;;;", "aff_unique_url": "https://ai.tencent.com;https://www.uts.edu.au;https://www.sydney.edu.au;https://www.northwestern.edu;https://www.ust.hk", "aff_unique_abbr": "Tencent AI Lab;UTS;USYD;NU;HKUST", "aff_campus_unique_index": "0;0;;0;0;0;2;3", "aff_campus_unique": "Shenzhen;;Evanston;Hong Kong", "aff_country_unique_index": "0;0;1+1;0;0;0;2;0", "aff_country_unique": "China;Australia;United States" }, { "title": "Gromov-Wasserstein Learning for Graph Matching and Node Embedding", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3845", "id": "3845", "author_site": "Hongteng Xu, Dixin Luo, Hongyuan Zha, Lawrence Carin", "author": "Hongteng Xu; Dixin Luo; Hongyuan Zha; Lawrence Carin Duke", "abstract": "A novel Gromov-Wasserstein learning framework is proposed to jointly match (align) graphs and learn embedding vectors for the associated graph nodes. Using Gromov-Wasserstein discrepancy, we measure the dissimilarity between two graphs and find their correspondence, according to the learned optimal transport. The node embeddings associated with the two graphs are learned under the guidance of the optimal transport, the distance of which not only reflects the topological structure of each graph but also yields the correspondence across the graphs. These two learning steps are mutually-beneficial, and are unified here by minimizing the Gromov-Wasserstein discrepancy with structural regularizers. This framework leads to an optimization problem that is solved by a proximal point method. We apply the proposed method to matching problems in real-world networks, and demonstrate its superior performance compared to alternative approaches.", "bibtex": "@InProceedings{pmlr-v97-xu19b,\n title = \t {Gromov-{W}asserstein Learning for Graph Matching and Node Embedding},\n author = {Xu, Hongteng and Luo, Dixin and Zha, Hongyuan and Duke, Lawrence Carin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6932--6941},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/xu19b/xu19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/xu19b.html},\n abstract = \t {A novel Gromov-Wasserstein learning framework is proposed to jointly match (align) graphs and learn embedding vectors for the associated graph nodes. Using Gromov-Wasserstein discrepancy, we measure the dissimilarity between two graphs and find their correspondence, according to the learned optimal transport. The node embeddings associated with the two graphs are learned under the guidance of the optimal transport, the distance of which not only reflects the topological structure of each graph but also yields the correspondence across the graphs. 
These two learning steps are mutually-beneficial, and are unified here by minimizing the Gromov-Wasserstein discrepancy with structural regularizers. This framework leads to an optimization problem that is solved by a proximal point method. We apply the proposed method to matching problems in real-world networks, and demonstrate its superior performance compared to alternative approaches.}\n}", "pdf": "http://proceedings.mlr.press/v97/xu19b/xu19b.pdf", "supp": "", "pdf_size": 941887, "gs_citation": 323, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17323824579705471287&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "In\ufb01nia ML, Inc., Durham, NC, USA+Department of ECE, Duke University, Durham, NC, USA; Department of ECE, Duke University, Durham, NC, USA; College of Computing, Georgia Institute of Technology, Atlanta, GA, USA; Department of ECE, Duke University, Durham, NC, USA", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/xu19b.html", "aff_unique_index": "0+1;1;2;1", "aff_unique_norm": "In\ufb01nia ML, Inc.;Duke University;Georgia Institute of Technology", "aff_unique_dep": ";Department of Electrical and Computer Engineering;College of Computing", "aff_unique_url": ";https://www.duke.edu;https://www.gatech.edu", "aff_unique_abbr": ";Duke;Georgia Tech", "aff_campus_unique_index": "1;1;2;1", "aff_campus_unique": ";Durham;Atlanta", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Guarantees for Spectral Clustering with Fairness Constraints", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4249", "id": "4249", "author_site": "Matth\u00e4us Kleindessner, Samira Samadi, Pranjal Awasthi, Jamie Morgenstern", "author": "Matth\u00e4us Kleindessner; Samira Samadi; Pranjal Awasthi; Jamie Morgenstern", "abstract": "Given the widespread popularity of spectral clustering (SC) for partitioning graph data, we study a version of constrained SC in which we try to incorporate the fairness notion proposed by Chierichetti et al. (2017). According to this notion, a clustering is fair if every demographic group is approximately proportionally represented in each cluster. To this end, we develop variants of both normalized and unnormalized constrained SC and show that they help find fairer clusterings on both synthetic and real data. We also provide a rigorous theoretical analysis of our algorithms on a natural variant of the stochastic block model, where $h$ groups have strong inter-group connectivity, but also exhibit a \u201cnatural\u201d clustering structure which is fair. 
We prove that our algorithms can recover this fair clustering with high probability.", "bibtex": "@InProceedings{pmlr-v97-kleindessner19b,\n title = \t {Guarantees for Spectral Clustering with Fairness Constraints},\n author = {Kleindessner, Matth{\\\"a}us and Samadi, Samira and Awasthi, Pranjal and Morgenstern, Jamie},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3458--3467},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kleindessner19b/kleindessner19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/kleindessner19b.html},\n abstract = \t {Given the widespread popularity of spectral clustering (SC) for partitioning graph data, we study a version of constrained SC in which we try to incorporate the fairness notion proposed by Chierichetti et al. (2017). According to this notion, a clustering is fair if every demographic group is approximately proportionally represented in each cluster. To this end, we develop variants of both normalized and unnormalized constrained SC and show that they help find fairer clusterings on both synthetic and real data. We also provide a rigorous theoretical analysis of our algorithms on a natural variant of the stochastic block model, where $h$ groups have strong inter-group connectivity, but also exhibit a \u201cnatural\u201d clustering structure which is fair. We prove that our algorithms can recover this fair clustering with high probability.}\n}", "pdf": "http://proceedings.mlr.press/v97/kleindessner19b/kleindessner19b.pdf", "supp": "", "pdf_size": 625399, "gs_citation": 219, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10455657164331034065&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, Rutgers University, NJ; College of Computing, Georgia Tech, GA; Department of Computer Science, Rutgers University, NJ; College of Computing, Georgia Tech, GA", "aff_domain": "rutgers.edu;gatech.edu;rutgers.edu;cs.gatech.edu", "email": "rutgers.edu;gatech.edu;rutgers.edu;cs.gatech.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/kleindessner19b.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Rutgers University;Georgia Institute of Technology", "aff_unique_dep": "Department of Computer Science;College of Computing", "aff_unique_url": "https://www.rutgers.edu;https://www.gatech.edu", "aff_unique_abbr": "Rutgers;Georgia Tech", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "New Brunswick;Georgia Tech", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Guided evolutionary strategies: augmenting random search with surrogate gradients", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3975", "id": "3975", "author_site": "Niru Maheswaranathan, Luke Metz, George Tucker, Dami Choi, Jascha Sohl-Dickstein", "author": "Niru Maheswaranathan; Luke Metz; George Tucker; Dami Choi; Jascha Sohl-Dickstein", "abstract": "Many applications in machine learning require optimizing a function whose true gradient is unknown or computationally expensive, but where surrogate gradient information, directions that may be correlated with the true gradient, is cheaply available. 
For example, this occurs when an approximate gradient is easier to compute than the full gradient (e.g. in meta-learning or unrolled optimization), or when a true gradient is intractable and is replaced with a surrogate (e.g. in reinforcement learning or training networks with discrete variables). We propose Guided Evolutionary Strategies (GES), a method for optimally using surrogate gradient directions to accelerate random search. GES defines a search distribution for evolutionary strategies that is elongated along a subspace spanned by the surrogate gradients and estimates a descent direction which can then be passed to a first-order optimizer. We analytically and numerically characterize the tradeoffs that result from tuning how strongly the search distribution is stretched along the guiding subspace and use this to derive a setting of the hyperparameters that works well across problems. We evaluate GES on several example problems, demonstrating an improvement over both standard evolutionary strategies and first-order methods that directly follow the surrogate gradient.", "bibtex": "@InProceedings{pmlr-v97-maheswaranathan19a,\n title = \t {Guided evolutionary strategies: augmenting random search with surrogate gradients},\n author = {Maheswaranathan, Niru and Metz, Luke and Tucker, George and Choi, Dami and Sohl-Dickstein, Jascha},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4264--4273},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/maheswaranathan19a/maheswaranathan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/maheswaranathan19a.html},\n abstract = \t {Many applications in machine learning require optimizing a function whose true gradient is unknown or computationally expensive, but where surrogate gradient information, directions that may be correlated with the true gradient, is cheaply available. For example, this occurs when an approximate gradient is easier to compute than the full gradient (e.g. in meta-learning or unrolled optimization), or when a true gradient is intractable and is replaced with a surrogate (e.g. in reinforcement learning or training networks with discrete variables). We propose Guided Evolutionary Strategies (GES), a method for optimally using surrogate gradient directions to accelerate random search. GES defines a search distribution for evolutionary strategies that is elongated along a subspace spanned by the surrogate gradients and estimates a descent direction which can then be passed to a first-order optimizer. We analytically and numerically characterize the tradeoffs that result from tuning how strongly the search distribution is stretched along the guiding subspace and use this to derive a setting of the hyperparameters that works well across problems. 
We evaluate GES on several example problems, demonstrating an improvement over both standard evolutionary strategies and first-order methods that directly follow the surrogate gradient.}\n}", "pdf": "http://proceedings.mlr.press/v97/maheswaranathan19a/maheswaranathan19a.pdf", "supp": "", "pdf_size": 2810391, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13097058951649931158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Google Research, Brain Team, Mountain View, CA, United States; Google Research, Brain Team, Mountain View, CA, United States; Google Research, Brain Team, Mountain View, CA, United States; Google Research, Brain Team, Mountain View, CA, United States; Google Research, Brain Team, Mountain View, CA, United States", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/maheswaranathan19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research, Brain Team", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "HOList: An Environment for Machine Learning of Higher Order Logic Theorem Proving", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3906", "id": "3906", "author_site": "Kshitij Bansal, Sarah Loos, Markus Rabe, Christian Szegedy, Stewart Wilcox", "author": "Kshitij Bansal; Sarah Loos; Markus Rabe; Christian Szegedy; Stewart Wilcox", "abstract": "We present an environment, benchmark, and deep learning driven automated theorem prover for higher-order logic. Higher-order interactive theorem provers enable the formalization of arbitrary mathematical theories and thereby present an interesting challenge for deep learning. We provide an open-source framework based on the HOL Light theorem prover that can be used as a reinforcement learning environment. HOL Light comes with a broad coverage of basic mathematical theorems on calculus and the formal proof of the Kepler conjecture, from which we derive a challenging benchmark for automated reasoning approaches. We also present a deep reinforcement learning driven automated theorem prover, DeepHOL, that gives strong initial results on this benchmark.", "bibtex": "@InProceedings{pmlr-v97-bansal19a,\n title = \t {{HOL}ist: An Environment for Machine Learning of Higher Order Logic Theorem Proving},\n author = {Bansal, Kshitij and Loos, Sarah and Rabe, Markus and Szegedy, Christian and Wilcox, Stewart},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {454--463},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bansal19a/bansal19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bansal19a.html},\n abstract = \t {We present an environment, benchmark, and deep learning driven automated theorem prover for higher-order logic. Higher-order interactive theorem provers enable the formalization of arbitrary mathematical theories and thereby present an interesting challenge for deep learning. 
We provide an open-source framework based on the HOL Light theorem prover that can be used as a reinforcement learning environment. HOL Light comes with a broad coverage of basic mathematical theorems on calculus and the formal proof of the Kepler conjecture, from which we derive a challenging benchmark for automated reasoning approaches. We also present a deep reinforcement learning driven automated theorem prover, DeepHOL, that gives strong initial results on this benchmark.}\n}", "pdf": "http://proceedings.mlr.press/v97/bansal19a/bansal19a.pdf", "supp": "", "pdf_size": 268621, "gs_citation": 194, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3106446736100509794&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Google Research, Mountain View, California, USA; Google Research, Mountain View, California, USA; Google Research, Mountain View, California, USA; Google Research, Mountain View, California, USA; Google Research, Mountain View, California, USA", "aff_domain": "google.com;google.com; ; ; ", "email": "google.com;google.com; ; ; ", "github": "", "project": "http://deephol.org", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/bansal19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Hessian Aided Policy Gradient", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4137", "id": "4137", "author_site": "Zebang Shen, Alejandro Ribeiro, Hamed Hassani, Hui Qian, Chao Mi", "author": "Zebang Shen; Alejandro Ribeiro; Hamed Hassani; Hui Qian; Chao Mi", "abstract": "Reducing the variance of estimators for policy gradient has long been the focus of reinforcement learning research. \tWhile classic algorithms like REINFORCE find an $\\epsilon$-approximate first-order stationary point in $\\OM({1}/{\\epsilon^4})$ random trajectory simulations, no provable improvement on the complexity has been made so far. \tThis paper presents a Hessian aided policy gradient method with the first improved sample complexity of $\\OM({1}/{\\epsilon^3})$. \tWhile our method exploits information from the policy Hessian, it can be implemented in linear time with respect to the parameter dimension and is hence applicable to sophisticated DNN parameterization. \tSimulations on standard tasks validate the efficiency of our method.", "bibtex": "@InProceedings{pmlr-v97-shen19d,\n title = \t {Hessian Aided Policy Gradient},\n author = {Shen, Zebang and Ribeiro, Alejandro and Hassani, Hamed and Qian, Hui and Mi, Chao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5729--5738},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shen19d/shen19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/shen19d.html},\n abstract = \t {Reducing the variance of estimators for policy gradient has long been the focus of reinforcement learning research. 
\tWhile classic algorithms like REINFORCE find an $\\epsilon$-approximate first-order stationary point in $\\OM({1}/{\\epsilon^4})$ random trajectory simulations, no provable improvement on the complexity has been made so far. \tThis paper presents a Hessian aided policy gradient method with the first improved sample complexity of $\\OM({1}/{\\epsilon^3})$. \tWhile our method exploits information from the policy Hessian, it can be implemented in linear time with respect to the parameter dimension and is hence applicable to sophisticated DNN parameterization. \tSimulations on standard tasks validate the efficiency of our method.}\n}", "pdf": "http://proceedings.mlr.press/v97/shen19d/shen19d.pdf", "supp": "", "pdf_size": 2652918, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11909529055837242660&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Zhejiang University; University of Pennsylvania; Zhejiang University; Zhejiang University; University of Pennsylvania", "aff_domain": "zju.edu.cn;seas.upenn.edu; ; ; ", "email": "zju.edu.cn;seas.upenn.edu; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/shen19d.html", "aff_unique_index": "0;1;0;0;1", "aff_unique_norm": "Zhejiang University;University of Pennsylvania", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.upenn.edu", "aff_unique_abbr": "ZJU;UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Heterogeneous Model Reuse via Optimizing Multiparty Multiclass Margin", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4036", "id": "4036", "author_site": "Xi-Zhu Wu, Song Liu, Zhi-Hua Zhou", "author": "Xi-Zhu Wu; Song Liu; Zhi-Hua Zhou", "abstract": "Nowadays, many problems require learning a model from data owned by different participants who are restricted to share their examples due to privacy concerns, which is referred to as multiparty learning in the literature. In conventional multiparty learning, a global model is usually trained from scratch via a communication protocol, ignoring the fact that each party may already have a local model trained on her own dataset. In this paper, we define a multiparty multiclass margin to measure the global behavior of a set of heterogeneous local models, and propose a general learning method called HMR (Heterogeneous Model Reuse) to optimize the margin. Our method reuses local models to approximate a global model, even when data are non-i.i.d distributed among parties, by exchanging few examples under predefined budget. 
Experiments on synthetic and real-world data covering different multiparty scenarios show the effectiveness of our proposal.", "bibtex": "@InProceedings{pmlr-v97-wu19c,\n title = \t {Heterogeneous Model Reuse via Optimizing Multiparty Multiclass Margin},\n author = {Wu, Xi-Zhu and Liu, Song and Zhou, Zhi-Hua},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6840--6849},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wu19c/wu19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/wu19c.html},\n abstract = \t {Nowadays, many problems require learning a model from data owned by different participants who are restricted to share their examples due to privacy concerns, which is referred to as multiparty learning in the literature. In conventional multiparty learning, a global model is usually trained from scratch via a communication protocol, ignoring the fact that each party may already have a local model trained on her own dataset. In this paper, we define a multiparty multiclass margin to measure the global behavior of a set of heterogeneous local models, and propose a general learning method called HMR (Heterogeneous Model Reuse) to optimize the margin. Our method reuses local models to approximate a global model, even when data are non-i.i.d distributed among parties, by exchanging few examples under predefined budget. Experiments on synthetic and real-world data covering different multiparty scenarios show the effectiveness of our proposal.}\n}", "pdf": "http://proceedings.mlr.press/v97/wu19c/wu19c.pdf", "supp": "", "pdf_size": 351358, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1338333402739856275&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China; University of Bristol, Bristol, United Kingdom + The Alan Turing Institute, London, United Kingdom; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, China", "aff_domain": "lamda.nju.edu.cn;bristol.ac.uk;lamda.nju.edu.cn", "email": "lamda.nju.edu.cn;bristol.ac.uk;lamda.nju.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/wu19c.html", "aff_unique_index": "0;1+2;0", "aff_unique_norm": "Nanjing University;University of Bristol;Alan Turing Institute", "aff_unique_dep": "National Key Laboratory for Novel Software Technology;;", "aff_unique_url": "http://www.nju.edu.cn;https://www.bristol.ac.uk;https://www.turing.ac.uk", "aff_unique_abbr": "Nanjing U;UoB;ATI", "aff_campus_unique_index": "0;1+2;0", "aff_campus_unique": "Nanjing;Bristol;London", "aff_country_unique_index": "0;1+1;0", "aff_country_unique": "China;United Kingdom" }, { "title": "HexaGAN: Generative Adversarial Nets for Real World Classification", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3594", "id": "3594", "author_site": "Uiwon Hwang, Dahuin Jung, Sungroh Yoon", "author": "Uiwon Hwang; Dahuin Jung; Sungroh Yoon", "abstract": "Most deep learning classification studies assume clean data. However, when dealing with the real world data, we encounter three problems such as 1) missing data, 2) class imbalance, and 3) missing label problems. 
These problems undermine the performance of a classifier. Various preprocessing techniques have been proposed to mitigate one of these problems, but an algorithm that assumes and resolves all three problems together has not been proposed yet. In this paper, we propose HexaGAN, a generative adversarial network framework that shows promising classification performance for all three problems. We interpret the three problems from a single perspective to solve them jointly. To enable this, the framework consists of six components, which interact with each other. We also devise novel loss functions corresponding to the architecture. The designed loss functions allow us to achieve state-of-the-art imputation performance, with up to a 14% improvement, and to generate high-quality class-conditional data. We evaluate the classification performance (F1-score) of the proposed method with 20% missingness and confirm up to a 5% improvement in comparison with the performance of combinations of state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v97-hwang19a,\n title = \t {{H}exa{GAN}: Generative Adversarial Nets for Real World Classification},\n author = {Hwang, Uiwon and Jung, Dahuin and Yoon, Sungroh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2921--2930},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hwang19a/hwang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hwang19a.html},\n abstract = \t {Most deep learning classification studies assume clean data. However, when dealing with the real world data, we encounter three problems such as 1) missing data, 2) class imbalance, and 3) missing label problems. These problems undermine the performance of a classifier. Various preprocessing techniques have been proposed to mitigate one of these problems, but an algorithm that assumes and resolves all three problems together has not been proposed yet. In this paper, we propose HexaGAN, a generative adversarial network framework that shows promising classification performance for all three problems. We interpret the three problems from a single perspective to solve them jointly. To enable this, the framework consists of six components, which interact with each other. We also devise novel loss functions corresponding to the architecture. The designed loss functions allow us to achieve state-of-the-art imputation performance, with up to a 14% improvement, and to generate high-quality class-conditional data. 
We evaluate the classification performance (F1-score) of the proposed method with 20% missingness and confirm up to a 5% improvement in comparison with the performance of combinations of state-of-the-art methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/hwang19a/hwang19a.pdf", "supp": "", "pdf_size": 2388034, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9625100105337863533&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Electrical and Computer Engineering, Seoul National University, Seoul, Korea; Electrical and Computer Engineering, Seoul National University, Seoul, Korea; ASRI, INMC, Institute of Engineering Research, Seoul National University, Seoul, Korea", "aff_domain": "snu.ac.kr; ; ", "email": "snu.ac.kr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/hwang19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "Electrical and Computer Engineering", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Hierarchical Decompositional Mixtures of Variational Autoencoders", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4034", "id": "4034", "author_site": "Ping Liang Tan, Robert Peharz", "author": "Ping Liang Tan; Robert Peharz", "abstract": "Variational autoencoders (VAEs) have received considerable attention, since they allow us to learn expressive neural density estimators effectively and efficiently. However, learning and inference in VAEs is still problematic due to the sensitive interplay between the generative model and the inference network. Since these problems become generally more severe in high dimensions, we propose a novel hierarchical mixture model over low-dimensional VAE experts. Our model decomposes the overall learning problem into many smaller problems, which are coordinated by the hierarchical mixture, represented by a sum-product network. In experiments we show that our models outperform classical VAEs on almost all of our experimental benchmarks. Moreover, we show that our model is highly data efficient and degrades very gracefully in extremely low data regimes.", "bibtex": "@InProceedings{pmlr-v97-tan19b,\n title = \t {Hierarchical Decompositional Mixtures of Variational Autoencoders},\n author = {Tan, Ping Liang and Peharz, Robert},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6115--6124},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tan19b/tan19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/tan19b.html},\n abstract = \t {Variational autoencoders (VAEs) have received considerable attention, since they allow us to learn expressive neural density estimators effectively and efficiently. However, learning and inference in VAEs is still problematic due to the sensitive interplay between the generative model and the inference network. Since these problems become generally more severe in high dimensions, we propose a novel hierarchical mixture model over low-dimensional VAE experts. 
Our model decomposes the overall learning problem into many smaller problems, which are coordinated by the hierarchical mixture, represented by a sum-product network. In experiments we show that our models outperform classical VAEs on almost all of our experimental benchmarks. Moreover, we show that our model is highly data efficient and degrades very gracefully in extremely low data regimes.ow data regimes.}\n}", "pdf": "http://proceedings.mlr.press/v97/tan19b/tan19b.pdf", "supp": "", "pdf_size": 1331857, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11566802669636050102&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Engineering, University of Cambridge, UK+DSO National Laboratories, Singapore; Department of Engineering, University of Cambridge, UK", "aff_domain": "gmail.com;cam.ac.uk", "email": "gmail.com;cam.ac.uk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/tan19b.html", "aff_unique_index": "0+1;0", "aff_unique_norm": "University of Cambridge;DSO National Laboratories", "aff_unique_dep": "Department of Engineering;", "aff_unique_url": "https://www.cam.ac.uk;https://www.dso.org.sg", "aff_unique_abbr": "Cambridge;DSO", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0+1;0", "aff_country_unique": "United Kingdom;Singapore" }, { "title": "Hierarchical Importance Weighted Autoencoders", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4323", "id": "4323", "author_site": "Chin-Wei Huang, Kris Sankaran, Eeshan Dhekane, Alexandre Lacoste, Aaron Courville", "author": "Chin-Wei Huang; Kris Sankaran; Eeshan Dhekane; Alexandre Lacoste; Aaron Courville", "abstract": "Importance weighted variational inference (Burda et al., 2015) uses multiple i.i.d. samples to have a tighter variational lower bound. We believe a joint proposal has the potential of reducing the number of redundant samples, and introduce a hierarchical structure to induce correlation. The hope is that the proposals would coordinate to make up for the error made by one another to reduce the variance of the importance estimator. Theoretically, we analyze the condition under which convergence of the estimator variance can be connected to convergence of the lower bound. Empirically, we confirm that maximization of the lower bound does implicitly minimize variance. Further analysis shows that this is a result of negative correlation induced by the proposed hierarchical meta sampling scheme, and performance of inference also improves when the number of samples increases.", "bibtex": "@InProceedings{pmlr-v97-huang19d,\n title = \t {Hierarchical Importance Weighted Autoencoders},\n author = {Huang, Chin-Wei and Sankaran, Kris and Dhekane, Eeshan and Lacoste, Alexandre and Courville, Aaron},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2869--2878},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/huang19d/huang19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/huang19d.html},\n abstract = \t {Importance weighted variational inference (Burda et al., 2015) uses multiple i.i.d. samples to have a tighter variational lower bound. 
We believe a joint proposal has the potential of reducing the number of redundant samples, and introduce a hierarchical structure to induce correlation. The hope is that the proposals would coordinate to make up for the error made by one another to reduce the variance of the importance estimator. Theoretically, we analyze the condition under which convergence of the estimator variance can be connected to convergence of the lower bound. Empirically, we confirm that maximization of the lower bound does implicitly minimize variance. Further analysis shows that this is a result of negative correlation induced by the proposed hierarchical meta sampling scheme, and performance of inference also improves when the number of samples increases.}\n}", "pdf": "http://proceedings.mlr.press/v97/huang19d/huang19d.pdf", "supp": "", "pdf_size": 1017185, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6799889309902559355&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Mila, University of Montreal; Mila, University of Montreal; Mila, University of Montreal; Element AI; Mila, University of Montreal+CIFAR", "aff_domain": "umontreal.ca; ; ; ; ", "email": "umontreal.ca; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/huang19d.html", "aff_unique_index": "0;0;0;1;0+2", "aff_unique_norm": "University of Montreal;Element AI;Canadian Institute for Advanced Research", "aff_unique_dep": "Mila;;", "aff_unique_url": "https://www.mila.quebec;https://www.elementai.com;https://www.cifar.ca", "aff_unique_abbr": "Mila;Element AI;CIFAR", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Montreal;", "aff_country_unique_index": "0;0;0;0;0+0", "aff_country_unique": "Canada" }, { "title": "Hierarchically Structured Meta-learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3627", "id": "3627", "author_site": "Huaxiu Yao, Ying WEI, Junzhou Huang, Zhenhui (Jessie) Li", "author": "Huaxiu Yao; Ying Wei; Junzhou Huang; Zhenhui Li", "abstract": "In order to learn quickly with few samples, meta-learning utilizes prior knowledge learned from previous tasks. However, a critical challenge in meta-learning is task uncertainty and heterogeneity, which can not be handled via globally sharing knowledge among tasks. In this paper, based on gradient-based meta-learning, we propose a hierarchically structured meta-learning (HSML) algorithm that explicitly tailors the transferable knowledge to different clusters of tasks. Inspired by the way human beings organize knowledge, we resort to a hierarchical task clustering structure to cluster tasks. As a result, the proposed approach not only addresses the challenge via the knowledge customization to different clusters of tasks, but also preserves knowledge generalization among a cluster of similar tasks. To tackle the changing of task relationship, in addition, we extend the hierarchical structure to a continual learning environment. 
The experimental results show that our approach can achieve state-of-the-art performance in both toy-regression and few-shot image classification problems.", "bibtex": "@InProceedings{pmlr-v97-yao19b,\n title = \t {Hierarchically Structured Meta-learning},\n author = {Yao, Huaxiu and Wei, Ying and Huang, Junzhou and Li, Zhenhui},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7045--7054},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yao19b/yao19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/yao19b.html},\n abstract = \t {In order to learn quickly with few samples, meta-learning utilizes prior knowledge learned from previous tasks. However, a critical challenge in meta-learning is task uncertainty and heterogeneity, which can not be handled via globally sharing knowledge among tasks. In this paper, based on gradient-based meta-learning, we propose a hierarchically structured meta-learning (HSML) algorithm that explicitly tailors the transferable knowledge to different clusters of tasks. Inspired by the way human beings organize knowledge, we resort to a hierarchical task clustering structure to cluster tasks. As a result, the proposed approach not only addresses the challenge via the knowledge customization to different clusters of tasks, but also preserves knowledge generalization among a cluster of similar tasks. To tackle the changing of task relationship, in addition, we extend the hierarchical structure to a continual learning environment. The experimental results show that our approach can achieve state-of-the-art performance in both toy-regression and few-shot image classification problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/yao19b/yao19b.pdf", "supp": "", "pdf_size": 1328799, "gs_citation": 265, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3487980416117206371&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "College of Information Science and Technology, Pennsylvania State University, PA, USA+Tencent AI Lab; Tencent AI Lab, Shenzhen, China; Tencent AI Lab, Shenzhen, China; College of Information Science and Technology, Pennsylvania State University, PA, USA", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/yao19b.html", "aff_unique_index": "0+1;1;1;0", "aff_unique_norm": "Pennsylvania State University;Tencent", "aff_unique_dep": "College of Information Science and Technology;Tencent AI Lab", "aff_unique_url": "https://www.psu.edu;https://ai.tencent.com", "aff_unique_abbr": "PSU;Tencent AI Lab", "aff_campus_unique_index": "0;2;2;0", "aff_campus_unique": "University Park;;Shenzhen", "aff_country_unique_index": "0+1;1;1;0", "aff_country_unique": "United States;China" }, { "title": "High-Fidelity Image Generation With Fewer Labels", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4273", "id": "4273", "author_site": "Mario Lucic, Michael Tschannen, Marvin Ritter, Xiaohua Zhai, Olivier Bachem, Sylvain Gelly", "author": "Mario Lu\u010di\u0107; Michael Tschannen; Marvin Ritter; Xiaohua Zhai; Olivier Bachem; Sylvain Gelly", "abstract": "Deep generative models are becoming a cornerstone of modern machine learning. 
Recent work on conditional generative adversarial networks has shown that learning complex, high-dimensional distributions over natural images is within reach. While the latest models are able to generate high-fidelity, diverse natural images at high resolution, they rely on a vast quantity of labeled data. In this work we demonstrate how one can benefit from recent work on self- and semi-supervised learning to outperform the state of the art on both unsupervised ImageNet synthesis, as well as in the conditional setting. In particular, the proposed approach is able to match the sample quality (as measured by FID) of the current state-of-the-art conditional model BigGAN on ImageNet using only 10% of the labels and outperform it using 20% of the labels.", "bibtex": "@InProceedings{pmlr-v97-lucic19a,\n title = \t {High-Fidelity Image Generation With Fewer Labels},\n author = {Lu{\\v{c}}i{\\'c}, Mario and Tschannen, Michael and Ritter, Marvin and Zhai, Xiaohua and Bachem, Olivier and Gelly, Sylvain},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4183--4192},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lucic19a/lucic19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lucic19a.html},\n abstract = \t {Deep generative models are becoming a cornerstone of modern machine learning. Recent work on conditional generative adversarial networks has shown that learning complex, high-dimensional distributions over natural images is within reach. While the latest models are able to generate high-fidelity, diverse natural images at high resolution, they rely on a vast quantity of labeled data. In this work we demonstrate how one can benefit from recent work on self- and semi-supervised learning to outperform the state of the art on both unsupervised ImageNet synthesis, as well as in the conditional setting. 
In particular, the proposed approach is able to match the sample quality (as measured by FID) of the current state-of-the-art conditional model BigGAN on ImageNet using only 10% of the labels and outperform it using 20% of the labels.}\n}", "pdf": "http://proceedings.mlr.press/v97/lucic19a/lucic19a.pdf", "supp": "", "pdf_size": 840179, "gs_citation": 183, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13622749687496052538&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Google Research, Brain Team; ETH Zurich; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team", "aff_domain": "google.com;gmail.com;google.com; ; ; ", "email": "google.com;gmail.com;google.com; ; ; ", "github": "github.com/google/compare_gan", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/lucic19a.html", "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "Google;ETH Zurich", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://www.ethz.ch", "aff_unique_abbr": "Google;ETHZ", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;Switzerland" }, { "title": "Hiring Under Uncertainty", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4304", "id": "4304", "author_site": "Manish Purohit, Sreenivas Gollapudi, Manish Raghavan", "author": "Manish Purohit; Sreenivas Gollapudi; Manish Raghavan", "abstract": "In this paper we introduce the hiring under uncertainty problem to model the questions faced by hiring committees in large enterprises and universities alike. Given a set of $n$ eligible candidates, the decision maker needs to choose the sequence of candidates to make offers so as to hire the $k$ best candidates. However, candidates may choose to reject an offer (for instance, due to a competing offer) and the decision maker has a time limit by which all positions must be filled. Given an estimate of the probabilities of acceptance for each candidate, the hiring under uncertainty problem is to design a strategy of making offers so that the total expected value of all candidates hired by the time limit is maximized. We provide a 2-approximation algorithm for the setting where offers must be made in sequence, an 8-approximation when offers may be made in parallel, and a 10-approximation for the more general stochastic knapsack setting with finite probes.", "bibtex": "@InProceedings{pmlr-v97-purohit19a,\n title = \t {Hiring Under Uncertainty},\n author = {Purohit, Manish and Gollapudi, Sreenivas and Raghavan, Manish},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5181--5189},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/purohit19a/purohit19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/purohit19a.html},\n abstract = \t {In this paper we introduce the hiring under uncertainty problem to model the questions faced by hiring committees in large enterprises and universities alike. Given a set of $n$ eligible candidates, the decision maker needs to choose the sequence of candidates to make offers so as to hire the $k$ best candidates. 
However, candidates may choose to reject an offer (for instance, due to a competing offer) and the decision maker has a time limit by which all positions must be filled. Given an estimate of the probabilities of acceptance for each candidate, the hiring under uncertainty problem is to design a strategy of making offers so that the total expected value of all candidates hired by the time limit is maximized. We provide a 2-approximation algorithm for the setting where offers must be made in sequence, an 8-approximation when offers may be made in parallel, and a 10-approximation for the more general stochastic knapsack setting with finite probes.}\n}", "pdf": "http://proceedings.mlr.press/v97/purohit19a/purohit19a.pdf", "supp": "", "pdf_size": 468890, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10688341517890240640&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Computer Science, Cornell University; Google Research; Google Research", "aff_domain": "cs.cornell.edu; ; ", "email": "cs.cornell.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/purohit19a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Cornell University;Google", "aff_unique_dep": "Department of Computer Science;Google Research", "aff_unique_url": "https://www.cornell.edu;https://research.google", "aff_unique_abbr": "Cornell;Google Research", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Homomorphic Sensing", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3741", "id": "3741", "author_site": "Manolis Tsakiris, Liangzu Peng", "author": "Manolis Tsakiris; Liangzu Peng", "abstract": "A recent line of research termed \"unlabeled sensing\" and \"shuffled linear regression\" has been exploring under great generality the recovery of signals from subsampled and permuted measurements; a challenging problem in diverse fields of data science and machine learning. In this paper we introduce an abstraction of this problem which we call \"homomorphic sensing\". Given a linear subspace and a finite set of linear transformations we develop an algebraic theory which establishes conditions guaranteeing that points in the subspace are uniquely determined from their homomorphic image under some transformation in the set. As a special case, we recover known conditions for unlabeled sensing, as well as new results and extensions. On the algorithmic level we exhibit two dynamic programming based algorithms, which to the best of our knowledge are the first working solutions for the unlabeled sensing problem for small dimensions. 
One of them, additionally based on branch-and-bound, when applied to image registration under affine transformations, performs on par with or outperforms state-of-the-art methods on benchmark datasets.", "bibtex": "@InProceedings{pmlr-v97-tsakiris19a,\n title = \t {Homomorphic Sensing},\n author = {Tsakiris, Manolis and Peng, Liangzu},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6335--6344},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tsakiris19a/tsakiris19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tsakiris19a.html},\n abstract = \t {A recent line of research termed \"unlabeled sensing\" and \"shuffled linear regression\" has been exploring under great generality the recovery of signals from subsampled and permuted measurements; a challenging problem in diverse fields of data science and machine learning. In this paper we introduce an abstraction of this problem which we call \"homomorphic sensing\". Given a linear subspace and a finite set of linear transformations we develop an algebraic theory which establishes conditions guaranteeing that points in the subspace are uniquely determined from their homomorphic image under some transformation in the set. As a special case, we recover known conditions for unlabeled sensing, as well as new results and extensions. On the algorithmic level we exhibit two dynamic programming based algorithms, which to the best of our knowledge are the first working solutions for the unlabeled sensing problem for small dimensions. One of them, additionally based on branch-and-bound, when applied to image registration under affine transformations, performs on par with or outperforms state-of-the-art methods on benchmark datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/tsakiris19a/tsakiris19a.pdf", "supp": "", "pdf_size": 393782, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5568277261774075964&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Information Science and Technology, ShanghaiTech University, Shanghai, China; School of Information Science and Technology, ShanghaiTech University, Shanghai, China", "aff_domain": "shanghaitech.edu.cn; ", "email": "shanghaitech.edu.cn; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/tsakiris19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "ShanghaiTech University", "aff_unique_dep": "School of Information Science and Technology", "aff_unique_url": "https://www.shanghaitech.edu.cn", "aff_unique_abbr": "ShanghaiTech", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "How does Disagreement Help Generalization against Label Corruption?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3772", "id": "3772", "author_site": "Xingrui Yu, Bo Han, Jiangchao Yao, Gang Niu, Ivor Tsang, Masashi Sugiyama", "author": "Xingrui Yu; Bo Han; Jiangchao Yao; Gang Niu; Ivor Tsang; Masashi Sugiyama", "abstract": "Learning with noisy labels is one of the hottest problems in weakly-supervised learning. 
Based on memorization effects of deep neural networks, training on small-loss instances becomes very promising for handling noisy labels. This fosters the state-of-the-art approach \"Co-teaching\" that cross-trains two deep neural networks using the small-loss trick. However, with the increase of epochs, two networks converge to a consensus and Co-teaching reduces to the self-training MentorNet. To tackle this issue, we propose a robust learning paradigm called Co-teaching+, which bridges the \"Update by Disagreement\u201d strategy with the original Co-teaching. First, two networks feed forward and predict all data, but keep prediction disagreement data only. Then, among such disagreement data, each network selects its small-loss data, but back propagates the small-loss data from its peer network and updates its own parameters. Empirical results on benchmark datasets demonstrate that Co-teaching+ is much superior to many state-of-the-art methods in the robustness of trained models.", "bibtex": "@InProceedings{pmlr-v97-yu19b,\n title = \t {How does Disagreement Help Generalization against Label Corruption?},\n author = {Yu, Xingrui and Han, Bo and Yao, Jiangchao and Niu, Gang and Tsang, Ivor and Sugiyama, Masashi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7164--7173},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yu19b/yu19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/yu19b.html},\n abstract = \t {Learning with noisy labels is one of the hottest problems in weakly-supervised learning. Based on memorization effects of deep neural networks, training on small-loss instances becomes very promising for handling noisy labels. This fosters the state-of-the-art approach \"Co-teaching\" that cross-trains two deep neural networks using the small-loss trick. However, with the increase of epochs, two networks converge to a consensus and Co-teaching reduces to the self-training MentorNet. To tackle this issue, we propose a robust learning paradigm called Co-teaching+, which bridges the \"Update by Disagreement\u201d strategy with the original Co-teaching. First, two networks feed forward and predict all data, but keep prediction disagreement data only. Then, among such disagreement data, each network selects its small-loss data, but back propagates the small-loss data from its peer network and updates its own parameters. 
Empirical results on benchmark datasets demonstrate that Co-teaching+ is much superior to many state-of-the-art methods in the robustness of trained models.}\n}", "pdf": "http://proceedings.mlr.press/v97/yu19b/yu19b.pdf", "supp": "", "pdf_size": 915318, "gs_citation": 975, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18204659170085017674&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "CAI, University of Technology Sydney; RIKEN-AIP; Alibaba Damo Academy; RIKEN-AIP; CAI, University of Technology Sydney; University of Tokyo", "aff_domain": "student.uts.edu.au; ; ; ; ; ", "email": "student.uts.edu.au; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/yu19b.html", "aff_unique_index": "0;1;2;1;0;3", "aff_unique_norm": "University of Technology Sydney;RIKEN Center for Advanced Intelligence Project;Alibaba Group;University of Tokyo", "aff_unique_dep": "CAI;Center for Advanced Intelligence Project;Damo Academy;", "aff_unique_url": "https://www.uts.edu.au;https://aip.Riken.jp;https://www.alibaba-group.com;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "UTS;RIKEN-AIP;Alibaba;UTokyo", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;1;0;1", "aff_country_unique": "Australia;Japan;China" }, { "title": "Humor in Word Embeddings: Cockamamie Gobbledegook for Nincompoops", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3915", "id": "3915", "author_site": "Limor Gultchin, Genevieve Patterson, Nancy Baym, Nathaniel Swinger, Adam Kalai", "author": "Limor Gultchin; Genevieve Patterson; Nancy Baym; Nathaniel Swinger; Adam Kalai", "abstract": "While humor is often thought to be beyond the reach of Natural Language Processing, we show that several aspects of single-word humor correlate with simple linear directions in Word Embeddings. In particular: (a) the word vectors capture multiple aspects discussed in humor theories from various disciplines; (b) each individual\u2019s sense of humor can be represented by a vector, which can predict differences in people\u2019s senses of humor on new, unrated, words; and (c) upon clustering humor ratings of multiple demographic groups, different humor preferences emerge across the different groups. Humor ratings are taken from the work of Engelthaler and Hills (2017) as well as from an original crowdsourcing study of 120,000 words. Our dataset further includes annotations for the theoretically-motivated humor features we identify.", "bibtex": "@InProceedings{pmlr-v97-gultchin19a,\n title = \t {Humor in Word Embeddings: Cockamamie Gobbledegook for Nincompoops},\n author = {Gultchin, Limor and Patterson, Genevieve and Baym, Nancy and Swinger, Nathaniel and Kalai, Adam},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2474--2483},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gultchin19a/gultchin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gultchin19a.html},\n abstract = \t {While humor is often thought to be beyond the reach of Natural Language Processing, we show that several aspects of single-word humor correlate with simple linear directions in Word Embeddings. 
In particular: (a) the word vectors capture multiple aspects discussed in humor theories from various disciplines; (b) each individual\u2019s sense of humor can be represented by a vector, which can predict differences in people\u2019s senses of humor on new, unrated, words; and (c) upon clustering humor ratings of multiple demographic groups, different humor preferences emerge across the different groups. Humor ratings are taken from the work of Engelthaler and Hills (2017) as well as from an original crowdsourcing study of 120,000 words. Our dataset further includes annotations for the theoretically-motivated humor features we identify.}\n}", "pdf": "http://proceedings.mlr.press/v97/gultchin19a/gultchin19a.pdf", "supp": "", "pdf_size": 562709, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13364498492064893478&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "University of Oxford; TRASH; Microsoft Research; Lexington High School; Microsoft Research", "aff_domain": "jesus.ox.ac.uk; ; ; ; ", "email": "jesus.ox.ac.uk; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/gultchin19a.html", "aff_unique_index": "0;2;3;2", "aff_unique_norm": "University of Oxford;;Microsoft;Lexington High School", "aff_unique_dep": ";;Microsoft Research;", "aff_unique_url": "https://www.ox.ac.uk;;https://www.microsoft.com/en-us/research;https://www.lexingtonma.org/domain/4", "aff_unique_abbr": "Oxford;;MSR;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;2;2;2", "aff_country_unique": "United Kingdom;;United States" }, { "title": "Hybrid Models with Deep and Invertible Features", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4312", "id": "4312", "author_site": "Eric Nalisnick, Akihiro Matsukawa, Yee-Whye Teh, Dilan Gorur, Balaji Lakshminarayanan", "author": "Eric Nalisnick; Akihiro Matsukawa; Yee Whye Teh; Dilan Gorur; Balaji Lakshminarayanan", "abstract": "We propose a neural hybrid model consisting of a linear model defined on a set of features computed by a deep, invertible transformation (i.e. a normalizing flow). An attractive property of our model is that both p(features), the density of the features, and p(targets|features), the predictive distribution, can be computed exactly in a single feed-forward pass. We show that our hybrid model, despite the invertibility constraints, achieves similar accuracy to purely predictive models. Yet the generative component remains a good model of the input features despite the hybrid optimization objective. This offers additional capabilities such as detection of out-of-distribution inputs and enabling semi-supervised learning. 
The availability of the exact joint density p(targets, features) also allows us to compute many quantities readily, making our hybrid model a useful building block for downstream applications of probabilistic deep learning.", "bibtex": "@InProceedings{pmlr-v97-nalisnick19b,\n title = \t {Hybrid Models with Deep and Invertible Features},\n author = {Nalisnick, Eric and Matsukawa, Akihiro and Teh, Yee Whye and Gorur, Dilan and Lakshminarayanan, Balaji},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4723--4732},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nalisnick19b/nalisnick19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/nalisnick19b.html},\n abstract = \t {We propose a neural hybrid model consisting of a linear model defined on a set of features computed by a deep, invertible transformation (i.e. a normalizing flow). An attractive property of our model is that both p(features), the density of the features, and p(targets|features), the predictive distribution, can be computed exactly in a single feed-forward pass. We show that our hybrid model, despite the invertibility constraints, achieves similar accuracy to purely predictive models. Yet the generative component remains a good model of the input features despite the hybrid optimization objective. This offers additional capabilities such as detection of out-of-distribution inputs and enabling semi-supervised learning. The availability of the exact joint density p(targets, features) also allows us to compute many quantities readily, making our hybrid model a useful building block for downstream applications of probabilistic deep learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/nalisnick19b/nalisnick19b.pdf", "supp": "", "pdf_size": 1331589, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4325533827209296493&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "DeepMind; DeepMind; DeepMind; DeepMind; DeepMind", "aff_domain": "deepmind.com;deepmind.com;deepmind.com;deepmind.com;google.com", "email": "deepmind.com;deepmind.com;deepmind.com;deepmind.com;google.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/nalisnick19b.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "HyperGAN: A Generative Model for Diverse, Performant Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3923", "id": "3923", "author_site": "Neale Ratzlaff, Fuxin Li", "author": "Neale Ratzlaff; Li Fuxin", "abstract": "We introduce HyperGAN, a generative model that learns to generate all the parameters of a deep neural network. HyperGAN first transforms low dimensional noise into a latent space, which can be sampled from to obtain diverse, performant sets of parameters for a target architecture. We utilize an architecture that bears resemblance to generative adversarial networks, but we evaluate the likelihood of generated samples with a classification loss. 
This is equivalent to minimizing the KL-divergence between the distribution of generated parameters, and the unknown true parameter distribution. We apply HyperGAN to classification, showing that HyperGAN can learn to generate parameters which solve the MNIST and CIFAR-10 datasets with competitive performance to fully supervised learning, while also generating a rich distribution of effective parameters. We also show that HyperGAN can also provide better uncertainty estimates than standard ensembles. This is evidenced by the ability of HyperGAN-generated ensembles to detect out of distribution data as well as adversarial examples.", "bibtex": "@InProceedings{pmlr-v97-ratzlaff19a,\n title = \t {{H}yper{GAN}: A Generative Model for Diverse, Performant Neural Networks},\n author = {Ratzlaff, Neale and Fuxin, Li},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5361--5369},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ratzlaff19a/ratzlaff19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ratzlaff19a.html},\n abstract = \t {We introduce HyperGAN, a generative model that learns to generate all the parameters of a deep neural network. HyperGAN first transforms low dimensional noise into a latent space, which can be sampled from to obtain diverse, performant sets of parameters for a target architecture. We utilize an architecture that bears resemblance to generative adversarial networks, but we evaluate the likelihood of generated samples with a classification loss. This is equivalent to minimizing the KL-divergence between the distribution of generated parameters, and the unknown true parameter distribution. We apply HyperGAN to classification, showing that HyperGAN can learn to generate parameters which solve the MNIST and CIFAR-10 datasets with competitive performance to fully supervised learning, while also generating a rich distribution of effective parameters. We also show that HyperGAN can also provide better uncertainty estimates than standard ensembles. 
This is evidenced by the ability of HyperGAN-generated ensembles to detect out of distribution data as well as adversarial examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/ratzlaff19a/ratzlaff19a.pdf", "supp": "", "pdf_size": 679563, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9558964378172570115&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "School of Electrical Engineering and Computer Science, Oregon State University; School of Electrical Engineering and Computer Science, Oregon State University", "aff_domain": "oregonstate.edu;oregonstate.edu", "email": "oregonstate.edu;oregonstate.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/ratzlaff19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Oregon State University", "aff_unique_dep": "School of Electrical Engineering and Computer Science", "aff_unique_url": "https://osu.edu", "aff_unique_abbr": "OSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Corvallis", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Hyperbolic Disk Embeddings for Directed Acyclic Graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4043", "id": "4043", "author_site": "Ryota Suzuki, Ryusuke Takahama, Shun Onoda", "author": "Ryota Suzuki; Ryusuke Takahama; Shun Onoda", "abstract": "Obtaining continuous representations of structural data such as directed acyclic graphs (DAGs) has gained attention in machine learning and artificial intelligence. However, embedding complex DAGs in which both ancestors and descendants of nodes are exponentially increasing is difficult. Tackling in this problem, we develop Disk Embeddings, which is a framework for embedding DAGs into quasi-metric spaces. Existing state-of-the-art methods, Order Embeddings and Hyperbolic Entailment Cones, are instances of Disk Embedding in Euclidean space and spheres respectively. Furthermore, we propose a novel method Hyperbolic Disk Embeddings to handle exponential growth of relations. The results of our experiments show that our Disk Embedding models outperform existing methods especially in complex DAGs other than trees.", "bibtex": "@InProceedings{pmlr-v97-suzuki19a,\n title = \t {Hyperbolic Disk Embeddings for Directed Acyclic Graphs},\n author = {Suzuki, Ryota and Takahama, Ryusuke and Onoda, Shun},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6066--6075},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/suzuki19a/suzuki19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/suzuki19a.html},\n abstract = \t {Obtaining continuous representations of structural data such as directed acyclic graphs (DAGs) has gained attention in machine learning and artificial intelligence. However, embedding complex DAGs in which both ancestors and descendants of nodes are exponentially increasing is difficult. Tackling in this problem, we develop Disk Embeddings, which is a framework for embedding DAGs into quasi-metric spaces. Existing state-of-the-art methods, Order Embeddings and Hyperbolic Entailment Cones, are instances of Disk Embedding in Euclidean space and spheres respectively. 
Furthermore, we propose a novel method Hyperbolic Disk Embeddings to handle exponential growth of relations. The results of our experiments show that our Disk Embedding models outperform existing methods especially in complex DAGs other than trees.}\n}", "pdf": "http://proceedings.mlr.press/v97/suzuki19a/suzuki19a.pdf", "supp": "", "pdf_size": 186222, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15999788633415414766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "LAPRAS Inc., Tokyo, Japan; LAPRAS Inc., Tokyo, Japan; LAPRAS Inc., Tokyo, Japan", "aff_domain": "lapras.com; ; ", "email": "lapras.com; ; ", "github": "", "project": "https://git-scm.com/docs/user-manual", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/suzuki19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "LAPRAS Inc.", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Japan" }, { "title": "IMEXnet A Forward Stable Deep Neural Network", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3861", "id": "3861", "author_site": "Eldad Haber, Keegan Lensink, Eran Treister, Lars Ruthotto", "author": "Eldad Haber; Keegan Lensink; Eran Treister; Lars Ruthotto", "abstract": "Deep convolutional neural networks have revolutionized many machine learning and computer vision tasks, however, some remaining key challenges limit their wider use. These challenges include improving the network\u2019s robustness to perturbations of the input image and the limited \u201cfield of view\u201d of convolution operators. We introduce the IMEXnet that addresses these challenges by adapting semi-implicit methods for partial differential equations. Compared to similar explicit networks, such as residual networks, our network is more stable, which has recently shown to reduce the sensitivity to small changes in the input features and improve generalization. The addition of an implicit step connects all pixels in each channel of the image and therefore addresses the field of view problem while still being comparable to standard convolutions in terms of the number of parameters and computational complexity. We also present a new dataset for semantic segmentation and demonstrate the effectiveness of our architecture using the NYU Depth dataset.", "bibtex": "@InProceedings{pmlr-v97-haber19a,\n title = \t {{IMEX}net A Forward Stable Deep Neural Network},\n author = {Haber, Eldad and Lensink, Keegan and Treister, Eran and Ruthotto, Lars},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2525--2534},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/haber19a/haber19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/haber19a.html},\n abstract = \t {Deep convolutional neural networks have revolutionized many machine learning and computer vision tasks, however, some remaining key challenges limit their wider use. These challenges include improving the network\u2019s robustness to perturbations of the input image and the limited \u201cfield of view\u201d of convolution operators. 
We introduce the IMEXnet that addresses these challenges by adapting semi-implicit methods for partial differential equations. Compared to similar explicit networks, such as residual networks, our network is more stable, which has recently shown to reduce the sensitivity to small changes in the input features and improve generalization. The addition of an implicit step connects all pixels in each channel of the image and therefore addresses the field of view problem while still being comparable to standard convolutions in terms of the number of parameters and computational complexity. We also present a new dataset for semantic segmentation and demonstrate the effectiveness of our architecture using the NYU Depth dataset.}\n}", "pdf": "http://proceedings.mlr.press/v97/haber19a/haber19a.pdf", "supp": "", "pdf_size": 1077080, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12193339308030426247&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Earth, Ocean and Atmospheric Sciences, University of British Columbia, Vancouver, Canada+Xtract AI, Vancouver, Canada; Department of Earth, Ocean and Atmospheric Sciences, University of British Columbia, Vancouver, Canada+Xtract AI, Vancouver, Canada; Department of Computer Science, Ben Gurion University of the Negev, Be\u2019er Sheva, Israel; Departments of Mathematics and Computer Science, Emory University, Atlanta, GA, USA", "aff_domain": "eoas.ubc.ca; ; ; ", "email": "eoas.ubc.ca; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/haber19a.html", "aff_unique_index": "0+1;0+1;2;3", "aff_unique_norm": "University of British Columbia;Xtract AI;Ben Gurion University of the Negev;Emory University", "aff_unique_dep": "Department of Earth, Ocean and Atmospheric Sciences;;Department of Computer Science;Departments of Mathematics and Computer Science", "aff_unique_url": "https://www.ubc.ca;;https://www.bgu.ac.il;https://www.emory.edu", "aff_unique_abbr": "UBC;;BGU;Emory", "aff_campus_unique_index": "0+0;0+0;1;2", "aff_campus_unique": "Vancouver;Be\u2019er Sheva;Atlanta", "aff_country_unique_index": "0+0;0+0;1;2", "aff_country_unique": "Canada;Israel;United States" }, { "title": "Imitating Latent Policies from Observation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3971", "id": "3971", "author_site": "Ashley Edwards, Himanshu Sahni, Yannick Schroecker, Charles Isbell", "author": "Ashley Edwards; Himanshu Sahni; Yannick Schroecker; Charles Isbell", "abstract": "In this paper, we describe a novel approach to imitation learning that infers latent policies directly from state observations. We introduce a method that characterizes the causal effects of latent actions on observations while simultaneously predicting their likelihood. We then outline an action alignment procedure that leverages a small amount of environment interactions to determine a mapping between the latent and real-world actions. We show that this corrected labeling can be used for imitating the observed behavior, even though no expert actions are given. We evaluate our approach within classic control environments and a platform game and demonstrate that it performs better than standard approaches. 
Code for this work is available at https://github.com/ashedwards/ILPO.", "bibtex": "@InProceedings{pmlr-v97-edwards19a,\n title = \t {Imitating Latent Policies from Observation},\n author = {Edwards, Ashley and Sahni, Himanshu and Schroecker, Yannick and Isbell, Charles},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1755--1763},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/edwards19a/edwards19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/edwards19a.html},\n abstract = \t {In this paper, we describe a novel approach to imitation learning that infers latent policies directly from state observations. We introduce a method that characterizes the causal effects of latent actions on observations while simultaneously predicting their likelihood. We then outline an action alignment procedure that leverages a small amount of environment interactions to determine a mapping between the latent and real-world actions. We show that this corrected labeling can be used for imitating the observed behavior, even though no expert actions are given. We evaluate our approach within classic control environments and a platform game and demonstrate that it performs better than standard approaches. Code for this work is available at https://github.com/ashedwards/ILPO.}\n}", "pdf": "http://proceedings.mlr.press/v97/edwards19a/edwards19a.pdf", "supp": "", "pdf_size": 3497760, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16539609081927748607&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Georgia Institute of Technology; Georgia Institute of Technology; Georgia Institute of Technology; Georgia Institute of Technology", "aff_domain": "gatech.edu; ; ; ", "email": "gatech.edu; ; ; ", "github": "https://github.com/ashedwards/ILPO", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/edwards19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Imitation Learning from Imperfect Demonstration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3632", "id": "3632", "author_site": "Yueh-Hua Wu, Nontawat Charoenphakdee, Han Bao, Voot Tangkaratt, Masashi Sugiyama", "author": "Yueh-Hua Wu; Nontawat Charoenphakdee; Han Bao; Voot Tangkaratt; Masashi Sugiyama", "abstract": "Imitation learning (IL) aims to learn an optimal policy from demonstrations. However, such demonstrations are often imperfect since collecting optimal ones is costly. To effectively learn from imperfect demonstrations, we propose a novel approach that utilizes confidence scores, which describe the quality of demonstrations. More specifically, we propose two confidence-based IL methods, namely two-step importance weighting IL (2IWIL) and generative adversarial IL with imperfect demonstration and confidence (IC-GAIL). 
We show that confidence scores given only to a small portion of sub-optimal demonstrations significantly improve the performance of IL both theoretically and empirically.", "bibtex": "@InProceedings{pmlr-v97-wu19a,\n title = \t {Imitation Learning from Imperfect Demonstration},\n author = {Wu, Yueh-Hua and Charoenphakdee, Nontawat and Bao, Han and Tangkaratt, Voot and Sugiyama, Masashi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6818--6827},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wu19a/wu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/wu19a.html},\n abstract = \t {Imitation learning (IL) aims to learn an optimal policy from demonstrations. However, such demonstrations are often imperfect since collecting optimal ones is costly. To effectively learn from imperfect demonstrations, we propose a novel approach that utilizes confidence scores, which describe the quality of demonstrations. More specifically, we propose two confidence-based IL methods, namely two-step importance weighting IL (2IWIL) and generative adversarial IL with imperfect demonstration and confidence (IC-GAIL). We show that confidence scores given only to a small portion of sub-optimal demonstrations significantly improve the performance of IL both theoretically and empirically.}\n}", "pdf": "http://proceedings.mlr.press/v97/wu19a/wu19a.pdf", "supp": "", "pdf_size": 1898777, "gs_citation": 201, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5471437882113161886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "aff": "National Taiwan University, Taiwan+RIKEN Center for Advanced Intelligence Project, Japan+The University of Tokyo, Japan; RIKEN Center for Advanced Intelligence Project, Japan+The University of Tokyo, Japan; RIKEN Center for Advanced Intelligence Project, Japan+The University of Tokyo, Japan; RIKEN Center for Advanced Intelligence Project, Japan; RIKEN Center for Advanced Intelligence Project, Japan+The University of Tokyo, Japan", "aff_domain": "csie.ntu.edu.tw; ; ; ; ", "email": "csie.ntu.edu.tw; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/wu19a.html", "aff_unique_index": "0+1+2;1+2;1+2;1;1+2", "aff_unique_norm": "National Taiwan University;RIKEN Center for Advanced Intelligence Project;University of Tokyo", "aff_unique_dep": ";Center for Advanced Intelligence Project;", "aff_unique_url": "https://www.ntu.edu.tw;https://www.riken.jp/en/c-aip/;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "NTU;RIKEN C-AIP;UTokyo", "aff_campus_unique_index": "0;;;", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0+1+1;1+1;1+1;1;1+1", "aff_country_unique": "China;Japan" }, { "title": "Imperceptible, Robust, and Targeted Adversarial Examples for Automatic Speech Recognition", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4300", "id": "4300", "author_site": "Yao Qin, Nicholas Carlini, Garrison Cottrell, Ian Goodfellow, Colin Raffel", "author": "Yao Qin; Nicholas Carlini; Garrison Cottrell; Ian Goodfellow; Colin Raffel", "abstract": "Adversarial examples are inputs to machine learning models designed by an adversary to cause an incorrect output. So far, adversarial examples have been studied most extensively in the image domain. 
In this domain, adversarial examples can be constructed by imperceptibly modifying images to cause misclassification, and are practical in the physical world. In contrast, current targeted adversarial examples on speech recognition systems have neither of these properties: humans can easily identify the adversarial perturbations, and they are not effective when played over-the-air. This paper makes progress on both of these fronts. First, we develop effectively imperceptible audio adversarial examples (verified through a human study) by leveraging the psychoacoustic principle of auditory masking, while retaining 100% targeted success rate on arbitrary full-sentence targets. Then, we make progress towards physical-world audio adversarial examples by constructing perturbations which remain effective even after applying highly-realistic simulated environmental distortions.", "bibtex": "@InProceedings{pmlr-v97-qin19a,\n title = \t {Imperceptible, Robust, and Targeted Adversarial Examples for Automatic Speech Recognition},\n author = {Qin, Yao and Carlini, Nicholas and Cottrell, Garrison and Goodfellow, Ian and Raffel, Colin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5231--5240},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/qin19a/qin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/qin19a.html},\n abstract = \t {Adversarial examples are inputs to machine learning models designed by an adversary to cause an incorrect output. So far, adversarial examples have been studied most extensively in the image domain. In this domain, adversarial examples can be constructed by imperceptibly modifying images to cause misclassification, and are practical in the physical world. In contrast, current targeted adversarial examples on speech recognition systems have neither of these properties: humans can easily identify the adversarial perturbations, and they are not effective when played over-the-air. This paper makes progress on both of these fronts. First, we develop effectively imperceptible audio adversarial examples (verified through a human study) by leveraging the psychoacoustic principle of auditory masking, while retaining 100% targeted success rate on arbitrary full-sentence targets. 
Then, we make progress towards physical-world audio adversarial examples by constructing perturbations which remain effective even after applying highly-realistic simulated environmental distortions.}\n}", "pdf": "http://proceedings.mlr.press/v97/qin19a/qin19a.pdf", "supp": "", "pdf_size": 357344, "gs_citation": 526, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15576446916953427873&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Department of CSE, University of California, San Diego, USA+Google Brain, USA; Google Brain, USA; Google Brain, USA; Department of CSE, University of California, San Diego, USA; Google Brain, USA", "aff_domain": "eng.ucsd.edu; ; ; ;google.com", "email": "eng.ucsd.edu; ; ; ;google.com", "github": "", "project": "http://cseweb.ucsd.edu/~yaq007/imperceptible-robust-adv.html", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/qin19a.html", "aff_unique_index": "0+1;1;1;0;1", "aff_unique_norm": "University of California, San Diego;Google", "aff_unique_dep": "Department of CSE;Google Brain", "aff_unique_url": "https://www.ucsd.edu;https://brain.google.com", "aff_unique_abbr": "UCSD;Google Brain", "aff_campus_unique_index": "0+1;1;1;0;1", "aff_campus_unique": "San Diego;Mountain View", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Importance Sampling Policy Evaluation with an Estimated Behavior Policy", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4167", "id": "4167", "author_site": "Josiah Hanna, Scott Niekum, Peter Stone", "author": "Josiah Hanna; Scott Niekum; Peter Stone", "abstract": "We consider the problem of off-policy evaluation in Markov decision processes. Off-policy evaluation is the task of evaluating the expected return of one policy with data generated by a different, behavior policy. Importance sampling is a technique for off-policy evaluation that re-weights off-policy returns to account for differences in the likelihood of the returns between the two policies. In this paper, we study importance sampling with an estimated behavior policy where the behavior policy estimate comes from the same set of data used to compute the importance sampling estimate. We find that this estimator often lowers the mean squared error of off-policy evaluation compared to importance sampling with the true behavior policy or using a behavior policy that is estimated from a separate data set. Intuitively, estimating the behavior policy in this way corrects for error due to sampling in the action-space. 
Our empirical results also extend to other popular variants of importance sampling and show that estimating a non-Markovian behavior policy can further lower large-sample mean squared error even when the true behavior policy is Markovian.", "bibtex": "@InProceedings{pmlr-v97-hanna19a,\n title = \t {Importance Sampling Policy Evaluation with an Estimated Behavior Policy},\n author = {Hanna, Josiah and Niekum, Scott and Stone, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2605--2613},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hanna19a/hanna19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hanna19a.html},\n abstract = \t {We consider the problem of off-policy evaluation in Markov decision processes. Off-policy evaluation is the task of evaluating the expected return of one policy with data generated by a different, behavior policy. Importance sampling is a technique for off-policy evaluation that re-weights off-policy returns to account for differences in the likelihood of the returns between the two policies. In this paper, we study importance sampling with an estimated behavior policy where the behavior policy estimate comes from the same set of data used to compute the importance sampling estimate. We find that this estimator often lowers the mean squared error of off-policy evaluation compared to importance sampling with the true behavior policy or using a behavior policy that is estimated from a separate data set. Intuitively, estimating the behavior policy in this way corrects for error due to sampling in the action-space. Our empirical results also extend to other popular variants of importance sampling and show that estimating a non-Markovian behavior policy can further lower large-sample mean squared error even when the true behavior policy is Markovian.}\n}", "pdf": "http://proceedings.mlr.press/v97/hanna19a/hanna19a.pdf", "supp": "", "pdf_size": 2013070, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11718610357007396139&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "aff": "The University of Texas at Austin; The University of Texas at Austin; The University of Texas at Austin", "aff_domain": "cs.utexas.edu; ; ", "email": "cs.utexas.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/hanna19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Improved Convergence for $\\ell_1$ and $\\ell_\u221e$ Regression via Iteratively Reweighted Least Squares", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3597", "id": "3597", "author_site": "Alina Ene, Adrian Vladu", "author": "Alina Ene; Adrian Vladu", "abstract": "The iteratively reweighted least squares method (IRLS) is a popular technique used in practice for solving regression problems. Various versions of this method have been proposed, but their theoretical analyses failed to capture the good practical performance. 
In this paper we propose a simple and natural version of IRLS for solving $\\ell_\\infty$ and $\\ell_1$ regression, which provably converges to a $(1+\\epsilon)$-approximate solution in $O(m^{1/3}\\log(1/\\epsilon)/\\epsilon^{2/3} + \\log m/\\epsilon^2)$ iterations, where $m$ is the number of rows of the input matrix. Interestingly, this running time is independent of the conditioning of the input, and the dominant term of the running time depends sublinearly in $\\epsilon^{-1}$, which is atypical for the optimization of non-smooth functions. This improves upon the more complex algorithms of Chin et al. (ITCS \u201912), and Christiano et al. (STOC \u201911) by a factor of at least $1/\\epsilon^2$, and yields a truly efficient natural algorithm for the slime mold dynamics (Straszak-Vishnoi, SODA \u201916, ITCS \u201916, ITCS \u201917).", "bibtex": "@InProceedings{pmlr-v97-ene19a,\n title = \t {Improved Convergence for $\\ell_1$ and $\\ell_\u221e$ Regression via Iteratively Reweighted Least Squares},\n author = {Ene, Alina and Vladu, Adrian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1794--1801},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ene19a/ene19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ene19a.html},\n abstract = \t {The iteratively reweighted least squares method (IRLS) is a popular technique used in practice for solving regression problems. Various versions of this method have been proposed, but their theoretical analyses failed to capture the good practical performance. In this paper we propose a simple and natural version of IRLS for solving $\\ell_\\infty$ and $\\ell_1$ regression, which provably converges to a $(1+\\epsilon)$-approximate solution in $O(m^{1/3}\\log(1/\\epsilon)/\\epsilon^{2/3} + \\log m/\\epsilon^2)$ iterations, where $m$ is the number of rows of the input matrix. Interestingly, this running time is independent of the conditioning of the input, and the dominant term of the running time depends sublinearly in $\\epsilon^{-1}$, which is atypical for the optimization of non-smooth functions. This improves upon the more complex algorithms of Chin et al. (ITCS \u201912), and Christiano et al. 
(STOC \u201911) by a factor of at least $1/\\epsilon^2$, and yields a truly efficient natural algorithm for the slime mold dynamics (Straszak-Vishnoi, SODA \u201916, ITCS \u201916, ITCS \u201917).}\n}", "pdf": "http://proceedings.mlr.press/v97/ene19a/ene19a.pdf", "supp": "", "pdf_size": 643119, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17699108587741889463&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff": "Boston University, Boston, USA; Boston University, Boston, USA + MIT", "aff_domain": "bu.edu;mit.edu", "email": "bu.edu;mit.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/ene19a.html", "aff_unique_index": "0;0+1", "aff_unique_norm": "Boston University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.bu.edu;https://web.mit.edu", "aff_unique_abbr": "BU;MIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Boston;", "aff_country_unique_index": "0;0+0", "aff_country_unique": "United States" }, { "title": "Improved Dynamic Graph Learning through Fault-Tolerant Sparsification", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4263", "id": "4263", "author_site": "Chunjiang Zhu, Sabine Storandt, Kam-Yiu Lam, Song Han, Jinbo Bi", "author": "Chunjiang Zhu; Sabine Storandt; Kam-Yiu Lam; Song Han; Jinbo Bi", "abstract": "Graph sparsification has been used to improve the computational cost of learning over graphs, e.g., Laplacian-regularized estimation and graph semi-supervised learning (SSL). However, when graphs vary over time, repeated sparsification requires polynomial order computational cost per update. We propose a new type of graph sparsification namely fault-tolerant (FT) sparsification to significantly reduce the cost to only a constant. Then the computational cost of subsequent graph learning tasks can be significantly improved with limited loss in their accuracy. In particular, we give theoretical analyze to upper bound the loss in the accuracy of the subsequent Laplacian-regularized estimation and graph SSL, due to the FT sparsification. In addition, FT spectral sparsification can be generalized to FT cut sparsification, for cut-based graph learning. Extensive experiments have confirmed the computational efficiencies and accuracies of the proposed methods for learning on dynamic graphs.", "bibtex": "@InProceedings{pmlr-v97-zhu19b,\n title = \t {Improved Dynamic Graph Learning through Fault-Tolerant Sparsification},\n author = {Zhu, Chunjiang and Storandt, Sabine and Lam, Kam-Yiu and Han, Song and Bi, Jinbo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7624--7633},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhu19b/zhu19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhu19b.html},\n abstract = \t {Graph sparsification has been used to improve the computational cost of learning over graphs, e.g., Laplacian-regularized estimation and graph semi-supervised learning (SSL). However, when graphs vary over time, repeated sparsification requires polynomial order computational cost per update. We propose a new type of graph sparsification namely fault-tolerant (FT) sparsification to significantly reduce the cost to only a constant. 
Then the computational cost of subsequent graph learning tasks can be significantly improved with limited loss in their accuracy. In particular, we give theoretical analyze to upper bound the loss in the accuracy of the subsequent Laplacian-regularized estimation and graph SSL, due to the FT sparsification. In addition, FT spectral sparsification can be generalized to FT cut sparsification, for cut-based graph learning. Extensive experiments have confirmed the computational efficiencies and accuracies of the proposed methods for learning on dynamic graphs.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhu19b/zhu19b.pdf", "supp": "", "pdf_size": 355625, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13990891507194635523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": "University of Connecticut; University of Konstanz; City University of Hong Kong; University of Connecticut; University of Connecticut", "aff_domain": "uconn.edu;uni-konstanz.de;cityu.edu.hk;uconn.edu;uconn.edu", "email": "uconn.edu;uni-konstanz.de;cityu.edu.hk;uconn.edu;uconn.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/zhu19b.html", "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "University of Connecticut;University of Konstanz;City University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uconn.edu;https://www.uni-konstanz.de;https://www.cityu.edu.hk", "aff_unique_abbr": "UConn;Uni Konstanz;CityU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;1;2;0;0", "aff_country_unique": "United States;Germany;China" }, { "title": "Improved Parallel Algorithms for Density-Based Network Clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3626", "id": "3626", "author_site": "Mohsen Ghaffari, Silvio Lattanzi, Slobodan Mitrovi\u0107", "author": "Mohsen Ghaffari; Silvio Lattanzi; Slobodan Mitrovi\u0107", "abstract": "Clustering large-scale networks is a central topic in unsupervised learning with many applications in machine learning and data mining. A classic approach to cluster a network is to identify regions of high edge density, which in the literature is captured by two fundamental problems: the densest subgraph and the $k$-core decomposition problems. We design massively parallel computation (MPC) algorithms for these problems that are considerably faster than prior work. In the case of $k$-core decomposition, our work improves exponentially on the algorithm provided by Esfandiari et al.\u00a0(ICML\u201918). Compared to the prior work on densest subgraph presented by Bahmani et al.\u00a0(VLDB\u201912, \u201914), our result requires quadratically fewer MPC rounds. 
We complement our analysis with an experimental scalability analysis of our techniques.", "bibtex": "@InProceedings{pmlr-v97-ghaffari19a,\n title = \t {Improved Parallel Algorithms for Density-Based Network Clustering},\n author = {Ghaffari, Mohsen and Lattanzi, Silvio and Mitrovi{\\'c}, Slobodan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2201--2210},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ghaffari19a/ghaffari19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ghaffari19a.html},\n abstract = \t {Clustering large-scale networks is a central topic in unsupervised learning with many applications in machine learning and data mining. A classic approach to cluster a network is to identify regions of high edge density, which in the literature is captured by two fundamental problems: the densest subgraph and the $k$-core decomposition problems. We design massively parallel computation (MPC) algorithms for these problems that are considerably faster than prior work. In the case of $k$-core decomposition, our work improves exponentially on the algorithm provided by Esfandiari et al.\u00a0(ICML\u201918). Compared to the prior work on densest subgraph presented by Bahmani et al.\u00a0(VLDB\u201912, \u201914), our result requires quadratically fewer MPC rounds. We complement our analysis with an experimental scalability analysis of our techniques.}\n}", "pdf": "http://proceedings.mlr.press/v97/ghaffari19a/ghaffari19a.pdf", "supp": "", "pdf_size": 410688, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1342868707425879850&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "ETH Zurich; Google Research Zurich; MIT", "aff_domain": "inf.ethz.ch;google.com;mit.edu", "email": "inf.ethz.ch;google.com;mit.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ghaffari19a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "ETH Zurich;Google;Massachusetts Institute of Technology", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.ethz.ch;https://research.google;https://web.mit.edu", "aff_unique_abbr": "ETHZ;Google;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Zurich", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Switzerland;United States" }, { "title": "Improved Zeroth-Order Variance Reduced Algorithms and Analysis for Nonconvex Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3691", "id": "3691", "author_site": "Kaiyi Ji, Zhe Wang, Yi Zhou, Yingbin LIANG", "author": "Kaiyi Ji; Zhe Wang; Yi Zhou; Yingbin Liang", "abstract": "Two types of zeroth-order stochastic algorithms have recently been designed for nonconvex optimization respectively based on the first-order techniques SVRG and SARAH/SPIDER. This paper addresses several important issues that are still open in these methods. First, all existing SVRG-type zeroth-order algorithms suffer from worse function query complexities than either zeroth-order gradient descent (ZO-GD) or stochastic gradient descent (ZO-SGD). In this paper, we propose a new algorithm ZO-SVRG-Coord-Rand and develop a new analysis for an existing ZO-SVRG-Coord algorithm proposed in Liu et al. 
2018b, and show that both ZO-SVRG-Coord-Rand and ZO-SVRG-Coord (under our new analysis) outperform other exiting SVRG-type zeroth-order methods as well as ZO-GD and ZO-SGD. Second, the existing SPIDER-type algorithm SPIDER-SZO (Fang et al., 2018) has superior theoretical performance, but suffers from the generation of a large number of Gaussian random variables as well as a $\\sqrt{\\epsilon}$-level stepsize in practice. In this paper, we develop a new algorithm ZO-SPIDER-Coord, which is free from Gaussian variable generation and allows a large constant stepsize while maintaining the same convergence rate and query complexity, and we further show that ZO-SPIDER-Coord automatically achieves a linear convergence rate as the iterate enters into a local PL region without restart and algorithmic modification.", "bibtex": "@InProceedings{pmlr-v97-ji19a,\n title = \t {Improved Zeroth-Order Variance Reduced Algorithms and Analysis for Nonconvex Optimization},\n author = {Ji, Kaiyi and Wang, Zhe and Zhou, Yi and Liang, Yingbin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3100--3109},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ji19a/ji19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ji19a.html},\n abstract = \t {Two types of zeroth-order stochastic algorithms have recently been designed for nonconvex optimization respectively based on the first-order techniques SVRG and SARAH/SPIDER. This paper addresses several important issues that are still open in these methods. First, all existing SVRG-type zeroth-order algorithms suffer from worse function query complexities than either zeroth-order gradient descent (ZO-GD) or stochastic gradient descent (ZO-SGD). In this paper, we propose a new algorithm ZO-SVRG-Coord-Rand and develop a new analysis for an existing ZO-SVRG-Coord algorithm proposed in Liu et al. 2018b, and show that both ZO-SVRG-Coord-Rand and ZO-SVRG-Coord (under our new analysis) outperform other exiting SVRG-type zeroth-order methods as well as ZO-GD and ZO-SGD. Second, the existing SPIDER-type algorithm SPIDER-SZO (Fang et al., 2018) has superior theoretical performance, but suffers from the generation of a large number of Gaussian random variables as well as a $\\sqrt{\\epsilon}$-level stepsize in practice. 
In this paper, we develop a new algorithm ZO-SPIDER-Coord, which is free from Gaussian variable generation and allows a large constant stepsize while maintaining the same convergence rate and query complexity, and we further show that ZO-SPIDER-Coord automatically achieves a linear convergence rate as the iterate enters into a local PL region without restart and algorithmic modification.}\n}", "pdf": "http://proceedings.mlr.press/v97/ji19a/ji19a.pdf", "supp": "", "pdf_size": 1237899, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10262997724629185505&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, The Ohio State University; Department of Electrical and Computer Engineering, The Ohio State University; Department of Electrical and Computer Engineering, Duke University; Department of Electrical and Computer Engineering, The Ohio State University", "aff_domain": "osu.edu; ; ; ", "email": "osu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/ji19a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Ohio State University;Duke University", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.osu.edu;https://www.duke.edu", "aff_unique_abbr": "OSU;Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Improving Adversarial Robustness via Promoting Ensemble Diversity", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3614", "id": "3614", "author_site": "Tianyu Pang, Kun Xu, Chao Du, Ning Chen, Jun Zhu", "author": "Tianyu Pang; Kun Xu; Chao Du; Ning Chen; Jun Zhu", "abstract": "Though deep neural networks have achieved significant progress on various tasks, often enhanced by model ensemble, existing high-performance models can be vulnerable to adversarial attacks. Many efforts have been devoted to enhancing the robustness of individual networks and then constructing a straightforward ensemble, e.g., by directly averaging the outputs, which ignores the interaction among networks. This paper presents a new method that explores the interaction among individual networks to improve robustness for ensemble models. Technically, we define a new notion of ensemble diversity in the adversarial setting as the diversity among non-maximal predictions of individual members, and present an adaptive diversity promoting (ADP) regularizer to encourage the diversity, which leads to globally better robustness for the ensemble by making adversarial examples difficult to transfer among individual members. Our method is computationally efficient and compatible with the defense methods acting on individual networks. 
Empirical results on various datasets verify that our method can improve adversarial robustness while maintaining state-of-the-art accuracy on normal examples.", "bibtex": "@InProceedings{pmlr-v97-pang19a,\n title = \t {Improving Adversarial Robustness via Promoting Ensemble Diversity},\n author = {Pang, Tianyu and Xu, Kun and Du, Chao and Chen, Ning and Zhu, Jun},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4970--4979},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/pang19a/pang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/pang19a.html},\n abstract = \t {Though deep neural networks have achieved significant progress on various tasks, often enhanced by model ensemble, existing high-performance models can be vulnerable to adversarial attacks. Many efforts have been devoted to enhancing the robustness of individual networks and then constructing a straightforward ensemble, e.g., by directly averaging the outputs, which ignores the interaction among networks. This paper presents a new method that explores the interaction among individual networks to improve robustness for ensemble models. Technically, we define a new notion of ensemble diversity in the adversarial setting as the diversity among non-maximal predictions of individual members, and present an adaptive diversity promoting (ADP) regularizer to encourage the diversity, which leads to globally better robustness for the ensemble by making adversarial examples difficult to transfer among individual members. Our method is computationally efficient and compatible with the defense methods acting on individual networks. 
Empirical results on various datasets verify that our method can improve adversarial robustness while maintaining state-of-the-art accuracy on normal examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/pang19a/pang19a.pdf", "supp": "", "pdf_size": 516000, "gs_citation": 544, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16568032932303177237&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science and Technology, Institute for AI, BNRist Center, THBI Lab, Tsinghua-Fuzhou Institute for Data Technology, Tsinghua University, Beijing, China; Department of Computer Science and Technology, Institute for AI, BNRist Center, THBI Lab, Tsinghua-Fuzhou Institute for Data Technology, Tsinghua University, Beijing, China; Department of Computer Science and Technology, Institute for AI, BNRist Center, THBI Lab, Tsinghua-Fuzhou Institute for Data Technology, Tsinghua University, Beijing, China; Department of Computer Science and Technology, Institute for AI, BNRist Center, THBI Lab, Tsinghua-Fuzhou Institute for Data Technology, Tsinghua University, Beijing, China; Department of Computer Science and Technology, Institute for AI, BNRist Center, THBI Lab, Tsinghua-Fuzhou Institute for Data Technology, Tsinghua University, Beijing, China", "aff_domain": "mails.tsinghua.edu.cn; ; ;tsinghua.edu.cn;tsinghua.edu.cn", "email": "mails.tsinghua.edu.cn; ; ;tsinghua.edu.cn;tsinghua.edu.cn", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/pang19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Department of Computer Science and Technology", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Improving Model Selection by Employing the Test Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3649", "id": "3649", "author_site": "Max Westphal, Werner Brannath", "author": "Max Westphal; Werner Brannath", "abstract": "Model selection and evaluation are usually strictly separated by means of data splitting to enable an unbiased estimation and a simple statistical inference for the unknown generalization performance of the final prediction model. We investigate the properties of novel evaluation strategies, namely when the final model is selected based on empirical performances on the test data. To guard against selection induced overoptimism, we employ a parametric multiple test correction based on the approximate multivariate distribution of performance estimates. Our numerical experiments involve training common machine learning algorithms (EN, CART, SVM, XGB) on various artificial classification tasks. At its core, our proposed approach improves model selection in terms of the expected final model performance without introducing overoptimism. 
We furthermore observed a higher probability for a successful evaluation study, making it easier in practice to empirically demonstrate a sufficiently high predictive performance.", "bibtex": "@InProceedings{pmlr-v97-westphal19a,\n title = \t {Improving Model Selection by Employing the Test Data},\n author = {Westphal, Max and Brannath, Werner},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6747--6756},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/westphal19a/westphal19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/westphal19a.html},\n abstract = \t {Model selection and evaluation are usually strictly separated by means of data splitting to enable an unbiased estimation and a simple statistical inference for the unknown generalization performance of the final prediction model. We investigate the properties of novel evaluation strategies, namely when the final model is selected based on empirical performances on the test data. To guard against selection induced overoptimism, we employ a parametric multiple test correction based on the approximate multivariate distribution of performance estimates. Our numerical experiments involve training common machine learning algorithms (EN, CART, SVM, XGB) on various artificial classification tasks. At its core, our proposed approach improves model selection in terms of the expected final model performance without introducing overoptimism. We furthermore observed a higher probability for a successful evaluation study, making it easier in practice to empirically demonstrate a sufficiently high predictive performance.}\n}", "pdf": "http://proceedings.mlr.press/v97/westphal19a/westphal19a.pdf", "supp": "", "pdf_size": 832459, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4879274871396256825&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Institute for Statistics, Faculty 3: Mathematics and Computer Science, University of Bremen, Bremen, Germany; Institute for Statistics, Faculty 3: Mathematics and Computer Science, University of Bremen, Bremen, Germany", "aff_domain": "uni-bremen.de; ", "email": "uni-bremen.de; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/westphal19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Bremen", "aff_unique_dep": "Institute for Statistics", "aff_unique_url": "https://www.uni-bremen.de", "aff_unique_abbr": "", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Bremen", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Improving Neural Language Modeling via Adversarial Training", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3888", "id": "3888", "author_site": "Dilin Wang, Chengyue Gong, Qiang Liu", "author": "Dilin Wang; Chengyue Gong; Qiang Liu", "abstract": "Recently, substantial progress has been made in language modeling by using deep neural networks. However, in practice, large scale neural language models have been shown to be prone to overfitting. In this paper, we present a simple yet highly effective adversarial training mechanism for regularizing neural language models. The idea is to introduce adversarial noise to the output embedding layer while training the models. 
We show that the optimal adversarial noise yields a simple closed form solution, thus allowing us to develop a simple and time efficient algorithm. Theoretically, we show that our adversarial mechanism effectively encourages the diversity of the embedding vectors, helping to increase the robustness of models. Empirically, we show that our method improves on the single model state-of-the-art results for language modeling on Penn Treebank (PTB) and Wikitext-2, achieving test perplexity scores of 46.01 and 38.65, respectively. When applied to machine translation, our method improves over various transformer-based translation baselines in BLEU scores on the WMT14 English-German and IWSLT14 German-English tasks.", "bibtex": "@InProceedings{pmlr-v97-wang19f,\n title = \t {Improving Neural Language Modeling via Adversarial Training},\n author = {Wang, Dilin and Gong, Chengyue and Liu, Qiang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6555--6565},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19f/wang19f.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19f.html},\n abstract = \t {Recently, substantial progress has been made in language modeling by using deep neural networks. However, in practice, large scale neural language models have been shown to be prone to overfitting. In this paper, we present a simple yet highly effective adversarial training mechanism for regularizing neural language models. The idea is to introduce adversarial noise to the output embedding layer while training the models. We show that the optimal adversarial noise yields a simple closed form solution, thus allowing us to develop a simple and time efficient algorithm. Theoretically, we show that our adversarial mechanism effectively encourages the diversity of the embedding vectors, helping to increase the robustness of models. Empirically, we show that our method improves on the single model state-of-the-art results for language modeling on Penn Treebank (PTB) and Wikitext-2, achieving test perplexity scores of 46.01 and 38.65, respectively. 
When applied to machine translation, our method improves over various transformer-based translation baselines in BLEU scores on the WMT14 English-German and IWSLT14 German-English tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19f/wang19f.pdf", "supp": "", "pdf_size": 2359948, "gs_citation": 123, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13673209609848344447&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, UT Austin; Department of Computer Science, UT Austin; Department of Computer Science, UT Austin", "aff_domain": "cs.utexas.edu;cs.utexas.edu; ", "email": "cs.utexas.edu;cs.utexas.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/wang19f.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Improving Neural Network Quantization without Retraining using Outlier Channel Splitting", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3981", "id": "3981", "author_site": "Ritchie Zhao, Yuwei Hu, Jordan Dotzel, Christopher De Sa, Zhiru Zhang", "author": "Ritchie Zhao; Yuwei Hu; Jordan Dotzel; Chris De Sa; Zhiru Zhang", "abstract": "Quantization can improve the execution latency and energy efficiency of neural networks on both commodity GPUs and specialized accelerators. The majority of existing literature focuses on training quantized DNNs, while this work examines the less-studied topic of quantizing a floating-point model without (re)training. DNN weights and activations follow a bell-shaped distribution post-training, while practical hardware uses a linear quantization grid. This leads to challenges in dealing with outliers in the distribution. Prior work has addressed this by clipping the outliers or using specialized hardware. In this work, we propose outlier channel splitting (OCS), which duplicates channels containing outliers, then halves the channel values. The network remains functionally identical, but affected outliers are moved toward the center of the distribution. OCS requires no additional training and works on commodity hardware. Experimental evaluation on ImageNet classification and language modeling shows that OCS can outperform state-of-the-art clipping techniques with only minor overhead.", "bibtex": "@InProceedings{pmlr-v97-zhao19c,\n title = \t {Improving Neural Network Quantization without Retraining using Outlier Channel Splitting},\n author = {Zhao, Ritchie and Hu, Yuwei and Dotzel, Jordan and De Sa, Chris and Zhang, Zhiru},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7543--7552},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhao19c/zhao19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhao19c.html},\n abstract = \t {Quantization can improve the execution latency and energy efficiency of neural networks on both commodity GPUs and specialized accelerators. 
The majority of existing literature focuses on training quantized DNNs, while this work examines the less-studied topic of quantizing a floating-point model without (re)training. DNN weights and activations follow a bell-shaped distribution post-training, while practical hardware uses a linear quantization grid. This leads to challenges in dealing with outliers in the distribution. Prior work has addressed this by clipping the outliers or using specialized hardware. In this work, we propose outlier channel splitting (OCS), which duplicates channels containing outliers, then halves the channel values. The network remains functionally identical, but affected outliers are moved toward the center of the distribution. OCS requires no additional training and works on commodity hardware. Experimental evaluation on ImageNet classification and language modeling shows that OCS can outperform state-of-the-art clipping techniques with only minor overhead.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhao19c/zhao19c.pdf", "supp": "", "pdf_size": 429420, "gs_citation": 392, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6088865890786611447&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Cornell University; Cornell University; Cornell University; Cornell University; Cornell University", "aff_domain": "cornell.edu; ; ; ; ", "email": "cornell.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/zhao19c.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Imputing Missing Events in Continuous-Time Event Streams", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3675", "id": "3675", "author_site": "Hongyuan Mei, Guanghui Qin, Jason Eisner", "author": "Hongyuan Mei; Guanghui Qin; Jason Eisner", "abstract": "Events in the world may be caused by other, unobserved events. We consider sequences of events in continuous time. Given a probability model of complete sequences, we propose particle smoothing\u2014a form of sequential importance sampling\u2014to impute the missing events in an incomplete sequence. We develop a trainable family of proposal distributions based on a type of bidirectional continuous-time LSTM: Bidirectionality lets the proposals condition on future observations, not just on the past as in particle filtering. Our method can sample an ensemble of possible complete sequences (particles), from which we form a single consensus prediction that has low Bayes risk under our chosen loss metric. We experiment in multiple synthetic and real domains, using different missingness mechanisms, and modeling the complete sequences in each domain with a neural Hawkes process (Mei & Eisner 2017). 
On held-out incomplete sequences, our method is effective at inferring the ground-truth unobserved events, with particle smoothing consistently improving upon particle filtering.", "bibtex": "@InProceedings{pmlr-v97-mei19a,\n title = \t {Imputing Missing Events in Continuous-Time Event Streams},\n author = {Mei, Hongyuan and Qin, Guanghui and Eisner, Jason},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4475--4485},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mei19a/mei19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mei19a.html},\n abstract = \t {Events in the world may be caused by other, unobserved events. We consider sequences of events in continuous time. Given a probability model of complete sequences, we propose particle smoothing\u2014a form of sequential importance sampling\u2014to impute the missing events in an incomplete sequence. We develop a trainable family of proposal distributions based on a type of bidirectional continuous-time LSTM: Bidirectionality lets the proposals condition on future observations, not just on the past as in particle filtering. Our method can sample an ensemble of possible complete sequences (particles), from which we form a single consensus prediction that has low Bayes risk under our chosen loss metric. We experiment in multiple synthetic and real domains, using different missingness mechanisms, and modeling the complete sequences in each domain with a neural Hawkes process (Mei & Eisner 2017). On held-out incomplete sequences, our method is effective at inferring the ground-truth unobserved events, with particle smoothing consistently improving upon particle filtering.}\n}", "pdf": "http://proceedings.mlr.press/v97/mei19a/mei19a.pdf", "supp": "", "pdf_size": 1666424, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8012453208848277577&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Johns Hopkins University, USA; Department of Physics, Peking University, China; Department of Computer Science, Johns Hopkins University, USA", "aff_domain": "cs.jhu.edu; ; ", "email": "cs.jhu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mei19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Johns Hopkins University;Peking University", "aff_unique_dep": "Department of Computer Science;Department of Physics", "aff_unique_url": "https://www.jhu.edu;http://www.pku.edu.cn", "aff_unique_abbr": "JHU;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;China" }, { "title": "Incorporating Grouping Information into Bayesian Decision Tree Ensembles", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4149", "id": "4149", "author_site": "JUNLIANG DU, Antonio Linero", "author": "Junliang Du; Antonio Linero", "abstract": "We consider the problem of nonparametric regression in the high-dimensional setting in which $P \\gg N$. We study the use of overlapping group structures to improve prediction and variable selection. These structures arise commonly when analyzing DNA microarray data, where genes can naturally be grouped according to genetic pathways. 
We incorporate overlapping group structure into a Bayesian additive regression trees model using a prior constructed so that, if a variable from some group is used to construct a split, this increases the probability that subsequent splits will use predictors from the same group. We refer to our model as an overlapping group Bayesian additive regression trees (OG-BART) model, and our prior on the splits an overlapping group Dirichlet (OG-Dirichlet) prior. Like the sparse group lasso, our prior encourages sparsity both within and between groups. We study the correlation structure of the prior, illustrate the proposed methodology on simulated data, and apply the methodology to gene expression data to learn which genetic pathways are predictive of breast cancer tumor metastasis.", "bibtex": "@InProceedings{pmlr-v97-du19d,\n title = \t {Incorporating Grouping Information into {B}ayesian Decision Tree Ensembles},\n author = {Du, Junliang and Linero, Antonio},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1686--1695},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/du19d/du19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/du19d.html},\n abstract = \t {We consider the problem of nonparametric regression in the high-dimensional setting in which $P \\gg N$. We study the use of overlapping group structures to improve prediction and variable selection. These structures arise commonly when analyzing DNA microarray data, where genes can naturally be grouped according to genetic pathways. We incorporate overlapping group structure into a Bayesian additive regression trees model using a prior constructed so that, if a variable from some group is used to construct a split, this increases the probability that subsequent splits will use predictors from the same group. We refer to our model as an overlapping group Bayesian additive regression trees (OG-BART) model, and our prior on the splits an overlapping group Dirichlet (OG-Dirichlet) prior. Like the sparse group lasso, our prior encourages sparsity both within and between groups. 
We study the correlation structure of the prior, illustrate the proposed methodology on simulated data, and apply the methodology to gene expression data to learn which genetic pathways are predictive of breast cancer tumor metastasis.}\n}", "pdf": "http://proceedings.mlr.press/v97/du19d/du19d.pdf", "supp": "", "pdf_size": 458816, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14393597959868510710&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Statistics, Florida State University; Department of Statistics, Florida State University", "aff_domain": "stat.fsu.edu;stat.fsu.edu", "email": "stat.fsu.edu;stat.fsu.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/du19d.html", "aff_unique_index": "0;0", "aff_unique_norm": "Florida State University", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.fsu.edu", "aff_unique_abbr": "FSU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Tallahassee", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Incremental Randomized Sketching for Online Kernel Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3922", "id": "3922", "author_site": "Xiao Zhang, Shizhong Liao", "author": "Xiao Zhang; Shizhong Liao", "abstract": "Randomized sketching has been used in offline kernel learning, but it cannot be applied directly to online kernel learning due to the lack of incremental maintenances for randomized sketches with regret guarantees. To address these issues, we propose a novel incremental randomized sketching approach for online kernel learning, which has efficient incremental maintenances with theoretical guarantees. We construct two incremental randomized sketches using the sparse transform matrix and the sampling matrix for kernel matrix approximation, update the incremental randomized sketches using rank-$1$ modifications, and construct an time-varying explicit feature mapping for online kernel learning. We prove that the proposed incremental randomized sketching is statistically unbiased for the matrix product approximation, obtains a $1 + \\epsilon$ relative-error bound for the kernel matrix approximation, enjoys a sublinear regret bound for online kernel learning, and has constant time and space complexities at each round for incremental maintenances. Experimental results demonstrate that the incremental randomized sketching achieves a better learning performance in terms of accuracy and efficiency even in adversarial environments.", "bibtex": "@InProceedings{pmlr-v97-zhang19h,\n title = \t {Incremental Randomized Sketching for Online Kernel Learning},\n author = {Zhang, Xiao and Liao, Shizhong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7394--7403},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19h/zhang19h.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19h.html},\n abstract = \t {Randomized sketching has been used in offline kernel learning, but it cannot be applied directly to online kernel learning due to the lack of incremental maintenances for randomized sketches with regret guarantees. 
To address these issues, we propose a novel incremental randomized sketching approach for online kernel learning, which has efficient incremental maintenances with theoretical guarantees. We construct two incremental randomized sketches using the sparse transform matrix and the sampling matrix for kernel matrix approximation, update the incremental randomized sketches using rank-$1$ modifications, and construct an time-varying explicit feature mapping for online kernel learning. We prove that the proposed incremental randomized sketching is statistically unbiased for the matrix product approximation, obtains a $1 + \\epsilon$ relative-error bound for the kernel matrix approximation, enjoys a sublinear regret bound for online kernel learning, and has constant time and space complexities at each round for incremental maintenances. Experimental results demonstrate that the incremental randomized sketching achieves a better learning performance in terms of accuracy and efficiency even in adversarial environments.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19h/zhang19h.pdf", "supp": "", "pdf_size": 548980, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4011519096128142998&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "College of Intelligence and Computing, Tianjin University, Tianjin 300350, China; College of Intelligence and Computing, Tianjin University, Tianjin 300350, China", "aff_domain": "tju.edu.cn;tju.edu.cn", "email": "tju.edu.cn;tju.edu.cn", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/zhang19h.html", "aff_unique_index": "0;0", "aff_unique_norm": "Tianjin University", "aff_unique_dep": "College of Intelligence and Computing", "aff_unique_url": "http://www.tju.edu.cn", "aff_unique_abbr": "Tianjin University", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Tianjin", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Inference and Sampling of $K_33$-free Ising Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4294", "id": "4294", "author_site": "Valerii Likhosherstov, Yury Maximov, Misha Chertkov", "author": "Valerii Likhosherstov; Yury Maximov; Misha Chertkov", "abstract": "We call an Ising model tractable when it is possible to compute its partition function value (statistical inference) in polynomial time. The tractability also implies an ability to sample configurations of this model in polynomial time. The notion of tractability extends the basic case of planar zero-field Ising models. Our starting point is to describe algorithms for the basic case, computing partition function and sampling efficiently. Then, we extend our tractable inference and sampling algorithms to models whose triconnected components are either planar or graphs of $O(1)$ size. 
In particular, it results in a polynomial-time inference and sampling algorithms for $K_{33}$ (minor)-free topologies of zero-field Ising models\u2014a generalization of planar graphs with a potentially unbounded genus.", "bibtex": "@InProceedings{pmlr-v97-likhosherstov19a,\n title = \t {Inference and Sampling of $K_{33}$-free Ising Models},\n author = {Likhosherstov, Valerii and Maximov, Yury and Chertkov, Misha},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3963--3972},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/likhosherstov19a/likhosherstov19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/likhosherstov19a.html},\n abstract = \t {We call an Ising model tractable when it is possible to compute its partition function value (statistical inference) in polynomial time. The tractability also implies an ability to sample configurations of this model in polynomial time. The notion of tractability extends the basic case of planar zero-field Ising models. Our starting point is to describe algorithms for the basic case, computing partition function and sampling efficiently. Then, we extend our tractable inference and sampling algorithms to models whose triconnected components are either planar or graphs of $O(1)$ size. In particular, it results in a polynomial-time inference and sampling algorithms for $K_{33}$ (minor)-free topologies of zero-field Ising models\u2014a generalization of planar graphs with a potentially unbounded genus.}\n}", "pdf": "http://proceedings.mlr.press/v97/likhosherstov19a/likhosherstov19a.pdf", "supp": "", "pdf_size": 482779, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5307370996995361997&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Skolkovo Institute of Science and Technology, Moscow, Russia+Theoretical Division and Center for Nonlinear Studies, Los Alamos National Laboratory, Los Alamos, NM, USA+Graduate Program in Applied Mathematics, University of Arizona, Tucson, AZ, USA; Skolkovo Institute of Science and Technology, Moscow, Russia+Theoretical Division and Center for Nonlinear Studies, Los Alamos National Laboratory, Los Alamos, NM, USA; Theoretical Division and Center for Nonlinear Studies, Los Alamos National Laboratory, Los Alamos, NM, USA+Graduate Program in Applied Mathematics, University of Arizona, Tucson, AZ, USA", "aff_domain": "skoltech.ru; ; ", "email": "skoltech.ru; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/likhosherstov19a.html", "aff_unique_index": "0+1+2;0+1;1+2", "aff_unique_norm": "Skolkovo Institute of Science and Technology;Los Alamos National Laboratory;University of Arizona", "aff_unique_dep": ";Theoretical Division and Center for Nonlinear Studies;Graduate Program in Applied Mathematics", "aff_unique_url": "https://www.skoltech.ru;https://www.lanl.gov;https://www.arizona.edu", "aff_unique_abbr": "Skoltech;LANL;UA", "aff_campus_unique_index": "0+1+2;0+1;1+2", "aff_campus_unique": "Moscow;Los Alamos;Tucson", "aff_country_unique_index": "0+1+1;0+1;1+1", "aff_country_unique": "Russian Federation;United States" }, { "title": "Inferring Heterogeneous Causal Effects in Presence of Spatial Confounding", "status": "Oral", "track": "main", "site": 
"https://icml.cc/virtual/2019/poster/3804", "id": "3804", "author_site": "Muhammad Osama, Dave Zachariah, Thomas Sch\u00f6n", "author": "Muhammad Osama; Dave Zachariah; Thomas B. Sch\u00f6n", "abstract": "We address the problem of inferring the causal effect of an exposure on an outcome across space, using observational data. The data is possibly subject to unmeasured confounding variables which, in a standard approach, must be adjusted for by estimating a nuisance function. Here we develop a method that eliminates the nuisance function, while mitigating the resulting errors-in-variables. The result is a robust and accurate inference method for spatially varying heterogeneous causal effects. The properties of the method are demonstrated on synthetic as well as real data from Germany and the US.", "bibtex": "@InProceedings{pmlr-v97-osama19a,\n title = \t {Inferring Heterogeneous Causal Effects in Presence of Spatial Confounding},\n author = {Osama, Muhammad and Zachariah, Dave and Sch{\\\"o}n, Thomas B.},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4942--4950},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/osama19a/osama19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/osama19a.html},\n abstract = \t {We address the problem of inferring the causal effect of an exposure on an outcome across space, using observational data. The data is possibly subject to unmeasured confounding variables which, in a standard approach, must be adjusted for by estimating a nuisance function. Here we develop a method that eliminates the nuisance function, while mitigating the resulting errors-in-variables. The result is a robust and accurate inference method for spatially varying heterogeneous causal effects. 
The properties of the method are demonstrated on synthetic as well as real data from Germany and the US.}\n}", "pdf": "http://proceedings.mlr.press/v97/osama19a/osama19a.pdf", "supp": "", "pdf_size": 3745384, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14041258610340953811&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Division of System and Control, Department of Information Technology, Uppsala University; Division of System and Control, Department of Information Technology, Uppsala University; Division of System and Control, Department of Information Technology, Uppsala University", "aff_domain": "it.uu.se;it.uu.se; ", "email": "it.uu.se;it.uu.se; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/osama19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Uppsala University", "aff_unique_dep": "Division of System and Control, Department of Information Technology", "aff_unique_url": "https://www.uu.se", "aff_unique_abbr": "UU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Sweden" }, { "title": "Infinite Mixture Prototypes for Few-shot Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4256", "id": "4256", "author_site": "Kelsey Allen, Evan Shelhamer, Hanul Shin, Josh Tenenbaum", "author": "Kelsey Allen; Evan Shelhamer; Hanul Shin; Joshua Tenenbaum", "abstract": "We propose infinite mixture prototypes to adaptively represent both simple and complex data distributions for few-shot learning. Infinite mixture prototypes combine deep representation learning with Bayesian nonparametrics, representing each class by a set of clusters, unlike existing prototypical methods that represent each class by a single cluster. By inferring the number of clusters, infinite mixture prototypes interpolate between nearest neighbor and prototypical representations in a learned feature space, which improves accuracy and robustness in the few-shot regime. We show the importance of adaptive capacity for capturing complex data distributions such as super-classes (like alphabets in character recognition), with 10-25% absolute accuracy improvements over prototypical networks, while still maintaining or improving accuracy on standard few-shot learning benchmarks. By clustering labeled and unlabeled data with the same rule, infinite mixture prototypes achieve state-of-the-art semi-supervised accuracy, and can perform purely unsupervised clustering, unlike existing fully- and semi-supervised prototypical methods.", "bibtex": "@InProceedings{pmlr-v97-allen19b,\n title = \t {Infinite Mixture Prototypes for Few-shot Learning},\n author = {Allen, Kelsey and Shelhamer, Evan and Shin, Hanul and Tenenbaum, Joshua},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {232--241},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/allen19b/allen19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/allen19b.html},\n abstract = \t {We propose infinite mixture prototypes to adaptively represent both simple and complex data distributions for few-shot learning. 
Infinite mixture prototypes combine deep representation learning with Bayesian nonparametrics, representing each class by a set of clusters, unlike existing prototypical methods that represent each class by a single cluster. By inferring the number of clusters, infinite mixture prototypes interpolate between nearest neighbor and prototypical representations in a learned feature space, which improves accuracy and robustness in the few-shot regime. We show the importance of adaptive capacity for capturing complex data distributions such as super-classes (like alphabets in character recognition), with 10-25% absolute accuracy improvements over prototypical networks, while still maintaining or improving accuracy on standard few-shot learning benchmarks. By clustering labeled and unlabeled data with the same rule, infinite mixture prototypes achieve state-of-the-art semi-supervised accuracy, and can perform purely unsupervised clustering, unlike existing fully- and semi-supervised prototypical methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/allen19b/allen19b.pdf", "supp": "", "pdf_size": 357166, "gs_citation": 329, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8170445830318359498&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Brain and Cognitive Sciences, Center for Brains, Minds, and Machines (CBMM), CSAIL, MIT, Cambridge, MA; Computer Science, UC Berkeley, Berkeley, CA; Department of Brain and Cognitive Sciences, Center for Brains, Minds, and Machines (CBMM), CSAIL, MIT, Cambridge, MA; Department of Brain and Cognitive Sciences, Center for Brains, Minds, and Machines (CBMM), CSAIL, MIT, Cambridge, MA", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/allen19b.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;University of California, Berkeley", "aff_unique_dep": "Department of Brain and Cognitive Sciences;Department of Computer Science", "aff_unique_url": "https://www.mit.edu;https://www.berkeley.edu", "aff_unique_abbr": "MIT;UC Berkeley", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Cambridge;Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Information-Theoretic Considerations in Batch Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4162", "id": "4162", "author_site": "Jinglin Chen, Nan Jiang", "author": "Jinglin Chen; Nan Jiang", "abstract": "Value-function approximation methods that operate in batch mode have foundational importance to reinforcement learning (RL). Finite sample guarantees for these methods often crucially rely on two types of assumptions: (1) mild distribution shift, and (2) representation conditions that are stronger than realizability. However, the necessity (\u201cwhy do we need them?\u201d) and the naturalness (\u201cwhen do they hold?\u201d) of such assumptions have largely eluded the literature. 
In this paper, we revisit these assumptions and provide theoretical results towards answering the above questions, and make steps towards a deeper understanding of value-function approximation.", "bibtex": "@InProceedings{pmlr-v97-chen19e,\n title = \t {Information-Theoretic Considerations in Batch Reinforcement Learning},\n author = {Chen, Jinglin and Jiang, Nan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1042--1051},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19e/chen19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19e.html},\n abstract = \t {Value-function approximation methods that operate in batch mode have foundational importance to reinforcement learning (RL). Finite sample guarantees for these methods often crucially rely on two types of assumptions: (1) mild distribution shift, and (2) representation conditions that are stronger than realizability. However, the necessity (\u201cwhy do we need them?\u201d) and the naturalness (\u201cwhen do they hold?\u201d) of such assumptions have largely eluded the literature. In this paper, we revisit these assumptions and provide theoretical results towards answering the above questions, and make steps towards a deeper understanding of value-function approximation.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19e/chen19e.pdf", "supp": "", "pdf_size": 398862, "gs_citation": 453, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6122478153303960156&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "University of Illinois at Urbana-Champaign; University of Illinois at Urbana-Champaign", "aff_domain": "illinois.edu;illinois.edu", "email": "illinois.edu;illinois.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/chen19e.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Insertion Transformer: Flexible Sequence Generation via Insertion Operations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4221", "id": "4221", "author_site": "Mitchell Stern, William Chan, Jamie Kiros, Jakob Uszkoreit", "author": "Mitchell Stern; William Chan; Jamie Kiros; Jakob Uszkoreit", "abstract": "We present the Insertion Transformer, an iterative, partially autoregressive model for sequence generation based on insertion operations. Unlike typical autoregressive models which rely on a fixed, often left-to-right ordering of the output, our approach accommodates arbitrary orderings by allowing for tokens to be inserted anywhere in the sequence during decoding. This flexibility confers a number of advantages: for instance, not only can our model be trained to follow specific orderings such as left-to-right generation or a binary tree traversal, but it can also be trained to maximize entropy over all valid insertions for robustness. 
In addition, our model seamlessly accommodates both fully autoregressive generation (one insertion at a time) and partially autoregressive generation (simultaneous insertions at multiple locations). We validate our approach by analyzing its performance on the WMT 2014 English-German machine translation task under various settings for training and decoding. We find that the Insertion Transformer outperforms many prior non-autoregressive approaches to translation at comparable or better levels of parallelism, and successfully recovers the performance of the original Transformer while requiring only logarithmically many iterations during decoding.", "bibtex": "@InProceedings{pmlr-v97-stern19a,\n title = \t {Insertion Transformer: Flexible Sequence Generation via Insertion Operations},\n author = {Stern, Mitchell and Chan, William and Kiros, Jamie and Uszkoreit, Jakob},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5976--5985},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/stern19a/stern19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/stern19a.html},\n abstract = \t {We present the Insertion Transformer, an iterative, partially autoregressive model for sequence generation based on insertion operations. Unlike typical autoregressive models which rely on a fixed, often left-to-right ordering of the output, our approach accommodates arbitrary orderings by allowing for tokens to be inserted anywhere in the sequence during decoding. This flexibility confers a number of advantages: for instance, not only can our model be trained to follow specific orderings such as left-to-right generation or a binary tree traversal, but it can also be trained to maximize entropy over all valid insertions for robustness. In addition, our model seamlessly accommodates both fully autoregressive generation (one insertion at a time) and partially autoregressive generation (simultaneous insertions at multiple locations). We validate our approach by analyzing its performance on the WMT 2014 English-German machine translation task under various settings for training and decoding. 
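Note: the Insertion Transformer abstract above mentions binary-tree orderings and a logarithmic number of decoding iterations. The toy sketch below illustrates only that scheduling property with an oracle that always proposes the middle token of each empty slot; the learned model of the paper instead predicts (token, location) distributions, so everything here is a hypothetical stand-in.

```python
# Parallel insertion decoding with a balanced-binary-tree oracle: every empty
# slot receives the middle token of its span in the same round, so a length-n
# target is completed in about ceil(log2(n + 1)) rounds.
def parallel_insertion_rounds(target):
    chosen = []                          # target indices already generated
    rounds = 0
    while len(chosen) < len(target):
        bounds = [-1] + chosen + [len(target)]
        new = [(lo + hi) // 2
               for lo, hi in zip(bounds, bounds[1:]) if hi - lo > 1]
        chosen = sorted(chosen + new)    # all slots are filled simultaneously
        rounds += 1
        print(f"round {rounds}: {''.join(target[i] for i in chosen)}")
    return rounds

target = list("insertion transformer")
print("rounds needed:", parallel_insertion_rounds(target),
      "for a target of length", len(target))
```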
We find that the Insertion Transformer outperforms many prior non-autoregressive approaches to translation at comparable or better levels of parallelism, and successfully recovers the performance of the original Transformer while requiring only logarithmically many iterations during decoding.}\n}", "pdf": "http://proceedings.mlr.press/v97/stern19a/stern19a.pdf", "supp": "", "pdf_size": 320876, "gs_citation": 270, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9826117600704905049&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Google Brain, Mountain View, Toronto, Berlin + University of California, Berkeley; Google Brain, Mountain View, Toronto, Berlin; Google Brain, Mountain View, Toronto, Berlin; Google Brain, Mountain View, Toronto, Berlin", "aff_domain": "berkeley.edu;google.com; ; ", "email": "berkeley.edu;google.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/stern19a.html", "aff_unique_index": "0+1;0;0;0", "aff_unique_norm": "Google;University of California, Berkeley", "aff_unique_dep": "Google Brain;", "aff_unique_url": "https://brain.google.com;https://www.berkeley.edu", "aff_unique_abbr": "Google Brain;UC Berkeley", "aff_campus_unique_index": "0+1;0;0;0", "aff_campus_unique": "Mountain View;Berkeley", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Interpreting Adversarially Trained Convolutional Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3778", "id": "3778", "author_site": "Tianyuan Zhang, Zhanxing Zhu", "author": "Tianyuan Zhang; Zhanxing Zhu", "abstract": "We attempt to interpret how adversarially trained convolutional neural networks (AT-CNNs) recognize objects. We design systematic approaches to interpret AT-CNNs in both qualitative and quantitative ways and compare them with normally trained models. Surprisingly, we find that adversarial training alleviates the texture bias of standard CNNs when trained on object recognition tasks, and helps CNNs learn a more shape-biased representation. We validate our hypothesis from two aspects. First, we compare the salience maps of AT-CNNs and standard CNNs on clean images and images under different transformations. The comparison could visually show that the prediction of the two types of CNNs is sensitive to dramatically different types of features. Second, to achieve quantitative verification, we construct additional test datasets that destroy either textures or shapes, such as style-transferred version of clean data, saturated images and patch-shuffled ones, and then evaluate the classification accuracy of AT-CNNs and normal CNNs on these datasets. 
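Note: the AT-CNN interpretation abstract above evaluates models on test sets that destroy either shape or texture (patch-shuffled and saturated images, among others). Below is a minimal sketch of two such transforms; the grid size, the sigmoid-style saturation, and the random stand-in image are assumptions, not the paper's exact preprocessing.

```python
import numpy as np

def patch_shuffle(image, k=4, seed=0):
    """Split an HxWxC image into a k x k grid of patches and permute them:
    local texture is preserved while global shape is destroyed."""
    h, w = image.shape[:2]
    ph, pw = h // k, w // k
    img = image[:ph * k, :pw * k]                 # crop to a multiple of k
    patches = [img[i * ph:(i + 1) * ph, j * pw:(j + 1) * pw]
               for i in range(k) for j in range(k)]
    rng = np.random.default_rng(seed)
    perm = rng.permutation(len(patches))
    patches = [patches[i] for i in perm]
    rows = [np.concatenate(patches[i * k:(i + 1) * k], axis=1) for i in range(k)]
    return np.concatenate(rows, axis=0)

def saturate(image, p=8.0):
    """Push intensities toward 0/255, removing texture detail but keeping
    silhouettes (a shape-preserving counterpart to patch shuffling)."""
    x = image.astype(np.float32) / 255.0
    return (255.0 / (1.0 + np.exp(-p * (x - 0.5)))).astype(np.uint8)

# usage on a random stand-in image; the actual study re-scores trained CNNs
# on transformed versions of the real test set
x = np.random.default_rng(1).integers(0, 256, (64, 64, 3), dtype=np.uint8)
print(patch_shuffle(x, k=4).shape, saturate(x).shape)
```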
Our findings shed some light on why AT-CNNs are more robust than those normally trained ones and contribute to a better understanding of adversarial training over CNNs from an interpretation perspective.", "bibtex": "@InProceedings{pmlr-v97-zhang19s,\n title = \t {Interpreting Adversarially Trained Convolutional Neural Networks},\n author = {Zhang, Tianyuan and Zhu, Zhanxing},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7502--7511},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19s/zhang19s.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19s.html},\n abstract = \t {We attempt to interpret how adversarially trained convolutional neural networks (AT-CNNs) recognize objects. We design systematic approaches to interpret AT-CNNs in both qualitative and quantitative ways and compare them with normally trained models. Surprisingly, we find that adversarial training alleviates the texture bias of standard CNNs when trained on object recognition tasks, and helps CNNs learn a more shape-biased representation. We validate our hypothesis from two aspects. First, we compare the salience maps of AT-CNNs and standard CNNs on clean images and images under different transformations. The comparison could visually show that the prediction of the two types of CNNs is sensitive to dramatically different types of features. Second, to achieve quantitative verification, we construct additional test datasets that destroy either textures or shapes, such as style-transferred version of clean data, saturated images and patch-shuffled ones, and then evaluate the classification accuracy of AT-CNNs and normal CNNs on these datasets. Our findings shed some light on why AT-CNNs are more robust than those normally trained ones and contribute to a better understanding of adversarial training over CNNs from an interpretation perspective.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19s/zhang19s.pdf", "supp": "", "pdf_size": 5470950, "gs_citation": 191, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6664229559742953811&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";", "aff_domain": ";", "email": ";", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/zhang19s.html" }, { "title": "Invariant-Equivariant Representation Learning for Multi-Class Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3719", "id": "3719", "author": "Ilya Feige", "abstract": "Representations learnt through deep neural networks tend to be highly informative, but opaque in terms of what information they learn to encode. We introduce an approach to probabilistic modelling that learns to represent data with two separate deep representations: an invariant representation that encodes the information of the class from which the data belongs, and an equivariant representation that encodes the symmetry transformation defining the particular data point within the class manifold (equivariant in the sense that the representation varies naturally with symmetry transformations). 
This approach is based primarily on the strategic routing of data through the two latent variables, and thus is conceptually transparent, easy to implement, and in-principle generally applicable to any data comprised of discrete classes of continuous distributions (e.g. objects in images, topics in language, individuals in behavioural data). We demonstrate qualitatively compelling representation learning and competitive quantitative performance, in both supervised and semi-supervised settings, versus comparable modelling approaches in the literature with little fine tuning.", "bibtex": "@InProceedings{pmlr-v97-feige19a,\n title = \t {Invariant-Equivariant Representation Learning for Multi-Class Data},\n author = {Feige, Ilya},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1882--1891},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/feige19a/feige19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/feige19a.html},\n abstract = \t {Representations learnt through deep neural networks tend to be highly informative, but opaque in terms of what information they learn to encode. We introduce an approach to probabilistic modelling that learns to represent data with two separate deep representations: an invariant representation that encodes the information of the class from which the data belongs, and an equivariant representation that encodes the symmetry transformation defining the particular data point within the class manifold (equivariant in the sense that the representation varies naturally with symmetry transformations). This approach is based primarily on the strategic routing of data through the two latent variables, and thus is conceptually transparent, easy to implement, and in-principle generally applicable to any data comprised of discrete classes of continuous distributions (e.g. objects in images, topics in language, individuals in behavioural data). We demonstrate qualitatively compelling representation learning and competitive quantitative performance, in both supervised and semi-supervised settings, versus comparable modelling approaches in the literature with little fine tuning.}\n}", "pdf": "http://proceedings.mlr.press/v97/feige19a/feige19a.pdf", "supp": "", "pdf_size": 3830979, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13745121487961455933&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Faculty, 54 Welbeck Street, London", "aff_domain": "faculty.ai", "email": "faculty.ai", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/feige19a.html", "aff_unique_index": "0", "aff_unique_norm": "Faculty", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "title": "Invertible Residual Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4295", "id": "4295", "author_site": "Jens Behrmann, Will Grathwohl, Ricky T. Q. Chen, David Duvenaud, Joern-Henrik Jacobsen", "author": "Jens Behrmann; Will Grathwohl; Ricky T. Q. 
Chen; David Duvenaud; Joern-Henrik Jacobsen", "abstract": "We show that standard ResNet architectures can be made invertible, allowing the same model to be used for classification, density estimation, and generation. Typically, enforcing invertibility requires partitioning dimensions or restricting network architectures. In contrast, our approach only requires adding a simple normalization step during training, already available in standard frameworks. Invertible ResNets define a generative model which can be trained by maximum likelihood on unlabeled data. To compute likelihoods, we introduce a tractable approximation to the Jacobian log-determinant of a residual block. Our empirical evaluation shows that invertible ResNets perform competitively with both state-of-the-art image classifiers and flow-based generative models, something that has not been previously achieved with a single architecture.", "bibtex": "@InProceedings{pmlr-v97-behrmann19a,\n title = \t {Invertible Residual Networks},\n author = {Behrmann, Jens and Grathwohl, Will and Chen, Ricky T. Q. and Duvenaud, David and Jacobsen, Joern-Henrik},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {573--582},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/behrmann19a/behrmann19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/behrmann19a.html},\n abstract = \t {We show that standard ResNet architectures can be made invertible, allowing the same model to be used for classification, density estimation, and generation. Typically, enforcing invertibility requires partitioning dimensions or restricting network architectures. In contrast, our approach only requires adding a simple normalization step during training, already available in standard frameworks. Invertible ResNets define a generative model which can be trained by maximum likelihood on unlabeled data. To compute likelihoods, we introduce a tractable approximation to the Jacobian log-determinant of a residual block. 
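Note: for the Invertible Residual Networks abstract above, the "simple normalization step" is spectral normalization keeping the residual branch's Lipschitz constant below one, which makes y = x + g(x) invertible by fixed-point iteration. The sketch below shows only that invertibility on a toy MLP block (the 0.9 scaling, layer sizes, and iteration count are illustrative); the paper's log-determinant estimator is omitted.

```python
import torch
import torch.nn as nn
from torch.nn.utils import spectral_norm

class InvertibleResBlock(nn.Module):
    """Residual block x -> x + coeff * g(x) with spectral-normalized g, so the
    residual branch is a contraction and the map can be inverted."""
    def __init__(self, dim, hidden=64, coeff=0.5):
        super().__init__()
        self.coeff = coeff
        self.g = nn.Sequential(
            spectral_norm(nn.Linear(dim, hidden)), nn.ELU(),
            spectral_norm(nn.Linear(hidden, dim)),
        )

    def forward(self, x):
        return x + self.coeff * self.g(x)

    @torch.no_grad()
    def inverse(self, y, iters=40):
        # x = y - coeff * g(x) is a contraction, so iterating converges
        x = y.clone()
        for _ in range(iters):
            x = y - self.coeff * self.g(x)
        return x

block = InvertibleResBlock(dim=8)
with torch.no_grad():
    for _ in range(5):                   # warm up spectral-norm power iteration
        block(torch.randn(4, 8))
block.eval()
x = torch.randn(4, 8)
y = block(x)
x_rec = block.inverse(y)
print((x - x_rec).abs().max().item())    # tiny: the block is numerically invertible
```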
Our empirical evaluation shows that invertible ResNets perform competitively with both state-of-the-art image classifiers and flow-based generative models, something that has not been previously achieved with a single architecture.}\n}", "pdf": "http://proceedings.mlr.press/v97/behrmann19a/behrmann19a.pdf", "supp": "", "pdf_size": 1423956, "gs_citation": 759, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15596378066514684893&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Bremen, Center for Industrial Mathematics+Vector Institute and University of Toronto; Vector Institute and University of Toronto; Vector Institute and University of Toronto; Vector Institute and University of Toronto; University of Bremen, Center for Industrial Mathematics+Vector Institute and University of Toronto", "aff_domain": "uni-bremen.de; ; ; ;vectorinstitute.ai", "email": "uni-bremen.de; ; ; ;vectorinstitute.ai", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/behrmann19a.html", "aff_unique_index": "0+1;1;1;1;0+1", "aff_unique_norm": "University of Bremen;University of Toronto", "aff_unique_dep": "Center for Industrial Mathematics;Vector Institute", "aff_unique_url": "https://www.uni-bremen.de;https://www vectorinstitute.org", "aff_unique_abbr": ";U of T", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0+1;1;1;1;0+1", "aff_country_unique": "Germany;Canada" }, { "title": "Iterative Linearized Control: Stable Algorithms and Complexity Guarantees", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4201", "id": "4201", "author_site": "Vincent Roulet, Dmitriy Drusvyatskiy, Siddhartha Srinivasa, Zaid Harchaoui", "author": "Vincent Roulet; Siddhartha Srinivasa; Dmitriy Drusvyatskiy; Zaid Harchaoui", "abstract": "We examine popular gradient-based algorithms for nonlinear control in the light of the modern complexity analysis of first-order optimization algorithms. The examination reveals that the complexity bounds can be clearly stated in terms of calls to a computational oracle related to dynamic programming and implementable by gradient back-propagation using machine learning software libraries such as PyTorch or TensorFlow. Finally, we propose a regularized Gauss-Newton algorithm enjoying worst-case complexity bounds and improved convergence behavior in practice. The software library based on PyTorch is publicly available.", "bibtex": "@InProceedings{pmlr-v97-roulet19a,\n title = \t {Iterative Linearized Control: Stable Algorithms and Complexity Guarantees},\n author = {Roulet, Vincent and Srinivasa, Siddhartha and Drusvyatskiy, Dmitriy and Harchaoui, Zaid},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5518--5527},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/roulet19a/roulet19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/roulet19a.html},\n abstract = \t {We examine popular gradient-based algorithms for nonlinear control in the light of the modern complexity analysis of first-order optimization algorithms. 
The examination reveals that the complexity bounds can be clearly stated in terms of calls to a computational oracle related to dynamic programming and implementable by gradient back-propagation using machine learning software libraries such as PyTorch or TensorFlow. Finally, we propose a regularized Gauss-Newton algorithm enjoying worst-case complexity bounds and improved convergence behavior in practice. The software library based on PyTorch is publicly available.}\n}", "pdf": "http://proceedings.mlr.press/v97/roulet19a/roulet19a.pdf", "supp": "", "pdf_size": 916741, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13502555375251744888&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Statistics, University of Washington, Seattle, USA; Paul G. Allen School of Computer Science & Engineering, University of Washington, Seattle, USA; Department of Mathematics, University of Washington, Seattle, USA; Department of Statistics, University of Washington, Seattle, USA", "aff_domain": "uw.edu; ; ; ", "email": "uw.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/roulet19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Seattle", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Ithemal: Accurate, Portable and Fast Basic Block Throughput Estimation using Deep Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4057", "id": "4057", "author_site": "Charith Mendis, Alex Renda, Dr.Saman Amarasinghe, Michael Carbin", "author": "Charith Mendis; Alex Renda; Dr.Saman Amarasinghe; Michael Carbin", "abstract": "Predicting the number of clock cycles a processor takes to execute a block of assembly instructions in steady state (the throughput) is important for both compiler designers and performance engineers. Building an analytical model to do so is especially complicated in modern x86-64 Complex Instruction Set Computer (CISC) machines with sophisticated processor microarchitectures in that it is tedious, error prone, and must be performed from scratch for each processor generation. In this paper we present Ithemal, the first tool which learns to predict the throughput of a set of instructions. Ithemal uses a hierarchical LSTM\u2013based approach to predict throughput based on the opcodes and operands of instructions in a basic block. We show that Ithemal is more accurate than state-of-the-art hand-written tools currently used in compiler backends and static machine code analyzers. In particular, our model has less than half the error of state-of-the-art analytical models (LLVM\u2019s llvm-mca and Intel\u2019s IACA). 
Ithemal is also able to predict these throughput values just as fast as the aforementioned tools, and is easily ported across a variety of processor microarchitectures with minimal developer effort.", "bibtex": "@InProceedings{pmlr-v97-mendis19a,\n title = \t {Ithemal: Accurate, Portable and Fast Basic Block Throughput Estimation using Deep Neural Networks},\n author = {Mendis, Charith and Renda, Alex and Amarasinghe, Dr.Saman and Carbin, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4505--4515},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mendis19a/mendis19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mendis19a.html},\n abstract = \t {Predicting the number of clock cycles a processor takes to execute a block of assembly instructions in steady state (the throughput) is important for both compiler designers and performance engineers. Building an analytical model to do so is especially complicated in modern x86-64 Complex Instruction Set Computer (CISC) machines with sophisticated processor microarchitectures in that it is tedious, error prone, and must be performed from scratch for each processor generation. In this paper we present Ithemal, the first tool which learns to predict the throughput of a set of instructions. Ithemal uses a hierarchical LSTM\u2013based approach to predict throughput based on the opcodes and operands of instructions in a basic block. We show that Ithemal is more accurate than state-of-the-art hand-written tools currently used in compiler backends and static machine code analyzers. In particular, our model has less than half the error of state-of-the-art analytical models (LLVM\u2019s llvm-mca and Intel\u2019s IACA). 
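Note: the Ithemal abstract above describes a hierarchical LSTM over instruction opcodes and operands. The sketch below shows that hierarchy in miniature, not the released tool: the vocabulary, hidden sizes, and tokenization are made up for illustration.

```python
import torch
import torch.nn as nn

class TinyIthemal(nn.Module):
    """An inner LSTM turns each instruction's opcode/operand tokens into an
    instruction embedding, an outer LSTM runs over the instructions of a
    basic block, and a linear head regresses throughput."""
    def __init__(self, vocab_size, emb=64, hidden=128):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb)
        self.token_lstm = nn.LSTM(emb, hidden, batch_first=True)
        self.instr_lstm = nn.LSTM(hidden, hidden, batch_first=True)
        self.head = nn.Linear(hidden, 1)

    def forward(self, block):
        # block: list of LongTensors, one per instruction, holding that
        # instruction's token ids (opcode followed by operands)
        instr_vecs = []
        for tokens in block:
            _, (h, _) = self.token_lstm(self.embed(tokens).unsqueeze(0))
            instr_vecs.append(h[-1])                  # (1, hidden)
        seq = torch.stack(instr_vecs, dim=1)          # (1, n_instr, hidden)
        _, (h, _) = self.instr_lstm(seq)
        return self.head(h[-1]).squeeze(-1)           # predicted cycles

# toy usage with a made-up token vocabulary
vocab = {"add": 0, "mov": 1, "rax": 2, "rbx": 3, "[mem]": 4}
block = [torch.tensor([vocab["mov"], vocab["rax"], vocab["[mem]"]]),
         torch.tensor([vocab["add"], vocab["rbx"], vocab["rax"]])]
model = TinyIthemal(vocab_size=len(vocab))
print(model(block))
```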
Ithemal is also able to predict these throughput values just as fast as the aforementioned tools, and is easily ported across a variety of processor microarchitectures with minimal developer effort.}\n}", "pdf": "http://proceedings.mlr.press/v97/mendis19a/mendis19a.pdf", "supp": "", "pdf_size": 672858, "gs_citation": 206, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6452183013544894818&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 12, "aff": "MIT CSAIL; MIT CSAIL; MIT CSAIL; MIT CSAIL", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/mendis19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.csail.mit.edu", "aff_unique_abbr": "MIT CSAIL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Jumpout : Improved Dropout for Deep Neural Networks with ReLUs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3904", "id": "3904", "author_site": "Shengjie Wang, Tianyi Zhou, Jeff Bilmes", "author": "Shengjie Wang; Tianyi Zhou; Jeff Bilmes", "abstract": "We discuss three novel insights about dropout for DNNs with ReLUs: 1) dropout encourages each local linear piece of a DNN to be trained on data points from nearby regions; 2) the same dropout rate results in different (effective) deactivation rates for layers with different portions of ReLU-deactivated neurons; and 3) the rescaling factor of dropout causes a normalization inconsistency between training and test when used together with batch normalization. The above leads to three simple but nontrivial modifications resulting in our method \u201cjumpout.\u201d Jumpout samples the dropout rate from a monotone decreasing distribution (e.g., the right half of a Gaussian), so each local linear piece is trained, with high probability, to work better for data points from nearby than more distant regions. Jumpout moreover adaptively normalizes the dropout rate at each layer and every training batch, so the effective deactivation rate on the activated neurons is kept the same. Furthermore, it rescales the outputs for a better trade-off that keeps both the variance and mean of neurons more consistent between training and test phases, thereby mitigating the incompatibility between dropout and batch normalization. 
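Note: the Jumpout abstract above lists three modifications to dropout for ReLU networks. The sketch below is one plausible reading of the first two (a half-Gaussian dropout rate and a rate rescaled by the fraction of active units); the third modification is simplified here to ordinary inverted-dropout rescaling, and all constants are assumptions rather than the paper's settings.

```python
import torch

def jumpout_like(x, p_max=0.2, sigma=0.05, training=True):
    """Apply a jumpout-style mask to a post-ReLU activation map x:
    (1) sample the dropout rate from the right half of a Gaussian,
    (2) divide it by the fraction of active units so the effective
        deactivation rate stays comparable across layers and batches,
    then mask and rescale as in standard inverted dropout."""
    if not training:
        return x
    p = min(torch.randn(()).abs().item() * sigma, p_max)       # (1)
    active_frac = (x > 0).float().mean().clamp_min(1e-6).item()
    p_eff = min(p / active_frac, 0.9)                           # (2)
    mask = (torch.rand_like(x) >= p_eff).float()
    return x * mask / (1.0 - p_eff)

h = torch.relu(torch.randn(32, 256))     # a post-ReLU activation batch
print(jumpout_like(h).shape)             # torch.Size([32, 256])
```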
Jumpout significantly improves the performance of different neural nets on CIFAR10, CIFAR100, Fashion-MNIST, STL10, SVHN, ImageNet-1k, etc., while introducing negligible additional memory and computation costs.", "bibtex": "@InProceedings{pmlr-v97-wang19q,\n title = \t {Jumpout : Improved Dropout for Deep Neural Networks with {R}e{LU}s},\n author = {Wang, Shengjie and Zhou, Tianyi and Bilmes, Jeff},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6668--6676},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19q/wang19q.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19q.html},\n abstract = \t {We discuss three novel insights about dropout for DNNs with ReLUs: 1) dropout encourages each local linear piece of a DNN to be trained on data points from nearby regions; 2) the same dropout rate results in different (effective) deactivation rates for layers with different portions of ReLU-deactivated neurons; and 3) the rescaling factor of dropout causes a normalization inconsistency between training and test when used together with batch normalization. The above leads to three simple but nontrivial modifications resulting in our method \u201cjumpout.\u201d Jumpout samples the dropout rate from a monotone decreasing distribution (e.g., the right half of a Gaussian), so each local linear piece is trained, with high probability, to work better for data points from nearby than more distant regions. Jumpout moreover adaptively normalizes the dropout rate at each layer and every training batch, so the effective deactivation rate on the activated neurons is kept the same. Furthermore, it rescales the outputs for a better trade-off that keeps both the variance and mean of neurons more consistent between training and test phases, thereby mitigating the incompatibility between dropout and batch normalization. Jumpout significantly improves the performance of different neural nets on CIFAR10, CIFAR100, Fashion-MNIST, STL10, SVHN, ImageNet-1k, etc., while introducing negligible additional memory and computation costs.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19q/wang19q.pdf", "supp": "", "pdf_size": 1535817, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9380303128595331172&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Paul G. Allen School of Computer Science & Engineering; Paul G. Allen School of Computer Science & Engineering; Department of Electrical & Computer Engineering, University of Washington, Seattle, USA", "aff_domain": "uw.edu;uw.edu;uw.edu", "email": "uw.edu;uw.edu;uw.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/wang19q.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "Paul G. 
Allen School of Computer Science & Engineering", "aff_unique_url": "https://www.cs.washington.edu", "aff_unique_abbr": "UW CSE", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seattle", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Katalyst: Boosting Convex Katayusha for Non-Convex Problems with a Large Condition Number", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4230", "id": "4230", "author_site": "Zaiyi Chen, Yi Xu, Haoyuan Hu, Tianbao Yang", "author": "Zaiyi Chen; Yi Xu; Haoyuan Hu; Tianbao Yang", "abstract": "An important class of non-convex objectives that has wide applications in machine learning consists of a sum of $n$ smooth functions and a non-smooth convex function. Tremendous studies have been devoted to conquering these problems by leveraging one of the two types of variance reduction techniques, i.e., SVRG-type that computes a full gradient occasionally and SAGA-type that maintains $n$ stochastic gradients at every iteration. In practice, SVRG-type is preferred to SAGA-type due to its potentially less memory costs. An interesting question that has been largely ignored is how to improve the complexity of variance reduction methods for problems with a large condition number that measures the degree to which the objective is close to a convex function. In this paper, we present a simple but non-trivial boosting of a state-of-the-art SVRG-type method for convex problems (namely Katyusha) to enjoy an improved complexity for solving non-convex problems with a large condition number (that is close to a convex function). To the best of our knowledge, its complexity has the best dependence on $n$ and the degree of non-convexity, and also matches that of a recent SAGA-type accelerated stochastic algorithm for a constrained non-convex smooth optimization problem.", "bibtex": "@InProceedings{pmlr-v97-chen19k,\n title = \t {Katalyst: Boosting Convex Katayusha for Non-Convex Problems with a Large Condition Number},\n author = {Chen, Zaiyi and Xu, Yi and Hu, Haoyuan and Yang, Tianbao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1102--1111},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19k/chen19k.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19k.html},\n abstract = \t {An important class of non-convex objectives that has wide applications in machine learning consists of a sum of $n$ smooth functions and a non-smooth convex function. Tremendous studies have been devoted to conquering these problems by leveraging one of the two types of variance reduction techniques, i.e., SVRG-type that computes a full gradient occasionally and SAGA-type that maintains $n$ stochastic gradients at every iteration. In practice, SVRG-type is preferred to SAGA-type due to its potentially less memory costs. An interesting question that has been largely ignored is how to improve the complexity of variance reduction methods for problems with a large condition number that measures the degree to which the objective is close to a convex function. 
In this paper, we present a simple but non-trivial boosting of a state-of-the-art SVRG-type method for convex problems (namely Katyusha) to enjoy an improved complexity for solving non-convex problems with a large condition number (that is close to a convex function). To the best of our knowledge, its complexity has the best dependence on $n$ and the degree of non-convexity, and also matches that of a recent SAGA-type accelerated stochastic algorithm for a constrained non-convex smooth optimization problem.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19k/chen19k.pdf", "supp": "", "pdf_size": 416728, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18103378429909844859&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Cainiao AI, China + University of Science and Technology of China, China; University of Science and Technology of China, China; Cainiao AI, China; The University of Iowa, USA", "aff_domain": "alibaba-inc.com; ; ; ", "email": "alibaba-inc.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/chen19k.html", "aff_unique_index": "0+1;1;0;2", "aff_unique_norm": "Cainiao AI;University of Science and Technology of China;University of Iowa", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cainiao.com;http://www.ustc.edu.cn;https://www.uiowa.edu", "aff_unique_abbr": "Cainiao AI;USTC;UIowa", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;1", "aff_country_unique": "China;United States" }, { "title": "Kernel Mean Matching for Content Addressability of GANs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3606", "id": "3606", "author_site": "Wittawat Jitkrittum, Wittawat Jitkrittum, Patsorn Sangkloy, Muhammad Waleed Gondal, Amit Raj, James Hays, Bernhard Sch\u00f6lkopf", "author": "Wittawat Jitkrittum; Patsorn Sangkloy; Muhammad Waleed Gondal; Amit Raj; James Hays; Bernhard Sch\u00f6lkopf", "abstract": "We propose a novel procedure which adds \"content-addressability\" to any given unconditional implicit model e.g., a generative adversarial network (GAN). The procedure allows users to control the generative process by specifying a set (arbitrary size) of desired examples based on which similar samples are generated from the model. The proposed approach, based on kernel mean matching, is applicable to any generative models which transform latent vectors to samples, and does not require retraining of the model. Experiments on various high-dimensional image generation problems (CelebA-HQ, LSUN bedroom, bridge, tower) show that our approach is able to generate images which are consistent with the input set, while retaining the image quality of the original model. 
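Note: the kernel mean matching abstract above adds content-addressability by steering a *frozen* generator toward a user-supplied set of examples. The sketch below optimizes latent codes to minimize a Gaussian-kernel MMD between generated samples and the target set; the tiny MLP generator, bandwidth, optimizer, and step count are placeholders rather than the paper's models.

```python
import torch

def gaussian_kernel(a, b, bw=1.0):
    d2 = torch.cdist(a, b).pow(2)
    return torch.exp(-d2 / (2 * bw ** 2))

def mmd2(x, y, bw=1.0):
    """Biased estimate of the squared MMD between samples x and y."""
    return (gaussian_kernel(x, x, bw).mean()
            - 2 * gaussian_kernel(x, y, bw).mean()
            + gaussian_kernel(y, y, bw).mean())

torch.manual_seed(0)
G = torch.nn.Sequential(torch.nn.Linear(8, 32), torch.nn.ReLU(),
                        torch.nn.Linear(32, 2))          # stand-in generator
for p in G.parameters():
    p.requires_grad_(False)                               # no retraining of G

targets = torch.randn(16, 2) + torch.tensor([3.0, -1.0])  # user-provided set
z = torch.randn(16, 8, requires_grad=True)                 # latents to tune
opt = torch.optim.Adam([z], lr=0.05)

for step in range(300):
    opt.zero_grad()
    loss = mmd2(G(z), targets)
    loss.backward()
    opt.step()
print("final MMD^2:", loss.item(), "sample mean:", G(z).mean(0).tolist())
```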
To our knowledge, this is the first work that attempts to construct, at test time, a content-addressable generative model from a trained marginal model.", "bibtex": "@InProceedings{pmlr-v97-jitkrittum19a,\n title = \t {Kernel Mean Matching for Content Addressability of {GAN}s},\n author = {Jitkrittum, Wittawat and Sangkloy, Patsorn and Gondal, Muhammad Waleed and Raj, Amit and Hays, James and Sch{\\\"o}lkopf, Bernhard},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3140--3151},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jitkrittum19a/jitkrittum19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jitkrittum19a.html},\n abstract = \t {We propose a novel procedure which adds \"content-addressability\" to any given unconditional implicit model e.g., a generative adversarial network (GAN). The procedure allows users to control the generative process by specifying a set (arbitrary size) of desired examples based on which similar samples are generated from the model. The proposed approach, based on kernel mean matching, is applicable to any generative models which transform latent vectors to samples, and does not require retraining of the model. Experiments on various high-dimensional image generation problems (CelebA-HQ, LSUN bedroom, bridge, tower) show that our approach is able to generate images which are consistent with the input set, while retaining the image quality of the original model. To our knowledge, this is the first work that attempts to construct, at test time, a content-addressable generative model from a trained marginal model.}\n}", "pdf": "http://proceedings.mlr.press/v97/jitkrittum19a/jitkrittum19a.pdf", "supp": "", "pdf_size": 7700235, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=235365843120524307&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Empirical Inference Department, Max Planck Institute for Intelligent Systems, Germany+School of Interactive Computing, Georgia Institute of Technology, USA; School of Interactive Computing, Georgia Institute of Technology, USA; Empirical Inference Department, Max Planck Institute for Intelligent Systems, Germany; School of Interactive Computing, Georgia Institute of Technology, USA; School of Interactive Computing, Georgia Institute of Technology, USA; Empirical Inference Department, Max Planck Institute for Intelligent Systems, Germany", "aff_domain": "tuebingen.mpg.de;gmail.com; ; ; ; ", "email": "tuebingen.mpg.de;gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/jitkrittum19a.html", "aff_unique_index": "0+1;1;0;1;1;0", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;Georgia Institute of Technology", "aff_unique_dep": "Empirical Inference Department;School of Interactive Computing", "aff_unique_url": "https://www.mpituebingen.mpg.de;https://www.gatech.edu", "aff_unique_abbr": "MPI-IS;Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;1;1;0", "aff_country_unique": "Germany;United States" }, { "title": "Kernel Normalized Cut: a Theoretical Revisit", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4044", "id": "4044", "author_site": "Yoshikazu Terada, Michio 
Yamamoto", "author": "Yoshikazu Terada; Michio Yamamoto", "abstract": "In this paper, we study the theoretical properties of clustering based on the kernel normalized cut. Our first contribution is to derive a nonasymptotic upper bound on the expected distortion rate of the kernel normalized cut. From this result, we show that the solution of the kernel normalized cut converges to that of the population-level weighted k-means clustering on a certain reproducing kernel Hilbert space (RKHS). Our second contribution is the discover of the interesting fact that the population-level weighted k-means clustering in the RKHS is equivalent to the population-level normalized cut. Combining these results, we can see that the kernel normalized cut converges to the population-level normalized cut. The criterion of the population-level normalized cut can be considered as an indivisibility of the population distribution, and this criterion plays an important role in the theoretical analysis of spectral clustering in Schiebinger et al. (2015). We believe that our results will provide deep insights into the behavior of both normalized cut and spectral clustering.", "bibtex": "@InProceedings{pmlr-v97-terada19a,\n title = \t {Kernel Normalized Cut: a Theoretical Revisit},\n author = {Terada, Yoshikazu and Yamamoto, Michio},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6206--6214},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/terada19a/terada19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/terada19a.html},\n abstract = \t {In this paper, we study the theoretical properties of clustering based on the kernel normalized cut. Our first contribution is to derive a nonasymptotic upper bound on the expected distortion rate of the kernel normalized cut. From this result, we show that the solution of the kernel normalized cut converges to that of the population-level weighted k-means clustering on a certain reproducing kernel Hilbert space (RKHS). Our second contribution is the discover of the interesting fact that the population-level weighted k-means clustering in the RKHS is equivalent to the population-level normalized cut. Combining these results, we can see that the kernel normalized cut converges to the population-level normalized cut. The criterion of the population-level normalized cut can be considered as an indivisibility of the population distribution, and this criterion plays an important role in the theoretical analysis of spectral clustering in Schiebinger et al. (2015). 
We believe that our results will provide deep insights into the behavior of both normalized cut and spectral clustering.}\n}", "pdf": "http://proceedings.mlr.press/v97/terada19a/terada19a.pdf", "supp": "", "pdf_size": 2929516, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17072349706873524773&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Graduate School of Engineering Science, Osaka University, Osaka, Japan+RIKEN Center for Advanced Intelligence Project (AIP), Tokyo, Japan; Graduate School of Environmental and Life Science, Okayama University, Okayama, Japan+RIKEN Center for Advanced Intelligence Project (AIP), Tokyo, Japan", "aff_domain": "sigmath.es.osaka-u.ac.jp; ", "email": "sigmath.es.osaka-u.ac.jp; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/terada19a.html", "aff_unique_index": "0+1;2+1", "aff_unique_norm": "Osaka University;RIKEN Center for Advanced Intelligence Project;Okayama University", "aff_unique_dep": "Graduate School of Engineering Science;Advanced Intelligence Project;Graduate School of Environmental and Life Science", "aff_unique_url": "https://www.osaka-u.ac.jp;https://aipcenter.riken.jp/en/;https://www.okayama-u.ac.jp", "aff_unique_abbr": "Osaka U;RIKEN AIP;Okayama U", "aff_campus_unique_index": "0+1;2+1", "aff_campus_unique": "Osaka;Tokyo;Okayama", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "Japan" }, { "title": "Kernel-Based Reinforcement Learning in Robust Markov Decision Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3984", "id": "3984", "author_site": "Shiau Hong Lim, Arnaud Autef", "author": "Shiau Hong Lim; Arnaud Autef", "abstract": "The robust Markov decision processes (MDP) framework aims to address the problem of parameter uncertainty due to model mismatch, approximation errors or even adversarial behaviors. It is especially relevant when deploying the learned policies in real-world applications. Scaling up the robust MDP framework to large or continuous state space remains a challenging problem. The use of function approximation in this case is usually inevitable and this can only amplify the problem of model mismatch and parameter uncertainties. It has been previously shown that, in the case of MDPs with state aggregation, the robust policies enjoy a tighter performance bound compared to standard solutions due to its reduced sensitivity to approximation errors. We extend these results to the much larger class of kernel-based approximators and show, both analytically and empirically that the robust policies can significantly outperform the non-robust counterpart.", "bibtex": "@InProceedings{pmlr-v97-lim19a,\n title = \t {Kernel-Based Reinforcement Learning in Robust {M}arkov Decision Processes},\n author = {Lim, Shiau Hong and Autef, Arnaud},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3973--3981},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lim19a/lim19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lim19a.html},\n abstract = \t {The robust Markov decision processes (MDP) framework aims to address the problem of parameter uncertainty due to model mismatch, approximation errors or even adversarial behaviors. 
It is especially relevant when deploying the learned policies in real-world applications. Scaling up the robust MDP framework to large or continuous state space remains a challenging problem. The use of function approximation in this case is usually inevitable and this can only amplify the problem of model mismatch and parameter uncertainties. It has been previously shown that, in the case of MDPs with state aggregation, the robust policies enjoy a tighter performance bound compared to standard solutions due to its reduced sensitivity to approximation errors. We extend these results to the much larger class of kernel-based approximators and show, both analytically and empirically that the robust policies can significantly outperform the non-robust counterpart.}\n}", "pdf": "http://proceedings.mlr.press/v97/lim19a/lim19a.pdf", "supp": "", "pdf_size": 491676, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12763959852322596547&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "IBM Research, Singapore; Applied Mathematics department, Ecole polytechnique, France + IBM Research, Singapore", "aff_domain": "sg.ibm.com; ", "email": "sg.ibm.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/lim19a.html", "aff_unique_index": "0;1+0", "aff_unique_norm": "IBM;Ecole Polytechnique", "aff_unique_dep": "IBM Research;Applied Mathematics department", "aff_unique_url": "https://www.ibm.com/research;https://www.polytechnique.edu", "aff_unique_abbr": "IBM;Polytechnique", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1+0", "aff_country_unique": "Singapore;France" }, { "title": "LGM-Net: Learning to Generate Matching Networks for Few-Shot Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3808", "id": "3808", "author_site": "Huaiyu Li, Weiming Dong, Xing Mei, Chongyang Ma, Feiyue Huang, Bao-Gang Hu", "author": "Huaiyu Li; Weiming Dong; Xing Mei; Chongyang Ma; Feiyue Huang; Bao-Gang Hu", "abstract": "In this work, we propose a novel meta-learning approach for few-shot classification, which learns transferable prior knowledge across tasks and directly produces network parameters for similar unseen tasks with training samples. Our approach, called LGM-Net, includes two key modules, namely, TargetNet and MetaNet. The TargetNet module is a neural network for solving a specific task and the MetaNet module aims at learning to generate functional weights for TargetNet by observing training samples. We also present an intertask normalization strategy for the training process to leverage common information shared across different tasks. The experimental results on Omniglot and miniImageNet datasets demonstrate that LGM-Net can effectively adapt to similar unseen tasks and achieve competitive performance, and the results on synthetic datasets show that transferable prior knowledge is learned by the MetaNet module via mapping training data to functional weights. 
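Note: the LGM-Net abstract above has a MetaNet generate the functional weights of a TargetNet from a task's training samples, so no fine-tuning is needed at test time. The sketch below reduces this to a hypernetwork that emits a linear classifier's weights from per-class support means; the encoder, sizes, and the linear TargetNet are simplifications of the paper's matching-network architecture.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class WeightGenerator(nn.Module):
    """MetaNet-style module: map a task's support set to the (weight, bias)
    rows of a linear classifier, then apply those generated weights to the
    query embeddings in a single forward pass."""
    def __init__(self, feat_dim=32, n_way=5):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(16, 64), nn.ReLU(),
                                     nn.Linear(64, feat_dim))
        self.meta = nn.Sequential(nn.Linear(feat_dim, 128), nn.ReLU(),
                                  nn.Linear(128, feat_dim + 1))
        self.n_way = n_way

    def forward(self, support_x, support_y, query_x):
        s = self.encoder(support_x)                    # (n_support, feat)
        q = self.encoder(query_x)                      # (n_query, feat)
        rows = []
        for c in range(self.n_way):
            class_mean = s[support_y == c].mean(dim=0)
            rows.append(self.meta(class_mean))         # one (w, b) per class
        params = torch.stack(rows)                     # (n_way, feat + 1)
        W, b = params[:, :-1], params[:, -1]
        return F.linear(q, W, b)                       # query logits

torch.manual_seed(0)
net = WeightGenerator()
sx = torch.randn(25, 16)                     # 5-way 5-shot support set
sy = torch.arange(5).repeat_interleave(5)
qx = torch.randn(10, 16)
print(net(sx, sy, qx).shape)                 # torch.Size([10, 5])
```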
LGM-Net enables fast learning and adaptation since no further tuning steps are required compared to other meta-learning approaches", "bibtex": "@InProceedings{pmlr-v97-li19c,\n title = \t {{LGM}-Net: Learning to Generate Matching Networks for Few-Shot Learning},\n author = {Li, Huaiyu and Dong, Weiming and Mei, Xing and Ma, Chongyang and Huang, Feiyue and Hu, Bao-Gang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3825--3834},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19c/li19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19c.html},\n abstract = \t {In this work, we propose a novel meta-learning approach for few-shot classification, which learns transferable prior knowledge across tasks and directly produces network parameters for similar unseen tasks with training samples. Our approach, called LGM-Net, includes two key modules, namely, TargetNet and MetaNet. The TargetNet module is a neural network for solving a specific task and the MetaNet module aims at learning to generate functional weights for TargetNet by observing training samples. We also present an intertask normalization strategy for the training process to leverage common information shared across different tasks. The experimental results on Omniglot and miniImageNet datasets demonstrate that LGM-Net can effectively adapt to similar unseen tasks and achieve competitive performance, and the results on synthetic datasets show that transferable prior knowledge is learned by the MetaNet module via mapping training data to functional weights. LGM-Net enables fast learning and adaptation since no further tuning steps are required compared to other meta-learning approaches}\n}", "pdf": "http://proceedings.mlr.press/v97/li19c/li19c.pdf", "supp": "", "pdf_size": 851941, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17373853660485197406&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/li19c.html" }, { "title": "LIT: Learned Intermediate Representation Training for Model Compression", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4318", "id": "4318", "author_site": "Animesh Koratana, Daniel Kang, Peter Bailis, Matei Zaharia", "author": "Animesh Koratana; Daniel Kang; Peter Bailis; Matei Zaharia", "abstract": "Researchers have proposed a range of model compression techniques to reduce the computational and memory footprint of deep neural networks (DNNs). In this work, we introduce Learned Intermediate representation Training (LIT), a novel model compression technique that outperforms a range of recent model compression techniques by leveraging the highly repetitive structure of modern DNNs (e.g., ResNet). LIT uses a teacher DNN to train a student DNN of reduced depth by leveraging two key ideas: 1) LIT directly compares intermediate representations of the teacher and student model and 2) LIT uses the intermediate representation from the teacher model\u2019s previous block as input to the current student block during training, improving stability of intermediate representations in the student network. 
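Note: the LIT abstract above trains each student section against the teacher's intermediate representations, feeding the teacher's previous-section output into the student section. The sketch below shows only that training signal on toy MLP "sections"; it omits the usual knowledge-distillation term on final outputs, and the block shapes and optimizer are assumptions.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def lit_loss(teacher_sections, student_sections, x):
    """Feed the teacher's representation entering section i into the student's
    section i and penalize its distance to the teacher's representation
    leaving section i, which decouples the sections during training."""
    with torch.no_grad():
        teacher_irs = [x]
        for t in teacher_sections:
            teacher_irs.append(t(teacher_irs[-1]))
    loss = 0.0
    for i, s in enumerate(student_sections):
        pred_ir = s(teacher_irs[i])                 # teacher IR as the input
        loss = loss + F.mse_loss(pred_ir, teacher_irs[i + 1])
    return loss

def section(depth):                                 # a stack of toy blocks
    return nn.Sequential(*[nn.Sequential(nn.Linear(32, 32), nn.ReLU())
                           for _ in range(depth)])

teacher = [section(4) for _ in range(3)]            # deeper teacher sections
student = [section(2) for _ in range(3)]            # shallower student
params = [p for s in student for p in s.parameters()]
opt = torch.optim.Adam(params, lr=1e-3)

x = torch.randn(64, 32)
for _ in range(5):                                  # a few illustrative steps
    opt.zero_grad()
    loss = lit_loss(teacher, student, x)
    loss.backward()
    opt.step()
print("LIT loss:", loss.item())
```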
We show that LIT can substantially reduce network size without loss in accuracy on a range of DNN architectures and datasets. For example, LIT can compress ResNet on CIFAR10 by 3.4$\\times$ outperforming network slimming and FitNets. Furthermore, LIT can compress, by depth, ResNeXt 5.5$\\times$ on CIFAR10 (image classification), VDCNN by 1.7$\\times$ on Amazon Reviews (sentiment analysis), and StarGAN by 1.8$\\times$ on CelebA (style transfer, i.e., GANs).", "bibtex": "@InProceedings{pmlr-v97-koratana19a,\n title = \t {{LIT}: Learned Intermediate Representation Training for Model Compression},\n author = {Koratana, Animesh and Kang, Daniel and Bailis, Peter and Zaharia, Matei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3509--3518},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/koratana19a/koratana19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/koratana19a.html},\n abstract = \t {Researchers have proposed a range of model compression techniques to reduce the computational and memory footprint of deep neural networks (DNNs). In this work, we introduce Learned Intermediate representation Training (LIT), a novel model compression technique that outperforms a range of recent model compression techniques by leveraging the highly repetitive structure of modern DNNs (e.g., ResNet). LIT uses a teacher DNN to train a student DNN of reduced depth by leveraging two key ideas: 1) LIT directly compares intermediate representations of the teacher and student model and 2) LIT uses the intermediate representation from the teacher model\u2019s previous block as input to the current student block during training, improving stability of intermediate representations in the student network. We show that LIT can substantially reduce network size without loss in accuracy on a range of DNN architectures and datasets. For example, LIT can compress ResNet on CIFAR10 by 3.4$\\times$ outperforming network slimming and FitNets. Furthermore, LIT can compress, by depth, ResNeXt 5.5$\\times$ on CIFAR10 (image classification), VDCNN by 1.7$\\times$ on Amazon Reviews (sentiment analysis), and StarGAN by 1.8$\\times$ on CelebA (style transfer, i.e., GANs).}\n}", "pdf": "http://proceedings.mlr.press/v97/koratana19a/koratana19a.pdf", "supp": "", "pdf_size": 0, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10708177404989063330&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/koratana19a.html" }, { "title": "LR-GLM: High-Dimensional Bayesian Inference Using Low-Rank Data Approximations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3862", "id": "3862", "author_site": "Brian Trippe, Jonathan Huggins, Raj Agrawal, Tamara Broderick", "author": "Brian Trippe; Jonathan Huggins; Raj Agrawal; Tamara Broderick", "abstract": "Due to the ease of modern data collection, applied statisticians often have access to a large set of covariates that they wish to relate to some observed outcome. Generalized linear models (GLMs) offer a particularly interpretable framework for such an analysis. 
In these high-dimensional problems, the number of covariates is often large relative to the number of observations, so we face non-trivial inferential uncertainty; a Bayesian approach allows coherent quantification of this uncertainty. Unfortunately, existing methods for Bayesian inference in GLMs require running times roughly cubic in parameter dimension, and so are limited to settings with at most tens of thousand parameters. We propose to reduce time and memory costs with a low-rank approximation of the data in an approach we call LR-GLM. When used with the Laplace approximation or Markov chain Monte Carlo, LR-GLM provides a full Bayesian posterior approximation and admits running times reduced by a full factor of the parameter dimension. We rigorously establish the quality of our approximation and show how the choice of rank allows a tunable computational\u2013statistical trade-off. Experiments support our theory and demonstrate the efficacy of LR-GLM on real large-scale datasets.", "bibtex": "@InProceedings{pmlr-v97-trippe19a,\n title = \t {{LR}-{GLM}: High-Dimensional {B}ayesian Inference Using Low-Rank Data Approximations},\n author = {Trippe, Brian and Huggins, Jonathan and Agrawal, Raj and Broderick, Tamara},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6315--6324},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/trippe19a/trippe19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/trippe19a.html},\n abstract = \t {Due to the ease of modern data collection, applied statisticians often have access to a large set of covariates that they wish to relate to some observed outcome. Generalized linear models (GLMs) offer a particularly interpretable framework for such an analysis. In these high-dimensional problems, the number of covariates is often large relative to the number of observations, so we face non-trivial inferential uncertainty; a Bayesian approach allows coherent quantification of this uncertainty. Unfortunately, existing methods for Bayesian inference in GLMs require running times roughly cubic in parameter dimension, and so are limited to settings with at most tens of thousand parameters. We propose to reduce time and memory costs with a low-rank approximation of the data in an approach we call LR-GLM. When used with the Laplace approximation or Markov chain Monte Carlo, LR-GLM provides a full Bayesian posterior approximation and admits running times reduced by a full factor of the parameter dimension. We rigorously establish the quality of our approximation and show how the choice of rank allows a tunable computational\u2013statistical trade-off. 
Experiments support our theory and demonstrate the efficacy of LR-GLM on real large-scale datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/trippe19a/trippe19a.pdf", "supp": "", "pdf_size": 1696650, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12982859205175183269&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, MA; Department of Biostatistics, Harvard, Cambridge, MA; Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, MA; Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, MA", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/trippe19a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Harvard University", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;Department of Biostatistics", "aff_unique_url": "https://web.mit.edu;https://www.harvard.edu", "aff_unique_abbr": "MIT;Harvard", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Ladder Capsule Network", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3735", "id": "3735", "author_site": "Taewon Jeong, Youngmin Lee, Heeyoung Kim", "author": "Taewon Jeong; Youngmin Lee; Heeyoung Kim", "abstract": "We propose a new architecture of the capsule network called the ladder capsule network, which has an alternative building block to the dynamic routing algorithm in the capsule network (Sabour et al., 2017). Motivated by the need for using only important capsules during training for robust performance, we first introduce a new layer called the pruning layer, which removes irrelevant capsules. Based on the selected capsules, we construct higher-level capsule outputs. Subsequently, to capture the part-whole spatial relationships, we introduce another new layer called the ladder layer, the outputs of which are regressed lower-level capsule outputs from higher-level capsules. Unlike the capsule network adopting the routing-by-agreement, the ladder capsule network uses backpropagation from a loss function to reconstruct the lower-level capsule outputs from higher-level capsules; thus, the ladder layer implements the reverse directional inference of the agreement/disagreement mechanism of the capsule network. 
The experiments on MNIST demonstrate that the ladder capsule network learns an equivariant representation and improves the capability to extrapolate or generalize to pose variations.", "bibtex": "@InProceedings{pmlr-v97-jeong19b,\n title = \t {Ladder Capsule Network},\n author = {Jeong, Taewon and Lee, Youngmin and Kim, Heeyoung},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3071--3079},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jeong19b/jeong19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/jeong19b.html},\n abstract = \t {We propose a new architecture of the capsule network called the ladder capsule network, which has an alternative building block to the dynamic routing algorithm in the capsule network (Sabour et al., 2017). Motivated by the need for using only important capsules during training for robust performance, we first introduce a new layer called the pruning layer, which removes irrelevant capsules. Based on the selected capsules, we construct higher-level capsule outputs. Subsequently, to capture the part-whole spatial relationships, we introduce another new layer called the ladder layer, the outputs of which are regressed lower-level capsule outputs from higher-level capsules. Unlike the capsule network adopting the routing-by-agreement, the ladder capsule network uses backpropagation from a loss function to reconstruct the lower-level capsule outputs from higher-level capsules; thus, the ladder layer implements the reverse directional inference of the agreement/disagreement mechanism of the capsule network. 
The experiments on MNIST demonstrate that the ladder capsule network learns an equivariant representation and improves the capability to extrapolate or generalize to pose variations.}\n}", "pdf": "http://proceedings.mlr.press/v97/jeong19b/jeong19b.pdf", "supp": "", "pdf_size": 747589, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10434366601065627728&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Industrial and Systems Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea; Department of Industrial and Systems Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea; Department of Industrial and Systems Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Republic of Korea", "aff_domain": "kaist.ac.kr; ; ", "email": "kaist.ac.kr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/jeong19b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "Department of Industrial and Systems Engineering", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Daejeon", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Large-Scale Sparse Kernel Canonical Correlation Analysis", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3793", "id": "3793", "author_site": "Viivi Uurtio, Sahely Bhadra, Juho Rousu", "author": "Viivi Uurtio; Sahely Bhadra; Juho Rousu", "abstract": "This paper presents gradKCCA, a large-scale sparse non-linear canonical correlation method. Like Kernel Canonical Correlation Analysis (KCCA), our method finds non-linear relations through kernel functions, but it does not rely on a kernel matrix, a known bottleneck for scaling up kernel methods. gradKCCA corresponds to solving KCCA with the additional constraint that the canonical projection directions in the kernel-induced feature space have preimages in the original data space. Firstly, this modification allows us to very efficiently maximize kernel canonical correlation through an alternating projected gradient algorithm working in the original data space. Secondly, we can control the sparsity of the projection directions by constraining the $\\ell_1$ norm of the preimages of the projection directions, facilitating the interpretation of the discovered patterns, which is not available through KCCA. 
Our empirical experiments demonstrate that gradKCCA outperforms state-of-the-art CCA methods in terms of speed and robustness to noise both in simulated and real-world datasets.", "bibtex": "@InProceedings{pmlr-v97-uurtio19a,\n title = \t {Large-Scale Sparse Kernel Canonical Correlation Analysis},\n author = {Uurtio, Viivi and Bhadra, Sahely and Rousu, Juho},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6383--6391},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/uurtio19a/uurtio19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/uurtio19a.html},\n abstract = \t {This paper presents gradKCCA, a large-scale sparse non-linear canonical correlation method. Like Kernel Canonical Correlation Analysis (KCCA), our method finds non-linear relations through kernel functions, but it does not rely on a kernel matrix, a known bottleneck for scaling up kernel methods. gradKCCA corresponds to solving KCCA with the additional constraint that the canonical projection directions in the kernel-induced feature space have preimages in the original data space. Firstly, this modification allows us to very efficiently maximize kernel canonical correlation through an alternating projected gradient algorithm working in the original data space. Secondly, we can control the sparsity of the projection directions by constraining the $\\ell_1$ norm of the preimages of the projection directions, facilitating the interpretation of the discovered patterns, which is not available through KCCA. Our empirical experiments demonstrate that gradKCCA outperforms state-of-the-art CCA methods in terms of speed and robustness to noise both in simulated and real-world datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/uurtio19a/uurtio19a.pdf", "supp": "", "pdf_size": 391216, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12418274141391687497&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Aalto University, Espoo, Finland+Helsinki Institute for Information Technology, Helsinki, Finland; Computer Science and Engineering, Indian Institute of Technology Palakkad, Palakkad, India; Department of Computer Science, Aalto University, Espoo, Finland+Helsinki Institute for Information Technology, Helsinki, Finland", "aff_domain": "aalto.fi; ; ", "email": "aalto.fi; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/uurtio19a.html", "aff_unique_index": "0+1;2;0+1", "aff_unique_norm": "Aalto University;Helsinki Institute for Information Technology;Indian Institute of Technology Palakkad", "aff_unique_dep": "Department of Computer Science;;Computer Science and Engineering", "aff_unique_url": "https://www.aalto.fi;;https://www.iitpkd.ac.in", "aff_unique_abbr": "Aalto;;IIT Palakkad", "aff_campus_unique_index": "0+1;2;0+1", "aff_campus_unique": "Espoo;Helsinki;Palakkad", "aff_country_unique_index": "0+0;1;0+0", "aff_country_unique": "Finland;India" }, { "title": "Latent Normalizing Flows for Discrete Sequences", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4292", "id": "4292", "author_site": "Zachary Ziegler, Alexander Rush", "author": "Zachary Ziegler; Alexander Rush", "abstract": "Normalizing flows are a powerful class of 
generative models for continuous random variables, showing both strong model flexibility and the potential for non-autoregressive generation. These benefits are also desired when modeling discrete random variables such as text, but directly applying normalizing flows to discrete sequences poses significant additional challenges. We propose a VAE-based generative model which jointly learns a normalizing flow-based distribution in the latent space and a stochastic mapping to an observed discrete space. In this setting, we find that it is crucial for the flow-based distribution to be highly multimodal. To capture this property, we propose several normalizing flow architectures to maximize model flexibility. Experiments consider common discrete sequence tasks of character-level language modeling and polyphonic music generation. Our results indicate that an autoregressive flow-based model can match the performance of a comparable autoregressive baseline, and a non-autoregressive flow-based model can improve generation speed with a penalty to performance.", "bibtex": "@InProceedings{pmlr-v97-ziegler19a,\n title = \t {Latent Normalizing Flows for Discrete Sequences},\n author = {Ziegler, Zachary and Rush, Alexander},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7673--7682},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ziegler19a/ziegler19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ziegler19a.html},\n abstract = \t {Normalizing flows are a powerful class of generative models for continuous random variables, showing both strong model flexibility and the potential for non-autoregressive generation. These benefits are also desired when modeling discrete random variables such as text, but directly applying normalizing flows to discrete sequences poses significant additional challenges. We propose a VAE-based generative model which jointly learns a normalizing flow-based distribution in the latent space and a stochastic mapping to an observed discrete space. In this setting, we find that it is crucial for the flow-based distribution to be highly multimodal. To capture this property, we propose several normalizing flow architectures to maximize model flexibility. Experiments consider common discrete sequence tasks of character-level language modeling and polyphonic music generation. 
Our results indicate that an autoregressive flow-based model can match the performance of a comparable autoregressive baseline, and a non-autoregressive flow-based model can improve generation speed with a penalty to performance.}\n}", "pdf": "http://proceedings.mlr.press/v97/ziegler19a/ziegler19a.pdf", "supp": "", "pdf_size": 2932881, "gs_citation": 158, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14468956623112090674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "School of Engineering and Applied Sciences, Harvard University, Cambridge, MA, USA; School of Engineering and Applied Sciences, Harvard University, Cambridge, MA, USA", "aff_domain": "g.harvard.edu; ", "email": "g.harvard.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/ziegler19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "School of Engineering and Applied Sciences", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "LatentGNN: Learning Efficient Non-local Relations for Visual Recognition", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3647", "id": "3647", "author_site": "Songyang Zhang, Xuming He, Shipeng Yan", "author": "Songyang Zhang; Xuming He; Shipeng Yan", "abstract": "Capturing long-range dependencies in feature representations is crucial for many visual recognition tasks. Despite recent successes of deep convolutional networks, it remains challenging to model non-local context relations between visual features. A promising strategy is to model the feature context by a fully-connected graph neural network (GNN), which augments traditional convolutional features with an estimated non-local context representation. However, most GNN-based approaches require computing a dense graph affinity matrix and hence have difficulty in scaling up to tackle complex real-world visual problems. In this work, we propose an efficient and yet flexible non-local relation representation based on a novel class of graph neural networks. Our key idea is to introduce a latent space to reduce the complexity of graph, which allows us to use a low-rank representation for the graph affinity matrix and to achieve a linear complexity in computation. Extensive experimental evaluations on three major visual recognition tasks show that our method outperforms the prior works with a large margin while maintaining a low computation cost.", "bibtex": "@InProceedings{pmlr-v97-zhang19f,\n title = \t {{L}atent{GNN}: Learning Efficient Non-local Relations for Visual Recognition},\n author = {Zhang, Songyang and He, Xuming and Yan, Shipeng},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7374--7383},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19f/zhang19f.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19f.html},\n abstract = \t {Capturing long-range dependencies in feature representations is crucial for many visual recognition tasks. 
Despite recent successes of deep convolutional networks, it remains challenging to model non-local context relations between visual features. A promising strategy is to model the feature context by a fully-connected graph neural network (GNN), which augments traditional convolutional features with an estimated non-local context representation. However, most GNN-based approaches require computing a dense graph affinity matrix and hence have difficulty in scaling up to tackle complex real-world visual problems. In this work, we propose an efficient and yet flexible non-local relation representation based on a novel class of graph neural networks. Our key idea is to introduce a latent space to reduce the complexity of graph, which allows us to use a low-rank representation for the graph affinity matrix and to achieve a linear complexity in computation. Extensive experimental evaluations on three major visual recognition tasks show that our method outperforms the prior works with a large margin while maintaining a low computation cost.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19f/zhang19f.pdf", "supp": "", "pdf_size": 437482, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7578360606999759452&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "School of Information Science and Technology, ShanghaiTech University, Shanghai, China; School of Information Science and Technology, ShanghaiTech University, Shanghai, China; School of Information Science and Technology, ShanghaiTech University, Shanghai, China", "aff_domain": "shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn", "email": "shanghaitech.edu.cn;shanghaitech.edu.cn;shanghaitech.edu.cn", "github": "https://github.com/latentgnn/LatentGNN-V1-PyTorch", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zhang19f.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "ShanghaiTech University", "aff_unique_dep": "School of Information Science and Technology", "aff_unique_url": "https://www.shanghaitech.edu.cn", "aff_unique_abbr": "ShanghaiTech", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Shanghai", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Learn to Grow: A Continual Structure Learning Framework for Overcoming Catastrophic Forgetting", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3842", "id": "3842", "author_site": "Xilai Li, Yingbo Zhou, Tianfu Wu, Richard Socher, Caiming Xiong", "author": "Xilai Li; Yingbo Zhou; Tianfu Wu; Richard Socher; Caiming Xiong", "abstract": "Addressing catastrophic forgetting is one of the key challenges in continual learning where machine learning systems are trained with sequential or streaming tasks. Despite recent remarkable progress in state-of-the-art deep learning, deep neural networks (DNNs) are still plagued with the catastrophic forgetting problem. This paper presents a conceptually simple yet general and effective framework for handling catastrophic forgetting in continual learning with DNNs. The proposed method consists of two components: a neural structure optimization component and a parameter learning and/or fine-tuning component. By separating the explicit neural structure learning and the parameter estimation, not only is the proposed method capable of evolving neural structures in an intuitively meaningful way, but also shows strong capabilities of alleviating catastrophic forgetting in experiments. 
Furthermore, the proposed method outperforms all other baselines on the permuted MNIST dataset, the split CIFAR100 dataset and the Visual Domain Decathlon dataset in continual learning setting.", "bibtex": "@InProceedings{pmlr-v97-li19m,\n title = \t {Learn to Grow: A Continual Structure Learning Framework for Overcoming Catastrophic Forgetting},\n author = {Li, Xilai and Zhou, Yingbo and Wu, Tianfu and Socher, Richard and Xiong, Caiming},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3925--3934},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19m/li19m.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19m.html},\n abstract = \t {Addressing catastrophic forgetting is one of the key challenges in continual learning where machine learning systems are trained with sequential or streaming tasks. Despite recent remarkable progress in state-of-the-art deep learning, deep neural networks (DNNs) are still plagued with the catastrophic forgetting problem. This paper presents a conceptually simple yet general and effective framework for handling catastrophic forgetting in continual learning with DNNs. The proposed method consists of two components: a neural structure optimization component and a parameter learning and/or fine-tuning component. By separating the explicit neural structure learning and the parameter estimation, not only is the proposed method capable of evolving neural structures in an intuitively meaningful way, but also shows strong capabilities of alleviating catastrophic forgetting in experiments. 
Furthermore, the proposed method outperforms all other baselines on the permuted MNIST dataset, the split CIFAR100 dataset and the Visual Domain Decathlon dataset in continual learning setting.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19m/li19m.pdf", "supp": "", "pdf_size": 1536523, "gs_citation": 531, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5531866442746385760&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Electrical and Computer Engineering and the Visual Narrative Initiative, North Carolina State University, NC, USA+Salesforce Research, Palo Alto, CA, USA; Salesforce Research, Palo Alto, CA, USA; Department of Electrical and Computer Engineering and the Visual Narrative Initiative, North Carolina State University, NC, USA; Salesforce Research, Palo Alto, CA, USA; Salesforce Research, Palo Alto, CA, USA", "aff_domain": "ncsu.edu;salesforce.com;ncsu.edu;salesforce.com;salesforce.com", "email": "ncsu.edu;salesforce.com;ncsu.edu;salesforce.com;salesforce.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/li19m.html", "aff_unique_index": "0+1;1;0;1;1", "aff_unique_norm": "North Carolina State University;Salesforce Research", "aff_unique_dep": "Department of Electrical and Computer Engineering;Research", "aff_unique_url": "https://www.ncsu.edu;https://research.salesforce.com", "aff_unique_abbr": "NCSU;Salesforce", "aff_campus_unique_index": "0+1;1;0;1;1", "aff_campus_unique": "NC;Palo Alto", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Action Representations for Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3974", "id": "3974", "author_site": "Yash Chandak, Georgios Theocharous, James Kostas, Scott Jordan, Philip Thomas", "author": "Yash Chandak; Georgios Theocharous; James Kostas; Scott Jordan; Philip Thomas", "abstract": "Most model-free reinforcement learning methods leverage state representations (embeddings) for generalization, but either ignore structure in the space of actions or assume the structure is provided a priori. We show how a policy can be decomposed into a component that acts in a low-dimensional space of action representations and a component that transforms these representations into actual actions. These representations improve generalization over large, finite action sets by allowing the agent to infer the outcomes of actions similar to actions already taken. We provide an algorithm to both learn and use action representations and provide conditions for its convergence. 
The efficacy of the proposed method is demonstrated on large-scale real-world problems.", "bibtex": "@InProceedings{pmlr-v97-chandak19a,\n title = \t {Learning Action Representations for Reinforcement Learning},\n author = {Chandak, Yash and Theocharous, Georgios and Kostas, James and Jordan, Scott and Thomas, Philip},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {941--950},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chandak19a/chandak19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chandak19a.html},\n abstract = \t {Most model-free reinforcement learning methods leverage state representations (embeddings) for generalization, but either ignore structure in the space of actions or assume the structure is provided a priori. We show how a policy can be decomposed into a component that acts in a low-dimensional space of action representations and a component that transforms these representations into actual actions. These representations improve generalization over large, finite action sets by allowing the agent to infer the outcomes of actions similar to actions already taken. We provide an algorithm to both learn and use action representations and provide conditions for its convergence. The efficacy of the proposed method is demonstrated on large-scale real-world problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/chandak19a/chandak19a.pdf", "supp": "", "pdf_size": 835225, "gs_citation": 228, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14670351355398666906&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of Massachusetts, Amherst, USA; Adobe Research, San Jose, USA; University of Massachusetts, Amherst, USA; University of Massachusetts, Amherst, USA; University of Massachusetts, Amherst, USA", "aff_domain": "cs.umass.edu; ; ; ; ", "email": "cs.umass.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/chandak19a.html", "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "University of Massachusetts Amherst;Adobe", "aff_unique_dep": ";Adobe Research", "aff_unique_url": "https://www.umass.edu;https://research.adobe.com", "aff_unique_abbr": "UMass Amherst;Adobe", "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "Amherst;San Jose", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Classifiers for Target Domain with Limited or No Labels", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3837", "id": "3837", "author_site": "Pengkai Zhu, Hanxiao Wang, Venkatesh Saligrama", "author": "Pengkai Zhu; Hanxiao Wang; Venkatesh Saligrama", "abstract": "In computer vision applications, such as domain adaptation (DA), few shot learning (FSL) and zero-shot learning (ZSL), we encounter new objects and environments, for which insufficient examples exist to allow for training \u201cmodels from scratch,\u201d and methods that adapt existing models, trained on the presented training environment, to the new scenario are required. We propose a novel visual attribute encoding method that encodes each image as a low-dimensional probability vector composed of prototypical part-type probabilities. 
The prototypes are learnt to be representative of all training data. At test-time we utilize this encoding as an input to a classifier. At test-time we freeze the encoder and only learn/adapt the classifier component to limited annotated labels in FSL; new semantic attributes in ZSL. We conduct extensive experiments on benchmark datasets. Our method outperforms state-of-art methods trained for the specific contexts (ZSL, FSL, DA).", "bibtex": "@InProceedings{pmlr-v97-zhu19d,\n title = \t {Learning Classifiers for Target Domain with Limited or No Labels},\n author = {Zhu, Pengkai and Wang, Hanxiao and Saligrama, Venkatesh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7643--7653},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhu19d/zhu19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhu19d.html},\n abstract = \t {In computer vision applications, such as domain adaptation (DA), few shot learning (FSL) and zero-shot learning (ZSL), we encounter new objects and environments, for which insufficient examples exist to allow for training \u201cmodels from scratch,\u201d and methods that adapt existing models, trained on the presented training environment, to the new scenario are required. We propose a novel visual attribute encoding method that encodes each image as a low-dimensional probability vector composed of prototypical part-type probabilities. The prototypes are learnt to be representative of all training data. At test-time we utilize this encoding as an input to a classifier. At test-time we freeze the encoder and only learn/adapt the classifier component to limited annotated labels in FSL; new semantic attributes in ZSL. We conduct extensive experiments on benchmark datasets. 
Our method outperforms state-of-art methods trained for the specific contexts (ZSL, FSL, DA).}\n}", "pdf": "http://proceedings.mlr.press/v97/zhu19d/zhu19d.pdf", "supp": "", "pdf_size": 5855350, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9300529921925483203&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Electrical and Computer Engineering Department, Boston University; Electrical and Computer Engineering Department, Boston University; Electrical and Computer Engineering Department, Boston University", "aff_domain": "bu.edu;bu.edu;bu.edu", "email": "bu.edu;bu.edu;bu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zhu19d.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "Electrical and Computer Engineering Department", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Context-dependent Label Permutations for Multi-label Classification", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4041", "id": "4041", "author_site": "Jinseok Nam, Young-Bum Kim, Eneldo Loza Mencia, Sunghyun Park, Ruhi Sarikaya, Johannes F\u00fcrnkranz", "author": "Jinseok Nam; Young-Bum Kim; Eneldo Loza Mencia; Sunghyun Park; Ruhi Sarikaya; Johannes F\u00fcrnkranz", "abstract": "A key problem in multi-label classification is to utilize dependencies among the labels. Chaining classifiers are a simple technique for addressing this problem but current algorithms all assume a fixed, static label ordering. In this work, we propose a multi-label classification approach which allows to choose a dynamic, context-dependent label ordering. Our proposed approach consists of two sub-components: a simple EM-like algorithm which bootstraps the learned model, and a more elaborate approach based on reinforcement learning. Our experiments on three public multi-label classification benchmarks show that our proposed dynamic label ordering approach based on reinforcement learning outperforms recurrent neural networks with fixed label ordering across both bipartition and ranking measures on all the three datasets. As a result, we obtain a powerful sequence prediction-based algorithm for multi-label classification, which is able to efficiently and explicitly exploit label dependencies.", "bibtex": "@InProceedings{pmlr-v97-nam19a,\n title = \t {Learning Context-dependent Label Permutations for Multi-label Classification},\n author = {Nam, Jinseok and Kim, Young-Bum and Mencia, Eneldo Loza and Park, Sunghyun and Sarikaya, Ruhi and F{\\\"u}rnkranz, Johannes},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4733--4742},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nam19a/nam19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nam19a.html},\n abstract = \t {A key problem in multi-label classification is to utilize dependencies among the labels. Chaining classifiers are a simple technique for addressing this problem but current algorithms all assume a fixed, static label ordering. 
In this work, we propose a multi-label classification approach which allows to choose a dynamic, context-dependent label ordering. Our proposed approach consists of two sub-components: a simple EM-like algorithm which bootstraps the learned model, and a more elaborate approach based on reinforcement learning. Our experiments on three public multi-label classification benchmarks show that our proposed dynamic label ordering approach based on reinforcement learning outperforms recurrent neural networks with fixed label ordering across both bipartition and ranking measures on all the three datasets. As a result, we obtain a powerful sequence prediction-based algorithm for multi-label classification, which is able to efficiently and explicitly exploit label dependencies.}\n}", "pdf": "http://proceedings.mlr.press/v97/nam19a/nam19a.pdf", "supp": "", "pdf_size": 4008551, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1631798304385401852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Amazon, Seattle, Washington, USA+Knowledge Engineering, TU Darmstadt, Darmstadt, Hessen, Germany; Amazon, Seattle, Washington, USA; Knowledge Engineering, TU Darmstadt, Darmstadt, Hessen, Germany; Amazon, Seattle, Washington, USA; Amazon, Seattle, Washington, USA; Knowledge Engineering, TU Darmstadt, Darmstadt, Hessen, Germany", "aff_domain": "amazon.com; ; ; ; ; ", "email": "amazon.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/nam19a.html", "aff_unique_index": "0+1;0;1;0;0;1", "aff_unique_norm": "Amazon;Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "Amazon;Knowledge Engineering", "aff_unique_url": "https://www.amazon.com;https://www.tu-darmstadt.de", "aff_unique_abbr": "Amazon;TU Darmstadt", "aff_campus_unique_index": "0+1;0;1;0;0;1", "aff_campus_unique": "Seattle;Darmstadt", "aff_country_unique_index": "0+1;0;1;0;0;1", "aff_country_unique": "United States;Germany" }, { "title": "Learning Dependency Structures for Weak Supervision Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3601", "id": "3601", "author_site": "Paroma Varma, Frederic Sala, Ann He, Alexander J Ratner, Christopher Re", "author": "Paroma Varma; Frederic Sala; Ann He; Alexander Ratner; Christopher Re", "abstract": "Labeling training data is a key bottleneck in the modern machine learning pipeline. Recent weak supervision approaches combine labels from multiple noisy sources by estimating their accuracies without access to ground truth labels; however, estimating the dependencies among these sources is a critical challenge. We focus on a robust PCA-based algorithm for learning these dependency structures, establish improved theoretical recovery rates, and outperform existing methods on various real-world tasks. Under certain conditions, we show that the amount of unlabeled data needed can scale sublinearly or even logarithmically with the number of sources m, improving over previous efforts that ignore the sparsity pattern in the dependency structure and scale linearly in m. We provide an information-theoretic lower bound on the minimum sample complexity of the weak supervision setting. 
Our method outperforms weak supervision approaches that assume conditionally-independent sources by up to 4.64 F1 points and previous structure learning approaches by up to 4.41 F1 points on real-world relation extraction and image classification tasks.", "bibtex": "@InProceedings{pmlr-v97-varma19a,\n title = \t {Learning Dependency Structures for Weak Supervision Models},\n author = {Varma, Paroma and Sala, Frederic and He, Ann and Ratner, Alexander and Re, Christopher},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6418--6427},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/varma19a/varma19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/varma19a.html},\n abstract = \t {Labeling training data is a key bottleneck in the modern machine learning pipeline. Recent weak supervision approaches combine labels from multiple noisy sources by estimating their accuracies without access to ground truth labels; however, estimating the dependencies among these sources is a critical challenge. We focus on a robust PCA-based algorithm for learning these dependency structures, establish improved theoretical recovery rates, and outperform existing methods on various real-world tasks. Under certain conditions, we show that the amount of unlabeled data needed can scale sublinearly or even logarithmically with the number of sources m, improving over previous efforts that ignore the sparsity pattern in the dependency structure and scale linearly in m. We provide an information-theoretic lower bound on the minimum sample complexity of the weak supervision setting. 
Our method outperforms weak supervision approaches that assume conditionally-independent sources by up to 4.64 F1 points and previous structure learning approaches by up to 4.41 F1 points on real-world relation extraction and image classification tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/varma19a/varma19a.pdf", "supp": "", "pdf_size": 319436, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17517952326770451341&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Electrical Engineering, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University", "aff_domain": "stanford.edu;stanford.edu; ; ; ", "email": "stanford.edu;stanford.edu; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/varma19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Discrete Structures for Graph Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4197", "id": "4197", "author_site": "Luca Franceschi, Mathias Niepert, Massimiliano Pontil, Xiao He", "author": "Luca Franceschi; Mathias Niepert; Massimiliano Pontil; Xiao He", "abstract": "Graph neural networks (GNNs) are a popular class of machine learning models that have been successfully applied to a range of problems. Their major advantage lies in their ability to explicitly incorporate a sparse and discrete dependency structure between data points. Unfortunately, GNNs can only be used when such a graph-structure is available. In practice, however, real-world graphs are often noisy and incomplete or might not be available at all. With this work, we propose to jointly learn the graph structure and the parameters of graph convolutional networks (GCNs) by approximately solving a bilevel program that learns a discrete probability distribution on the edges of the graph. This allows one to apply GCNs not only in scenarios where the given graph is incomplete or corrupted but also in those where a graph is not available. We conduct a series of experiments that analyze the behavior of the proposed method and demonstrate that it outperforms related methods by a significant margin.", "bibtex": "@InProceedings{pmlr-v97-franceschi19a,\n title = \t {Learning Discrete Structures for Graph Neural Networks},\n author = {Franceschi, Luca and Niepert, Mathias and Pontil, Massimiliano and He, Xiao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1972--1982},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/franceschi19a/franceschi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/franceschi19a.html},\n abstract = \t {Graph neural networks (GNNs) are a popular class of machine learning models that have been successfully applied to a range of problems. 
Their major advantage lies in their ability to explicitly incorporate a sparse and discrete dependency structure between data points. Unfortunately, GNNs can only be used when such a graph-structure is available. In practice, however, real-world graphs are often noisy and incomplete or might not be available at all. With this work, we propose to jointly learn the graph structure and the parameters of graph convolutional networks (GCNs) by approximately solving a bilevel program that learns a discrete probability distribution on the edges of the graph. This allows one to apply GCNs not only in scenarios where the given graph is incomplete or corrupted but also in those where a graph is not available. We conduct a series of experiments that analyze the behavior of the proposed method and demonstrate that it outperforms related methods by a significant margin.}\n}", "pdf": "http://proceedings.mlr.press/v97/franceschi19a/franceschi19a.pdf", "supp": "", "pdf_size": 2574183, "gs_citation": 526, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2496865549075068518&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 15, "aff": "CSML, Istituto Italiano di Tecnologia, Genoa, Italy+University College London, London, UK; NEC Labs EU, Heidelberg, Germany; CSML, Istituto Italiano di Tecnologia, Genoa, Italy+University College London, London, UK; NEC Labs EU, Heidelberg, Germany", "aff_domain": "iit.it; ; ;neclab.eu", "email": "iit.it; ; ;neclab.eu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/franceschi19a.html", "aff_unique_index": "0+1;2;0+1;2", "aff_unique_norm": "Istituto Italiano di Tecnologia;University College London;NEC Labs EU", "aff_unique_dep": "CSML;;", "aff_unique_url": "https://www.iit.it;https://www.ucl.ac.uk;https://www.nec-labs.eu", "aff_unique_abbr": "IIT;UCL;NEC Labs EU", "aff_campus_unique_index": "0+1;2;0+1;2", "aff_campus_unique": "Genoa;London;Heidelberg", "aff_country_unique_index": "0+1;2;0+1;2", "aff_country_unique": "Italy;United Kingdom;Germany" }, { "title": "Learning Discrete and Continuous Factors of Data via Alternating Disentanglement", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4023", "id": "4023", "author_site": "Yeonwoo Jeong, Hyun Oh Song", "author": "Yeonwoo Jeong; Hyun Oh Song", "abstract": "We address the problem of unsupervised disentanglement of discrete and continuous explanatory factors of data. We first show a simple procedure for minimizing the total correlation of the continuous latent variables without having to use a discriminator network or perform importance sampling, via cascading the information flow in the beta-VAE framework. Furthermore, we propose a method which avoids offloading the entire burden of jointly modeling the continuous and discrete factors to the variational encoder by employing a separate discrete inference procedure. This leads to an interesting alternating minimization problem which switches between finding the most likely discrete configuration given the continuous factors and updating the variational encoder based on the computed discrete factors. Experiments show that the proposed method clearly disentangles discrete factors and significantly outperforms current disentanglement methods based on the disentanglement score and inference network classification score. 
The source code is available at https://github.com/snumllab/DisentanglementICML19.", "bibtex": "@InProceedings{pmlr-v97-jeong19d,\n title = \t {Learning Discrete and Continuous Factors of Data via Alternating Disentanglement},\n author = {Jeong, Yeonwoo and Song, Hyun Oh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3091--3099},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jeong19d/jeong19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/jeong19d.html},\n abstract = \t {We address the problem of unsupervised disentanglement of discrete and continuous explanatory factors of data. We first show a simple procedure for minimizing the total correlation of the continuous latent variables without having to use a discriminator network or perform importance sampling, via cascading the information flow in the beta-VAE framework. Furthermore, we propose a method which avoids offloading the entire burden of jointly modeling the continuous and discrete factors to the variational encoder by employing a separate discrete inference procedure. This leads to an interesting alternating minimization problem which switches between finding the most likely discrete configuration given the continuous factors and updating the variational encoder based on the computed discrete factors. Experiments show that the proposed method clearly disentangles discrete factors and significantly outperforms current disentanglement methods based on the disentanglement score and inference network classification score. The source code is available at https://github.com/snumllab/DisentanglementICML19.}\n}", "pdf": "http://proceedings.mlr.press/v97/jeong19d/jeong19d.pdf", "supp": "", "pdf_size": 3501981, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14742637203782847188&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science and Engineering, Seoul National University, Seoul, Korea; Department of Computer Science and Engineering, Seoul National University, Seoul, Korea", "aff_domain": "snu.ac.kr;snu.ac.kr", "email": "snu.ac.kr;snu.ac.kr", "github": "https://github.com/snu-mllab/DisentanglementICML19", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/jeong19d.html", "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Learning Distance for Sequences by Learning a Ground Metric", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3624", "id": "3624", "author_site": "Bing Su, Ying Wu", "author": "Bing Su; Ying Wu", "abstract": "Learning distances that operate directly on multi-dimensional sequences is challenging because such distances are structural by nature and the vectors in sequences are not independent. Generally, distances for sequences heavily depend on the ground metric between the vectors in sequences. We propose to learn the distance for sequences through learning a ground Mahalanobis metric for the vectors in sequences. 
The learning samples are sequences of vectors for which how the ground metric between vectors induces the overall distance is given, and the objective is that the distance induced by the learned ground metric produces large values for sequences from different classes and small values for those from the same class. We formulate the metric as a parameter of the distance, bring closer each sequence to an associated virtual sequence w.r.t. the distance to reduce the number of constraints, and develop a general iterative solution for any ground-metric-based sequence distance. Experiments on several sequence datasets demonstrate the effectiveness and efficiency of our method.", "bibtex": "@InProceedings{pmlr-v97-su19b,\n title = \t {Learning Distance for Sequences by Learning a Ground Metric},\n author = {Su, Bing and Wu, Ying},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6015--6025},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/su19b/su19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/su19b.html},\n abstract = \t {Learning distances that operate directly on multi-dimensional sequences is challenging because such distances are structural by nature and the vectors in sequences are not independent. Generally, distances for sequences heavily depend on the ground metric between the vectors in sequences. We propose to learn the distance for sequences through learning a ground Mahalanobis metric for the vectors in sequences. The learning samples are sequences of vectors for which how the ground metric between vectors induces the overall distance is given, and the objective is that the distance induced by the learned ground metric produces large values for sequences from different classes and small values for those from the same class. We formulate the metric as a parameter of the distance, bring closer each sequence to an associated virtual sequence w.r.t. the distance to reduce the number of constraints, and develop a general iterative solution for any ground-metric-based sequence distance. 
Experiments on several sequence datasets demonstrate the effectiveness and efficiency of our method.}\n}", "pdf": "http://proceedings.mlr.press/v97/su19b/su19b.pdf", "supp": "", "pdf_size": 125484, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5489681468541171674&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Science & Technology on Integrated Information System Laboratory, Institute of Software, Chinese Academy of Sciences, Beijing, China; Department of Electrical and Computer Engineering, Northwestern University, Evanston, IL, USA", "aff_domain": "gmail.com; ", "email": "gmail.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/su19b.html", "aff_unique_index": "0;1", "aff_unique_norm": "Chinese Academy of Sciences;Northwestern University", "aff_unique_dep": "Institute of Software;Department of Electrical and Computer Engineering", "aff_unique_url": "http://www.ios.ac.cn;https://www.northwestern.edu", "aff_unique_abbr": "CAS;NU", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Beijing;Evanston", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Learning Fast Algorithms for Linear Transforms Using Butterfly Factorizations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3573", "id": "3573", "author_site": "Tri Dao, Albert Gu, Matthew Eichhorn, Atri Rudra, Christopher Re", "author": "Tri Dao; Albert Gu; Matthew Eichhorn; Atri Rudra; Christopher Re", "abstract": "Fast linear transforms are ubiquitous in machine learning, including the discrete Fourier transform, discrete cosine transform, and other structured transformations such as convolutions. All of these transforms can be represented by dense matrix-vector multiplication, yet each has a specialized and highly efficient (subquadratic) algorithm. We ask to what extent hand-crafting these algorithms and implementations is necessary, what structural prior they encode, and how much knowledge is required to automatically learn a fast algorithm for a provided structured transform. Motivated by a characterization of fast matrix-vector multiplication as products of sparse matrices, we introduce a parameterization of divide-and-conquer methods that is capable of representing a large class of transforms. This generic formulation can automatically learn an efficient algorithm for many important transforms; for example, it recovers the $O(N \\log N)$ Cooley-Tukey FFT algorithm to machine precision, for dimensions $N$ up to $1024$. Furthermore, our method can be incorporated as a lightweight replacement of generic matrices in machine learning pipelines to learn efficient and compressible transformations. 
On a standard task of compressing a single hidden-layer network, our method exceeds the classification accuracy of unconstrained matrices on CIFAR-10 by 3.9 points\u2014the first time a structured approach has done so\u2014with 4X faster inference speed and 40X fewer parameters.", "bibtex": "@InProceedings{pmlr-v97-dao19a,\n title = \t {Learning Fast Algorithms for Linear Transforms Using Butterfly Factorizations},\n author = {Dao, Tri and Gu, Albert and Eichhorn, Matthew and Rudra, Atri and Re, Christopher},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1517--1527},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/dao19a/dao19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/dao19a.html},\n abstract = \t {Fast linear transforms are ubiquitous in machine learning, including the discrete Fourier transform, discrete cosine transform, and other structured transformations such as convolutions. All of these transforms can be represented by dense matrix-vector multiplication, yet each has a specialized and highly efficient (subquadratic) algorithm. We ask to what extent hand-crafting these algorithms and implementations is necessary, what structural prior they encode, and how much knowledge is required to automatically learn a fast algorithm for a provided structured transform. Motivated by a characterization of fast matrix-vector multiplication as products of sparse matrices, we introduce a parameterization of divide-and-conquer methods that is capable of representing a large class of transforms. This generic formulation can automatically learn an efficient algorithm for many important transforms; for example, it recovers the $O(N \\log N)$ Cooley-Tukey FFT algorithm to machine precision, for dimensions $N$ up to $1024$. Furthermore, our method can be incorporated as a lightweight replacement of generic matrices in machine learning pipelines to learn efficient and compressible transformations. 
On a standard task of compressing a single hidden-layer network, our method exceeds the classification accuracy of unconstrained matrices on CIFAR-10 by 3.9 points\u2014the first time a structured approach has done so\u2014with 4X faster inference speed and 40X fewer parameters.}\n}", "pdf": "http://proceedings.mlr.press/v97/dao19a/dao19a.pdf", "supp": "", "pdf_size": 503697, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8670371133727236715&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Stanford University, USA; Department of Computer Science, Stanford University, USA; Department of Computer Science and Engineering, University at Buffalo, SUNY, USA; Department of Computer Science and Engineering, University at Buffalo, SUNY, USA; Department of Computer Science, Stanford University, USA", "aff_domain": "cs.stanford.edu; ; ; ; ", "email": "cs.stanford.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/dao19a.html", "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Stanford University;University at Buffalo", "aff_unique_dep": "Department of Computer Science;Department of Computer Science and Engineering", "aff_unique_url": "https://www.stanford.edu;https://www.buffalo.edu", "aff_unique_abbr": "Stanford;UB", "aff_campus_unique_index": "0;0;1;1;0", "aff_campus_unique": "Stanford;Buffalo", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Generative Models across Incomparable Spaces", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3687", "id": "3687", "author_site": "Charlotte Bunne, David Alvarez-Melis, Andreas Krause, Stefanie Jegelka", "author": "Charlotte Bunne; David Alvarez-Melis; Andreas Krause; Stefanie Jegelka", "abstract": "Generative Adversarial Networks have shown remarkable success in learning a distribution that faithfully recovers a reference distribution in its entirety. However, in some cases, we may want to only learn some aspects (e.g., cluster or manifold structure), while modifying others (e.g., style, orientation or dimension). In this work, we propose an approach to learn generative models across such incomparable spaces, and demonstrate how to steer the learned distribution towards target properties. A key component of our model is the Gromov-Wasserstein distance, a notion of discrepancy that compares distributions relationally rather than absolutely. 
While this framework subsumes current generative models in identically reproducing distributions, its inherent flexibility allows application to tasks in manifold learning, relational learning and cross-domain learning.", "bibtex": "@InProceedings{pmlr-v97-bunne19a,\n title = \t {Learning Generative Models across Incomparable Spaces},\n author = {Bunne, Charlotte and Alvarez-Melis, David and Krause, Andreas and Jegelka, Stefanie},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {851--861},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bunne19a/bunne19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bunne19a.html},\n abstract = \t {Generative Adversarial Networks have shown remarkable success in learning a distribution that faithfully recovers a reference distribution in its entirety. However, in some cases, we may want to only learn some aspects (e.g., cluster or manifold structure), while modifying others (e.g., style, orientation or dimension). In this work, we propose an approach to learn generative models across such incomparable spaces, and demonstrate how to steer the learned distribution towards target properties. A key component of our model is the Gromov-Wasserstein distance, a notion of discrepancy that compares distributions relationally rather than absolutely. While this framework subsumes current generative models in identically reproducing distributions, its inherent flexibility allows application to tasks in manifold learning, relational learning and cross-domain learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/bunne19a/bunne19a.pdf", "supp": "", "pdf_size": 2484921, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14918252695980798695&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 15, "aff": "Department of Computer Science, Eidgen\u00f6ssische Technische Hochschule (ETH), Z\u00fcrich, Switzerland; Computer Science and Artificial Intelligence Laboratory (CSAIL), Massachusetts Institute of Technology (MIT), Cambridge, USA; Department of Computer Science, Eidgen\u00f6ssische Technische Hochschule (ETH), Z\u00fcrich, Switzerland; Computer Science and Artificial Intelligence Laboratory (CSAIL), Massachusetts Institute of Technology (MIT), Cambridge, USA", "aff_domain": "ethz.ch; ; ; ", "email": "ethz.ch; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/bunne19a.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Eidgen\u00f6ssische Technische Hochschule (ETH) Z\u00fcrich;Massachusetts Institute of Technology", "aff_unique_dep": "Department of Computer Science;Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.ethz.ch;https://www.mit.edu", "aff_unique_abbr": "ETH Z\u00fcrich;MIT", "aff_campus_unique_index": "0;1;0;1", "aff_campus_unique": "Z\u00fcrich;Cambridge", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "Switzerland;United States" }, { "title": "Learning Hawkes Processes Under Synchronization Noise", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3756", "id": "3756", "author_site": "William Trouleau, Jalal Etesami, Matthias Grossglauser, Negar Kiyavash, Patrick Thiran", "author": "William Trouleau; Jalal Etesami; Matthias 
Grossglauser; Negar Kiyavash; Patrick Thiran", "abstract": "Multivariate Hawkes processes (MHP) are widely used in a variety of fields to model the occurrence of discrete events. Prior work on learning MHPs has only focused on inference in the presence of perfect traces without noise. We address the problem of learning the causal structure of MHPs when observations are subject to an unknown delay. In particular, we introduce the so-called synchronization noise, where the stream of events generated by each dimension is subject to a random and unknown time shift. We characterize the robustness of the classic maximum likelihood estimator to synchronization noise, and we introduce a new approach for learning the causal structure in the presence of noise. Our experimental results show that our approach accurately recovers the causal structure of MHPs for a wide range of noise levels, and significantly outperforms classic estimation methods.", "bibtex": "@InProceedings{pmlr-v97-trouleau19a,\n title = \t {Learning {H}awkes Processes Under Synchronization Noise},\n author = {Trouleau, William and Etesami, Jalal and Grossglauser, Matthias and Kiyavash, Negar and Thiran, Patrick},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6325--6334},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/trouleau19a/trouleau19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/trouleau19a.html},\n abstract = \t {Multivariate Hawkes processes (MHP) are widely used in a variety of fields to model the occurrence of discrete events. Prior work on learning MHPs has only focused on inference in the presence of perfect traces without noise. We address the problem of learning the causal structure of MHPs when observations are subject to an unknown delay. In particular, we introduce the so-called synchronization noise, where the stream of events generated by each dimension is subject to a random and unknown time shift. We characterize the robustness of the classic maximum likelihood estimator to synchronization noise, and we introduce a new approach for learning the causal structure in the presence of noise. Our experimental results show that our approach accurately recovers the causal structure of MHPs for a wide range of noise levels, and significantly outperforms classic estimation methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/trouleau19a/trouleau19a.pdf", "supp": "", "pdf_size": 734672, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8492038023472776568&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "School of Computer and Communication Sciences, EPFL, Lausanne, Switzerland; Bosch Center for Arti\ufb01cial Intelligence; Dept. of Electrical and Computer Eng. (ECE), Georgia Institute of Technology; Dept. of Industrial and Systems Eng. 
(ISyE), Georgia Institute of Technology; School of Computer and Communication Sciences, EPFL, Lausanne, Switzerland", "aff_domain": "epfl.ch; ; ; ; ", "email": "epfl.ch; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/trouleau19a.html", "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "EPFL;Bosch Center for Arti\ufb01cial Intelligence;Georgia Institute of Technology", "aff_unique_dep": "School of Computer and Communication Sciences;Artificial Intelligence;Dept. of Electrical and Computer Eng. (ECE)", "aff_unique_url": "https://www.epfl.ch;https://www.bosch-ai.com;https://www.gatech.edu", "aff_unique_abbr": "EPFL;BCAI;Georgia Tech", "aff_campus_unique_index": "0;2;2;0", "aff_campus_unique": "Lausanne;;Atlanta", "aff_country_unique_index": "0;1;2;2;0", "aff_country_unique": "Switzerland;Germany;United States" }, { "title": "Learning Latent Dynamics for Planning from Pixels", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3764", "id": "3764", "author_site": "Danijar Hafner, Timothy Lillicrap, Ian Fischer, Ruben Villegas, David Ha, Honglak Lee, James Davidson", "author": "Danijar Hafner; Timothy Lillicrap; Ian Fischer; Ruben Villegas; David Ha; Honglak Lee; James Davidson", "abstract": "Planning has been very successful for control tasks with known environment dynamics. To leverage planning in unknown environments, the agent needs to learn the dynamics from interactions with the world. However, learning dynamics models that are accurate enough for planning has been a long-standing challenge, especially in image-based domains. We propose the Deep Planning Network (PlaNet), a purely model-based agent that learns the environment dynamics from images and chooses actions through fast online planning in latent space. To achieve high performance, the dynamics model must accurately predict the rewards ahead for multiple time steps. We approach this using a latent dynamics model with both deterministic and stochastic transition components. Moreover, we propose a multi-step variational inference objective that we name latent overshooting. Using only pixel observations, our agent solves continuous control tasks with contact dynamics, partial observability, and sparse rewards, which exceed the difficulty of tasks that were previously solved by planning with learned models. PlaNet uses substantially fewer episodes and reaches final performance close to and sometimes higher than strong model-free algorithms.", "bibtex": "@InProceedings{pmlr-v97-hafner19a,\n title = \t {Learning Latent Dynamics for Planning from Pixels},\n author = {Hafner, Danijar and Lillicrap, Timothy and Fischer, Ian and Villegas, Ruben and Ha, David and Lee, Honglak and Davidson, James},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2555--2565},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hafner19a/hafner19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hafner19a.html},\n abstract = \t {Planning has been very successful for control tasks with known environment dynamics. To leverage planning in unknown environments, the agent needs to learn the dynamics from interactions with the world. 
However, learning dynamics models that are accurate enough for planning has been a long-standing challenge, especially in image-based domains. We propose the Deep Planning Network (PlaNet), a purely model-based agent that learns the environment dynamics from images and chooses actions through fast online planning in latent space. To achieve high performance, the dynamics model must accurately predict the rewards ahead for multiple time steps. We approach this using a latent dynamics model with both deterministic and stochastic transition components. Moreover, we propose a multi-step variational inference objective that we name latent overshooting. Using only pixel observations, our agent solves continuous control tasks with contact dynamics, partial observability, and sparse rewards, which exceed the difficulty of tasks that were previously solved by planning with learned models. PlaNet uses substantially fewer episodes and reaches final performance close to and sometimes higher than strong model-free algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/hafner19a/hafner19a.pdf", "supp": "", "pdf_size": 834694, "gs_citation": 1823, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17717536865000191198&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Google Brain + University of Toronto; DeepMind; Google Research; Google Brain + University of Michigan; Google Brain; Google Brain; Google Brain", "aff_domain": "danijar.com; ; ; ; ; ; ", "email": "danijar.com; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/hafner19a.html", "aff_unique_index": "0+1;2;0;0+3;0;0;0", "aff_unique_norm": "Google;University of Toronto;DeepMind;University of Michigan", "aff_unique_dep": "Google Brain;;;", "aff_unique_url": "https://brain.google.com;https://www.utoronto.ca;https://deepmind.com;https://www.umich.edu", "aff_unique_abbr": "Google Brain;U of T;DeepMind;UM", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+1;2;0;0+0;0;0;0", "aff_country_unique": "United States;Canada;United Kingdom" }, { "title": "Learning Linear-Quadratic Regulators Efficiently with only $\sqrt{T}$ Regret", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3803", "id": "3803", "author_site": "Alon Cohen, Tomer Koren, Yishay Mansour", "author": "Alon Cohen; Tomer Koren; Yishay Mansour", "abstract": "We present the first computationally-efficient algorithm with $\widetilde{O}(\sqrt{T})$ regret for learning in Linear Quadratic Control systems with unknown dynamics. 
By that, we resolve an open question of Abbasi-Yadkori and Szepesvari (2011) and Dean, Mania, Matni, Recht, and Tu (2018).", "bibtex": "@InProceedings{pmlr-v97-cohen19b,\n title = \t {Learning Linear-Quadratic Regulators Efficiently with only $\sqrt{T}$ Regret},\n author = {Cohen, Alon and Koren, Tomer and Mansour, Yishay},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1300--1309},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cohen19b/cohen19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/cohen19b.html},\n abstract = \t {We present the first computationally-efficient algorithm with $\widetilde{O}(\sqrt{T})$ regret for learning in Linear Quadratic Control systems with unknown dynamics. By that, we resolve an open question of Abbasi-Yadkori and Szepesvari (2011) and Dean, Mania, Matni, Recht, and Tu (2018).}\n}", "pdf": "http://proceedings.mlr.press/v97/cohen19b/cohen19b.pdf", "supp": "", "pdf_size": 253003, "gs_citation": 224, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14858797898450179882&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Google Research, Tel-Aviv + Technion\u2014Israel Inst. of Technology; Google Brain, Mountain View; Tel-Aviv University", "aff_domain": "technion.ac.il; ; ", "email": "technion.ac.il; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/cohen19b.html", "aff_unique_index": "0+1;0;2", "aff_unique_norm": "Google;Technion\u2014Israel Institute of Technology;Tel Aviv University", "aff_unique_dep": "Google Research;;", "aff_unique_url": "https://research.google;https://www.technion.ac.il/en/;https://www.tau.ac.il", "aff_unique_abbr": "Google;Technion;TAU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Tel-Aviv;;Mountain View", "aff_country_unique_index": "0+0;1;0", "aff_country_unique": "Israel;United States" }, { "title": "Learning Models from Data with Measurement Error: Tackling Underreporting", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4067", "id": "4067", "author_site": "Roy Adams, Yuelong Ji, Xiaobin Wang, Suchi Saria", "author": "Roy Adams; Yuelong Ji; Xiaobin Wang; Suchi Saria", "abstract": "Measurement error in observational datasets can lead to systematic bias in inferences based on these datasets. As studies based on observational data are increasingly used to inform decisions with real-world impact, it is critical that we develop a robust set of techniques for analyzing and adjusting for these biases. In this paper we present a method for estimating the distribution of an outcome given a binary exposure that is subject to underreporting. Our method is based on a missing data view of the measurement error problem, where the true exposure is treated as a latent variable that is marginalized out of a joint model. We prove three different conditions under which the outcome distribution can still be identified from data containing only error-prone observations of the exposure. We demonstrate this method on synthetic data and analyze its sensitivity to near violations of the identifiability conditions. Finally, we use this method to estimate the effects of maternal smoking and heroin use during pregnancy on childhood obesity, two important problems from public health. 
Using the proposed method, we estimate these effects using only subject-reported drug use data and refine the range of estimates generated by a sensitivity analysis-based approach. Further, the estimates produced by our method are consistent with existing literature on both the effects of maternal smoking and the rate at which subjects underreport smoking.", "bibtex": "@InProceedings{pmlr-v97-adams19a,\n title = \t {Learning Models from Data with Measurement Error: Tackling Underreporting},\n author = {Adams, Roy and Ji, Yuelong and Wang, Xiaobin and Saria, Suchi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {61--70},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/adams19a/adams19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/adams19a.html},\n abstract = \t {Measurement error in observational datasets can lead to systematic bias in inferences based on these datasets. As studies based on observational data are increasingly used to inform decisions with real-world impact, it is critical that we develop a robust set of techniques for analyzing and adjusting for these biases. In this paper we present a method for estimating the distribution of an outcome given a binary exposure that is subject to underreporting. Our method is based on a missing data view of the measurement error problem, where the true exposure is treated as a latent variable that is marginalized out of a joint model. We prove three different conditions under which the outcome distribution can still be identified from data containing only error-prone observations of the exposure. We demonstrate this method on synthetic data and analyze its sensitivity to near violations of the identifiability conditions. Finally, we use this method to estimate the effects of maternal smoking and heroin use during pregnancy on childhood obesity, two important problems from public health. Using the proposed method, we estimate these effects using only subject-reported drug use data and refine the range of estimates generated by a sensitivity analysis-based approach. 
Further, the estimates produced by our method are consistent with existing literature on both the effects of maternal smoking and the rate at which subjects underreport smoking.}\n}", "pdf": "http://proceedings.mlr.press/v97/adams19a/adams19a.pdf", "supp": "", "pdf_size": 3481076, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11857060552177712436&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Johns Hopkins University; Center on the Life Origins of Disease, Department of Population, Family, and Reproductive Health, Johns Hopkins University Bloomberg School of Public Health; Center on the Life Origins of Disease, Department of Population, Family, and Reproductive Health, Johns Hopkins University Bloomberg School of Public Health; Department of Applied Math and Statistics, Johns Hopkins University+Bayesian Health", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/adams19a.html", "aff_unique_index": "0;1;1;0+2", "aff_unique_norm": "Johns Hopkins University;Johns Hopkins University Bloomberg School of Public Health;Bayesian Health", "aff_unique_dep": "Department of Computer Science;Department of Population, Family, and Reproductive Health;", "aff_unique_url": "https://www.jhu.edu;https://www.jhsph.edu;", "aff_unique_abbr": "JHU;JHU Bloomberg School;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Learning Neurosymbolic Generative Models via Program Synthesis", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3775", "id": "3775", "author_site": "Halley R Young, Osbert Bastani, Mayur Naik", "author": "Halley Young; Osbert Bastani; Mayur Naik", "abstract": "Generative models have become significantly more powerful in recent years. However, these models continue to have difficulty capturing global structure in data. For example, images of buildings typically contain spatial patterns such as windows repeating at regular intervals, but state-of-the-art models have difficulty generating these patterns. We propose to address this problem by incorporating programs representing global structure into generative models{\u2014}e.g., a 2D for-loop may represent a repeating pattern of windows{\u2014}along with a framework for learning these models by leveraging program synthesis to obtain training data. On both synthetic and real-world data, we demonstrate that our approach substantially outperforms state-of-the-art at both generating and completing images with global structure.", "bibtex": "@InProceedings{pmlr-v97-young19a,\n title = \t {Learning Neurosymbolic Generative Models via Program Synthesis},\n author = {Young, Halley and Bastani, Osbert and Naik, Mayur},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7144--7153},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/young19a/young19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/young19a.html},\n abstract = \t {Generative models have become significantly more powerful in recent years. However, these models continue to have difficulty capturing global structure in data. 
For example, images of buildings typically contain spatial patterns such as windows repeating at regular intervals, but state-of-the-art models have difficulty generating these patterns. We propose to address this problem by incorporating programs representing global structure into generative models{\u2014}e.g., a 2D for-loop may represent a repeating pattern of windows{\u2014}along with a framework for learning these models by leveraging program synthesis to obtain training data. On both synthetic and real-world data, we demonstrate that our approach substantially outperforms state-of-the-art at both generating and completing images with global structure.}\n}", "pdf": "http://proceedings.mlr.press/v97/young19a/young19a.pdf", "supp": "", "pdf_size": 3961433, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9125289543622355706&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16, "aff": "University of Pennsylvania; University of Pennsylvania; University of Pennsylvania", "aff_domain": "seas.upenn.edu; ; ", "email": "seas.upenn.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/young19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Novel Policies For Tasks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4274", "id": "4274", "author_site": "Yunbo Zhang, Wenhao Yu, Greg Turk", "author": "Yunbo Zhang; Wenhao Yu; Greg Turk", "abstract": "In this work, we present a reinforcement learning algorithm that can find a variety of policies (novel policies) for a task that is given by a task reward function. Our method does this by creating a second reward function that recognizes previously seen state sequences and rewards those by novelty, which is measured using autoencoders that have been trained on state sequences from previously discovered policies. We present a two-objective update technique for policy gradient algorithms in which each update of the policy is a compromise between improving the task reward and improving the novelty reward. Using this method, we end up with a collection of policies that solves a given task as well as carrying out action sequences that are distinct from one another. We demonstrate this method on maze navigation tasks, a reaching task for a simulated robot arm, and a locomotion task for a hopper. 
We also demonstrate the effectiveness of our approach on deceptive tasks in which policy gradient methods often get stuck.", "bibtex": "@InProceedings{pmlr-v97-zhang19q,\n title = \t {Learning Novel Policies For Tasks},\n author = {Zhang, Yunbo and Yu, Wenhao and Turk, Greg},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7483--7492},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19q/zhang19q.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19q.html},\n abstract = \t {In this work, we present a reinforcement learning algorithm that can find a variety of policies (novel policies) for a task that is given by a task reward function. Our method does this by creating a second reward function that recognizes previously seen state sequences and rewards those by novelty, which is measured using autoencoders that have been trained on state sequences from previously discovered policies. We present a two-objective update technique for policy gradient algorithms in which each update of the policy is a compromise between improving the task reward and improving the novelty reward. Using this method, we end up with a collection of policies that solves a given task as well as carrying out action sequences that are distinct from one another. We demonstrate this method on maze navigation tasks, a reaching task for a simulated robot arm, and a locomotion task for a hopper. We also demonstrate the effectiveness of our approach on deceptive tasks in which policy gradient methods often get stuck.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19q/zhang19q.pdf", "supp": "", "pdf_size": 1467829, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3873676737834589628&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Interactive Computing, Georgia Institute of Technology, USA; School of Interactive Computing, Georgia Institute of Technology, USA; School of Interactive Computing, Georgia Institute of Technology, USA", "aff_domain": "gatech.edu;gatech.edu;cc.gatech.edu", "email": "gatech.edu;gatech.edu;cc.gatech.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zhang19q.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "School of Interactive Computing", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Optimal Fair Policies", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4266", "id": "4266", "author_site": "Razieh Nabi, Daniel Malinsky, Ilya Shpitser", "author": "Razieh Nabi; Daniel Malinsky; Ilya Shpitser", "abstract": "Systematic discriminatory biases present in our society influence the way data is collected and stored, the way variables are defined, and the way scientific findings are put into practice as policy. Automated decision procedures and learning algorithms applied to such data may serve to perpetuate existing injustice or unfairness in our society. 
In this paper, we consider how to make optimal but fair decisions, which \u201cbreak the cycle of injustice\u201d by correcting for the unfair dependence of both decisions and outcomes on sensitive features (e.g., variables that correspond to gender, race, disability, or other protected attributes). We use methods from causal inference and constrained optimization to learn optimal policies in a way that addresses multiple potential biases which afflict data analysis in sensitive contexts, extending the approach of Nabi & Shpitser (2018). Our proposal comes equipped with the theoretical guarantee that the chosen fair policy will induce a joint distribution for new instances that satisfies given fairness constraints. We illustrate our approach with both synthetic data and real criminal justice data.", "bibtex": "@InProceedings{pmlr-v97-nabi19a,\n title = \t {Learning Optimal Fair Policies},\n author = {Nabi, Razieh and Malinsky, Daniel and Shpitser, Ilya},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4674--4682},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nabi19a/nabi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nabi19a.html},\n abstract = \t {Systematic discriminatory biases present in our society influence the way data is collected and stored, the way variables are defined, and the way scientific findings are put into practice as policy. Automated decision procedures and learning algorithms applied to such data may serve to perpetuate existing injustice or unfairness in our society. In this paper, we consider how to make optimal but fair decisions, which \u201cbreak the cycle of injustice\u201d by correcting for the unfair dependence of both decisions and outcomes on sensitive features (e.g., variables that correspond to gender, race, disability, or other protected attributes). We use methods from causal inference and constrained optimization to learn optimal policies in a way that addresses multiple potential biases which afflict data analysis in sensitive contexts, extending the approach of Nabi & Shpitser (2018). Our proposal comes equipped with the theoretical guarantee that the chosen fair policy will induce a joint distribution for new instances that satisfies given fairness constraints. 
We illustrate our approach with both synthetic data and real criminal justice data.}\n}", "pdf": "http://proceedings.mlr.press/v97/nabi19a/nabi19a.pdf", "supp": "", "pdf_size": 338348, "gs_citation": 126, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13513980497976308534&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, Johns Hopkins University, Baltimore, MD, USA; Department of Computer Science, Johns Hopkins University, Baltimore, MD, USA; Department of Computer Science, Johns Hopkins University, Baltimore, MD, USA", "aff_domain": "jhu.edu; ; ", "email": "jhu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/nabi19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Baltimore", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Learning Optimal Linear Regularizers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3760", "id": "3760", "author": "Matthew Streeter", "abstract": "We present algorithms for efficiently learning regularizers that improve generalization. Our approach is based on the insight that regularizers can be viewed as upper bounds on the generalization gap, and that reducing the slack in the bound can improve performance on test data. For a broad class of regularizers, the hyperparameters that give the best upper bound can be computed using linear programming. Under certain Bayesian assumptions, solving the LP lets us \"jump\" to the optimal hyperparameters given very limited data. This suggests a natural algorithm for tuning regularization hyperparameters, which we show to be effective on both real and synthetic data.", "bibtex": "@InProceedings{pmlr-v97-streeter19a,\n title = \t {Learning Optimal Linear Regularizers},\n author = {Streeter, Matthew},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5996--6004},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/streeter19a/streeter19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/streeter19a.html},\n abstract = \t {We present algorithms for efficiently learning regularizers that improve generalization. Our approach is based on the insight that regularizers can be viewed as upper bounds on the generalization gap, and that reducing the slack in the bound can improve performance on test data. For a broad class of regularizers, the hyperparameters that give the best upper bound can be computed using linear programming. Under certain Bayesian assumptions, solving the LP lets us \"jump\" to the optimal hyperparameters given very limited data. 
This suggests a natural algorithm for tuning regularization hyperparameters, which we show to be effective on both real and synthetic data.}\n}", "pdf": "http://proceedings.mlr.press/v97/streeter19a/streeter19a.pdf", "supp": "", "pdf_size": 663910, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8864155713935170950&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Google Research", "aff_domain": "google.com", "email": "google.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/streeter19a.html", "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Learning Structured Decision Problems with Unawareness", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3648", "id": "3648", "author_site": "Craig Innes, Alex Lascarides", "author": "Craig Innes; Alex Lascarides", "abstract": "Structured models of decision making often assume an agent is aware of all possible states and actions in advance. This assumption is sometimes untenable. In this paper, we learn Bayesian Decision Networks from both domain exploration and expert assertions in a way which guarantees convergence to optimal behaviour, even when the agent starts unaware of actions or belief variables that are critical to success. Our experiments show that our agent learns optimal behaviour on both small and large decision problems, and that allowing an agent to conserve information upon making new discoveries results in faster convergence.", "bibtex": "@InProceedings{pmlr-v97-innes19a,\n title = \t {Learning Structured Decision Problems with Unawareness},\n author = {Innes, Craig and Lascarides, Alex},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2941--2950},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/innes19a/innes19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/innes19a.html},\n abstract = \t {Structured models of decision making often assume an agent is aware of all possible states and actions in advance. This assumption is sometimes untenable. In this paper, we learn Bayesian Decision Networks from both domain exploration and expert assertions in a way which guarantees convergence to optimal behaviour, even when the agent starts unaware of actions or belief variables that are critical to success. 
Our experiments show that our agent learns optimal behaviour on both small and large decision problems, and that allowing an agent to conserve information upon making new discoveries results in faster convergence.}\n}", "pdf": "http://proceedings.mlr.press/v97/innes19a/innes19a.pdf", "supp": "", "pdf_size": 574334, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5308307232789907026&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "University of Edinburgh, UK; University of Edinburgh, UK", "aff_domain": "ed.ac.uk;inf.ed.ac.uk", "email": "ed.ac.uk;inf.ed.ac.uk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/innes19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Learning What and Where to Transfer", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3985", "id": "3985", "author_site": "Yunhun Jang, Hankook Lee, Sung Ju Hwang, Jinwoo Shin", "author": "Yunhun Jang; Hankook Lee; Sung Ju Hwang; Jinwoo Shin", "abstract": "As the application of deep learning has expanded to real-world problems with insufficient volume of training data, transfer learning recently has gained much attention as means of improving the performance in such small-data regime. However, when existing methods are applied between heterogeneous architectures and tasks, it becomes more important to manage their detailed configurations and often requires exhaustive tuning on them for the desired performance. To address the issue, we propose a novel transfer learning approach based on meta-learning that can automatically learn what knowledge to transfer from the source network to where in the target network. Given source and target networks, we propose an efficient training scheme to learn meta-networks that decide (a) which pairs of layers between the source and target networks should be matched for knowledge transfer and (b) which features and how much knowledge from each feature should be transferred. We validate our meta-transfer approach against recent transfer learning methods on various datasets and network architectures, on which our automated scheme significantly outperforms the prior baselines that find \u201cwhat and where to transfer\u201d in a hand-crafted manner.", "bibtex": "@InProceedings{pmlr-v97-jang19b,\n title = \t {Learning What and Where to Transfer},\n author = {Jang, Yunhun and Lee, Hankook and Hwang, Sung Ju and Shin, Jinwoo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3030--3039},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jang19b/jang19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/jang19b.html},\n abstract = \t {As the application of deep learning has expanded to real-world problems with insufficient volume of training data, transfer learning recently has gained much attention as means of improving the performance in such small-data regime. 
However, when existing methods are applied between heterogeneous architectures and tasks, it becomes more important to manage their detailed configurations and often requires exhaustive tuning on them for the desired performance. To address the issue, we propose a novel transfer learning approach based on meta-learning that can automatically learn what knowledge to transfer from the source network to where in the target network. Given source and target networks, we propose an efficient training scheme to learn meta-networks that decide (a) which pairs of layers between the source and target networks should be matched for knowledge transfer and (b) which features and how much knowledge from each feature should be transferred. We validate our meta-transfer approach against recent transfer learning methods on various datasets and network architectures, on which our automated scheme significantly outperforms the prior baselines that find \u201cwhat and where to transfer\u201d in a hand-crafted manner.}\n}", "pdf": "http://proceedings.mlr.press/v97/jang19b/jang19b.pdf", "supp": "", "pdf_size": 4003869, "gs_citation": 183, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12979255639867638665&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "School of Electrical Engineering, KAIST, Korea + OMNIOUS, Korea; School of Electrical Engineering, KAIST, Korea; School of Computing, KAIST, Korea + Graduate School of AI, KAIST, Korea + AITRICS, Korea; School of Electrical Engineering, KAIST, Korea + Graduate School of AI, KAIST, Korea + AITRICS, Korea", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/jang19b.html", "aff_unique_index": "0+1;0;0+0+2;0+0+2", "aff_unique_norm": "KAIST;OMNIOUS;AITRICS", "aff_unique_dep": "School of Electrical Engineering;;", "aff_unique_url": "https://www.kaist.ac.kr;;https://www.aitrics.com", "aff_unique_abbr": "KAIST;;", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0+0;0+0+0", "aff_country_unique": "South Korea" }, { "title": "Learning a Compressed Sensing Measurement Matrix via Gradient Unrolling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3980", "id": "3980", "author_site": "Shanshan Wu, Alexandros Dimakis, Sujay Sanghavi, Felix Xinnan Yu, Daniel Holtmann-Rice, Dmitry Storcheus, Afshin Rostamizadeh, Sanjiv Kumar", "author": "Shanshan Wu; Alex Dimakis; Sujay Sanghavi; Felix Yu; Daniel Holtmann-Rice; Dmitry Storcheus; Afshin Rostamizadeh; Sanjiv Kumar", "abstract": "Linear encoding of sparse vectors is widely popular, but is commonly data-independent \u2013 missing any possible extra (but a priori unknown) structure beyond sparsity. In this paper we present a new method to learn linear encoders that adapt to data, while still performing well with the widely used $\\ell_1$ decoder. The convex $\\ell_1$ decoder prevents gradient propagation as needed in standard gradient-based training. Our method is based on the insight that unrolling the convex decoder into $T$ projected subgradient steps can address this issue. Our method can be seen as a data-driven way to learn a compressed sensing measurement matrix. We compare the empirical performance of 10 algorithms over 6 sparse datasets (3 synthetic and 3 real). 
Our experiments show that there is indeed additional structure beyond sparsity in the real datasets; our method is able to discover it and exploit it to create excellent reconstructions with fewer measurements (by a factor of 1.1-3x) compared to the previous state-of-the-art methods. We illustrate an application of our method in learning label embeddings for extreme multi-label classification, and empirically show that our method is able to match or outperform the precision scores of SLEEC, which is one of the state-of-the-art embedding-based approaches.", "bibtex": "@InProceedings{pmlr-v97-wu19b,\n title = \t {Learning a Compressed Sensing Measurement Matrix via Gradient Unrolling},\n author = {Wu, Shanshan and Dimakis, Alex and Sanghavi, Sujay and Yu, Felix and Holtmann-Rice, Daniel and Storcheus, Dmitry and Rostamizadeh, Afshin and Kumar, Sanjiv},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6828--6839},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wu19b/wu19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/wu19b.html},\n abstract = \t {Linear encoding of sparse vectors is widely popular, but is commonly data-independent \u2013 missing any possible extra (but a priori unknown) structure beyond sparsity. In this paper we present a new method to learn linear encoders that adapt to data, while still performing well with the widely used $\\ell_1$ decoder. The convex $\\ell_1$ decoder prevents gradient propagation as needed in standard gradient-based training. Our method is based on the insight that unrolling the convex decoder into $T$ projected subgradient steps can address this issue. Our method can be seen as a data-driven way to learn a compressed sensing measurement matrix. We compare the empirical performance of 10 algorithms over 6 sparse datasets (3 synthetic and 3 real). Our experiments show that there is indeed additional structure beyond sparsity in the real datasets; our method is able to discover it and exploit it to create excellent reconstructions with fewer measurements (by a factor of 1.1-3x) compared to the previous state-of-the-art methods. 
We illustrate an application of our method in learning label embeddings for extreme multi-label classification, and empirically show that our method is able to match or outperform the precision scores of SLEEC, which is one of the state-of-the-art embedding-based approaches.}\n}", "pdf": "http://proceedings.mlr.press/v97/wu19b/wu19b.pdf", "supp": "", "pdf_size": 548627, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7047806265254435189&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Electrical and Computer Engineering, University of Texas at Austin, USA+Google Research, New York, USA; Department of Electrical and Computer Engineering, University of Texas at Austin, USA; Department of Electrical and Computer Engineering, University of Texas at Austin, USA; Google Research, New York, USA; Google Research, New York, USA; Google Research, New York, USA; Google Research, New York, USA; Google Research, New York, USA", "aff_domain": "utexas.edu; ; ; ; ; ; ; ", "email": "utexas.edu; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/wu19b.html", "aff_unique_index": "0+1;0;0;1;1;1;1;1", "aff_unique_norm": "University of Texas at Austin;Google", "aff_unique_dep": "Department of Electrical and Computer Engineering;Google Research", "aff_unique_url": "https://www.utexas.edu;https://research.google", "aff_unique_abbr": "UT Austin;Google Research", "aff_campus_unique_index": "0+1;0;0;1;1;1;1;1", "aff_campus_unique": "Austin;New York", "aff_country_unique_index": "0+0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning a Prior over Intent via Meta-Inverse Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3995", "id": "3995", "author_site": "Kelvin Xu, Ellis Ratner, Anca Dragan, Sergey Levine, Chelsea Finn", "author": "Kelvin Xu; Ellis Ratner; Anca Dragan; Sergey Levine; Chelsea Finn", "abstract": "A significant challenge for the practical application of reinforcement learning to real world problems is the need to specify an oracle reward function that correctly defines a task. Inverse reinforcement learning (IRL) seeks to avoid this challenge by instead inferring a reward function from expert demonstrations. While appealing, it can be impractically expensive to collect datasets of demonstrations that cover the variation common in the real world (e.g. opening any type of door). Thus in practice, IRL must commonly be performed with only a limited set of demonstrations where it can be exceedingly difficult to unambiguously recover a reward function. In this work, we exploit the insight that demonstrations from other tasks can be used to constrain the set of possible reward functions by learning a \"prior\" that is specifically optimized for the ability to infer expressive reward functions from limited numbers of demonstrations. 
We demonstrate that our method can efficiently recover rewards from images for novel tasks and provide intuition as to how our approach is analogous to learning a prior.", "bibtex": "@InProceedings{pmlr-v97-xu19d,\n title = \t {Learning a Prior over Intent via Meta-Inverse Reinforcement Learning},\n author = {Xu, Kelvin and Ratner, Ellis and Dragan, Anca and Levine, Sergey and Finn, Chelsea},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6952--6962},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/xu19d/xu19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/xu19d.html},\n abstract = \t {A significant challenge for the practical application of reinforcement learning to real world problems is the need to specify an oracle reward function that correctly defines a task. Inverse reinforcement learning (IRL) seeks to avoid this challenge by instead inferring a reward function from expert demonstrations. While appealing, it can be impractically expensive to collect datasets of demonstrations that cover the variation common in the real world (e.g. opening any type of door). Thus in practice, IRL must commonly be performed with only a limited set of demonstrations where it can be exceedingly difficult to unambiguously recover a reward function. In this work, we exploit the insight that demonstrations from other tasks can be used to constrain the set of possible reward functions by learning a \"prior\" that is specifically optimized for the ability to infer expressive reward functions from limited numbers of demonstrations. We demonstrate that our method can efficiently recover rewards from images for novel tasks and provide intuition as to how our approach is analogous to learning a prior.}\n}", "pdf": "http://proceedings.mlr.press/v97/xu19d/xu19d.pdf", "supp": "", "pdf_size": 5005543, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9219954710940302832&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical Engineering and Computer Science, University of California, Berkeley, USA; Department of Electrical Engineering and Computer Science, University of California, Berkeley, USA; Department of Electrical Engineering and Computer Science, University of California, Berkeley, USA; Department of Electrical Engineering and Computer Science, University of California, Berkeley, USA; Department of Electrical Engineering and Computer Science, University of California, Berkeley, USA", "aff_domain": "berkeley.edu; ; ; ; ", "email": "berkeley.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/xu19d.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Electrical Engineering and Computer Science", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning and Data Selection in Big Datasets", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3733", "id": "3733", "author_site": "Hossein Shokri Ghadikolaei, Hadi Ghauch, 
Carlo Fischione, Mikael Skoglund", "author": "Hossein Shokri Ghadikolaei; Hadi Ghauch; Carlo Fischione; Mikael Skoglund", "abstract": "Finding a dataset of minimal cardinality to characterize the optimal parameters of a model is of paramount importance in machine learning and distributed optimization over a network. This paper investigates the compressibility of large datasets. More specifically, we propose a framework that jointly learns the input-output mapping as well as the most representative samples of the dataset (sufficient dataset). Our analytical results show that the cardinality of the sufficient dataset increases sub-linearly with respect to the original dataset size. Numerical evaluations of real datasets reveal a large compressibility, up to 95%, without a noticeable drop in the learnability performance, measured by the generalization error.", "bibtex": "@InProceedings{pmlr-v97-ghadikolaei19a,\n title = \t {Learning and Data Selection in Big Datasets},\n author = {Ghadikolaei, Hossein Shokri and Ghauch, Hadi and Fischione, Carlo and Skoglund, Mikael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2191--2200},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ghadikolaei19a/ghadikolaei19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ghadikolaei19a.html},\n abstract = \t {Finding a dataset of minimal cardinality to characterize the optimal parameters of a model is of paramount importance in machine learning and distributed optimization over a network. This paper investigates the compressibility of large datasets. More specifically, we propose a framework that jointly learns the input-output mapping as well as the most representative samples of the dataset (sufficient dataset). Our analytical results show that the cardinality of the sufficient dataset increases sub-linearly with respect to the original dataset size. 
Numerical evaluations of real datasets reveal a large compressibility, up to 95%, without a noticeable drop in the learnability performance, measured by the generalization error.}\n}", "pdf": "http://proceedings.mlr.press/v97/ghadikolaei19a/ghadikolaei19a.pdf", "supp": "", "pdf_size": 401692, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10470233448992585915&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "School of Electrical Engineering and Computer Science, KTH Royal Institute of Technology, Stockholm, Sweden+COMELEC Department, Telecom ParisTech, Paris, France; School of Electrical Engineering and Computer Science, KTH Royal Institute of Technology, Stockholm, Sweden+COMELEC Department, Telecom ParisTech, Paris, France; School of Electrical Engineering and Computer Science, KTH Royal Institute of Technology, Stockholm, Sweden; School of Electrical Engineering and Computer Science, KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": "kth.se; ; ; ", "email": "kth.se; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/ghadikolaei19a.html", "aff_unique_index": "0+1;0+1;0;0", "aff_unique_norm": "KTH Royal Institute of Technology;Telecom ParisTech", "aff_unique_dep": "School of Electrical Engineering and Computer Science;COMELEC Department", "aff_unique_url": "https://www.kth.se;https://www.telecom-paristech.fr", "aff_unique_abbr": "KTH;Telecom ParisTech", "aff_campus_unique_index": "0+1;0+1;0;0", "aff_campus_unique": "Stockholm;Paris", "aff_country_unique_index": "0+1;0+1;0;0", "aff_country_unique": "Sweden;France" }, { "title": "Learning deep kernels for exponential family densities", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4255", "id": "4255", "author_site": "Li Kevin Wenliang, D.J. Sutherland, Heiko Strathmann, Arthur Gretton", "author": "Li Wenliang; Danica J. Sutherland; Heiko Strathmann; Arthur Gretton", "abstract": "The kernel exponential family is a rich class of distributions, which can be fit efficiently and with statistical guarantees by score matching. Being required to choose a priori a simple kernel such as the Gaussian, however, limits its practical applicability. We provide a scheme for learning a kernel parameterized by a deep network, which can find complex location-dependent local features of the data geometry. This gives a very rich class of density models, capable of fitting complex structures on moderate-dimensional problems. Compared to deep density models fit via maximum likelihood, our approach provides a complementary set of strengths and tradeoffs: in empirical studies, the former can yield higher likelihoods, whereas the latter gives better estimates of the gradient of the log density, the score, which describes the distribution\u2019s shape.", "bibtex": "@InProceedings{pmlr-v97-wenliang19a,\n title = \t {Learning deep kernels for exponential family densities},\n author = {Wenliang, Li and Sutherland, Danica J. 
and Strathmann, Heiko and Gretton, Arthur},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6737--6746},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wenliang19a/wenliang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/wenliang19a.html},\n abstract = \t {The kernel exponential family is a rich class of distributions, which can be fit efficiently and with statistical guarantees by score matching. Being required to choose a priori a simple kernel such as the Gaussian, however, limits its practical applicability. We provide a scheme for learning a kernel parameterized by a deep network, which can find complex location-dependent local features of the data geometry. This gives a very rich class of density models, capable of fitting complex structures on moderate-dimensional problems. Compared to deep density models fit via maximum likelihood, our approach provides a complementary set of strengths and tradeoffs: in empirical studies, the former can yield higher likelihoods, whereas the latter gives better estimates of the gradient of the log density, the score, which describes the distribution\u2019s shape.}\n}", "pdf": "http://proceedings.mlr.press/v97/wenliang19a/wenliang19a.pdf", "supp": "", "pdf_size": 4255384, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18438114656627425154&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/wenliang19a.html" }, { "title": "Learning from Delayed Outcomes via Proxies with Applications to Recommender Systems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3921", "id": "3921", "author_site": "Timothy Mann, Sven Gowal, Andr\u00e1s Gy\u00f6rgy, Huiyi Hu, Ray Jiang, Balaji Lakshminarayanan, Prav Srinivasan", "author": "Timothy Arthur Mann; Sven Gowal; Andras Gyorgy; Huiyi Hu; Ray Jiang; Balaji Lakshminarayanan; Prav Srinivasan", "abstract": "Predicting delayed outcomes is an important problem in recommender systems (e.g., if customers will finish reading an ebook). We formalize the problem as an adversarial, delayed online learning problem and consider how a proxy for the delayed outcome (e.g., if customers read a third of the book in 24 hours) can help minimize regret, even though the proxy is not available when making a prediction. Motivated by our regret analysis, we propose two neural network architectures: Factored Forecaster (FF) which is ideal if the proxy is informative of the outcome in hindsight, and Residual Factored Forecaster (RFF) that is robust to a non-informative proxy. Experiments on two real-world datasets for predicting human behavior show that RFF outperforms both FF and a direct forecaster that does not make use of the proxy. 
Our results suggest that exploiting proxies by factorization is a promising way to mitigate the impact of long delays in human-behavior prediction tasks.", "bibtex": "@InProceedings{pmlr-v97-mann19a,\n title = \t {Learning from Delayed Outcomes via Proxies with Applications to Recommender Systems},\n author = {Mann, Timothy Arthur and Gowal, Sven and Gyorgy, Andras and Hu, Huiyi and Jiang, Ray and Lakshminarayanan, Balaji and Srinivasan, Prav},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4324--4332},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mann19a/mann19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mann19a.html},\n abstract = \t {Predicting delayed outcomes is an important problem in recommender systems (e.g., if customers will finish reading an ebook). We formalize the problem as an adversarial, delayed online learning problem and consider how a proxy for the delayed outcome (e.g., if customers read a third of the book in 24 hours) can help minimize regret, even though the proxy is not available when making a prediction. Motivated by our regret analysis, we propose two neural network architectures: Factored Forecaster (FF) which is ideal if the proxy is informative of the outcome in hindsight, and Residual Factored Forecaster (RFF) that is robust to a non-informative proxy. Experiments on two real-world datasets for predicting human behavior show that RFF outperforms both FF and a direct forecaster that does not make use of the proxy. Our results suggest that exploiting proxies by factorization is a promising way to mitigate the impact of long delays in human-behavior prediction tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/mann19a/mann19a.pdf", "supp": "", "pdf_size": 731406, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=80397537878287202&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK", "aff_domain": "google.com; ; ; ; ; ; ", "email": "google.com; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/mann19a.html", "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "London", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Learning from a Learner", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4077", "id": "4077", "author_site": "alexis jacq, Matthieu Geist, Ana Paiva, Olivier Pietquin", "author": "Alexis Jacq; Matthieu Geist; Ana Paiva; Olivier Pietquin", "abstract": "In this paper, we propose a novel setting for Inverse Reinforcement Learning (IRL), namely \"Learning from a Learner\" (LfL). As opposed to standard IRL, it does not consist in learning a reward by observing an optimal agent but from observations of another learning (and thus sub-optimal) agent. To do so, we leverage the fact that the observed agent\u2019s policy is assumed to improve over time. 
The ultimate goal of this approach is to recover the actual environment\u2019s reward and to allow the observer to outperform the learner. To recover that reward in practice, we propose methods based on the entropy-regularized policy iteration framework. We discuss different approaches to learn solely from trajectories in the state-action space. We demonstrate the genericity of our method by observing agents implementing various reinforcement learning algorithms. Finally, we show that, on both discrete and continuous state/action tasks, the observer\u2019s performance (that optimizes the recovered reward) can surpass those of the observed agent.", "bibtex": "@InProceedings{pmlr-v97-jacq19a,\n title = \t {Learning from a Learner},\n author = {Jacq, Alexis and Geist, Matthieu and Paiva, Ana and Pietquin, Olivier},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2990--2999},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jacq19a/jacq19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jacq19a.html},\n abstract = \t {In this paper, we propose a novel setting for Inverse Reinforcement Learning (IRL), namely \"Learning from a Learner\" (LfL). As opposed to standard IRL, it does not consist in learning a reward by observing an optimal agent but from observations of another learning (and thus sub-optimal) agent. To do so, we leverage the fact that the observed agent\u2019s policy is assumed to improve over time. The ultimate goal of this approach is to recover the actual environment\u2019s reward and to allow the observer to outperform the learner. To recover that reward in practice, we propose methods based on the entropy-regularized policy iteration framework. We discuss different approaches to learn solely from trajectories in the state-action space. We demonstrate the genericity of our method by observing agents implementing various reinforcement learning algorithms. 
Finally, we show that, on both discrete and continuous state/action tasks, the observer\u2019s performance (that optimizes the recovered reward) can surpass those of the observed agent.}\n}", "pdf": "http://proceedings.mlr.press/v97/jacq19a/jacq19a.pdf", "supp": "", "pdf_size": 2191455, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=82164898949132983&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Google Brain, Paris, France+INESC-ID, IST, University of Lisbon; Google Brain, Paris, France; INESC-ID, IST, University of Lisbon; Google Brain, Paris, France", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/jacq19a.html", "aff_unique_index": "0+1;0;1;0", "aff_unique_norm": "Google;University of Lisbon", "aff_unique_dep": "Google Brain;INESC-ID, IST", "aff_unique_url": "https://brain.google.com;https://www.ulusiada.pt", "aff_unique_abbr": "Google Brain;ULisbon", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0+1;0;1;0", "aff_country_unique": "France;Portugal" }, { "title": "Learning interpretable continuous-time models of latent stochastic dynamical systems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4089", "id": "4089", "author_site": "Lea Duncker, Gergo Bohner, Julien Boussard, Maneesh Sahani", "author": "Lea Duncker; Gergo Bohner; Julien Boussard; Maneesh Sahani", "abstract": "We develop an approach to learn an interpretable semi-parametric model of a latent continuous-time stochastic dynamical system, assuming noisy high-dimensional outputs sampled at uneven times. The dynamics are described by a nonlinear stochastic differential equation (SDE) driven by a Wiener process, with a drift evolution function drawn from a Gaussian process (GP) conditioned on a set of learnt fixed points and corresponding local Jacobian matrices. This form yields a flexible nonparametric model of the dynamics, with a representation corresponding directly to the interpretable portraits routinely employed in the study of nonlinear dynamical systems. The learning algorithm combines inference of continuous latent paths underlying observed data with a sparse variational description of the dynamical process. We demonstrate our approach on simulated data from different nonlinear dynamical systems.", "bibtex": "@InProceedings{pmlr-v97-duncker19a,\n title = \t {Learning interpretable continuous-time models of latent stochastic dynamical systems},\n author = {Duncker, Lea and Bohner, Gergo and Boussard, Julien and Sahani, Maneesh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1726--1734},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/duncker19a/duncker19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/duncker19a.html},\n abstract = \t {We develop an approach to learn an interpretable semi-parametric model of a latent continuous-time stochastic dynamical system, assuming noisy high-dimensional outputs sampled at uneven times. 
The dynamics are described by a nonlinear stochastic differential equation (SDE) driven by a Wiener process, with a drift evolution function drawn from a Gaussian process (GP) conditioned on a set of learnt fixed points and corresponding local Jacobian matrices. This form yields a flexible nonparametric model of the dynamics, with a representation corresponding directly to the interpretable portraits routinely employed in the study of nonlinear dynamical systems. The learning algorithm combines inference of continuous latent paths underlying observed data with a sparse variational description of the dynamical process. We demonstrate our approach on simulated data from different nonlinear dynamical systems.}\n}", "pdf": "http://proceedings.mlr.press/v97/duncker19a/duncker19a.pdf", "supp": "", "pdf_size": 1729442, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14742287364904060967&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Gatsby Computational Neuroscience Unit, University College London, London, United Kingdom; Gatsby Computational Neuroscience Unit, University College London, London, United Kingdom; Stanford University, Palo Alto, California, USA; Gatsby Computational Neuroscience Unit, University College London, London, United Kingdom", "aff_domain": "gatsby.ucl.ac.uk; ; ; ", "email": "gatsby.ucl.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/duncker19a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University College London;Stanford University", "aff_unique_dep": "Gatsby Computational Neuroscience Unit;", "aff_unique_url": "https://www.ucl.ac.uk;https://www.stanford.edu", "aff_unique_abbr": "UCL;Stanford", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "London;Palo Alto", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Learning to Clear the Market", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4192", "id": "4192", "author_site": "Weiran Shen, S\u00e9bastien Lahaie, Renato Leme", "author": "Weiran Shen; Sebastien Lahaie; Renato Paes Leme", "abstract": "The problem of market clearing is to set a price for an item such that quantity demanded equals quantity supplied. In this work, we cast the problem of predicting clearing prices into a learning framework and use the resulting models to perform revenue optimization in auctions and markets with contextual information. The economic intuition behind market clearing allows us to obtain fine-grained control over the aggressiveness of the resulting pricing policy, grounded in theory. To evaluate our approach, we fit a model of clearing prices over a massive dataset of bids in display ad auctions from a major ad exchange. The learned prices outperform other modeling techniques in the literature in terms of revenue and efficiency trade-offs. 
Because of the convex nature of the clearing loss function, the convergence rate of our method is as fast as linear regression.", "bibtex": "@InProceedings{pmlr-v97-shen19b,\n title = \t {Learning to Clear the Market},\n author = {Shen, Weiran and Lahaie, Sebastien and Leme, Renato Paes},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5710--5718},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shen19b/shen19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/shen19b.html},\n abstract = \t {The problem of market clearing is to set a price for an item such that quantity demanded equals quantity supplied. In this work, we cast the problem of predicting clearing prices into a learning framework and use the resulting models to perform revenue optimization in auctions and markets with contextual information. The economic intuition behind market clearing allows us to obtain fine-grained control over the aggressiveness of the resulting pricing policy, grounded in theory. To evaluate our approach, we fit a model of clearing prices over a massive dataset of bids in display ad auctions from a major ad exchange. The learned prices outperform other modeling techniques in the literature in terms of revenue and efficiency trade-offs. Because of the convex nature of the clearing loss function, the convergence rate of our method is as fast as linear regression.}\n}", "pdf": "http://proceedings.mlr.press/v97/shen19b/shen19b.pdf", "supp": "", "pdf_size": 372900, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff": "Tsinghua University, Beijing, China; Google Research, New York, New York, USA; Google Research, New York, New York, USA", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/shen19b.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Tsinghua University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.tsinghua.edu.cn;https://research.google", "aff_unique_abbr": "THU;Google", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Beijing;New York", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United States" }, { "title": "Learning to Collaborate in Markov Decision Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3599", "id": "3599", "author_site": "Goran Radanovic, Rati Devidze, David Parkes, Adish Singla", "author": "Goran Radanovic; Rati Devidze; David Parkes; Adish Singla", "abstract": "We consider a two-agent MDP framework where agents repeatedly solve a task in a collaborative setting. We study the problem of designing a learning algorithm for the first agent (A1) that facilitates a successful collaboration even in cases when the second agent (A2) is adapting its policy in an unknown way. The key challenge in our setting is that the first agent faces non-stationarity in rewards and transitions because of the adaptive behavior of the second agent. 
We design novel online learning algorithms for agent A1 whose regret decays as $O(T^{1-\\frac{3}{7} \\cdot \\alpha})$ with $T$ learning episodes provided that the magnitude of agent A2\u2019s policy changes between any two consecutive episodes are upper bounded by $O(T^{-\\alpha})$. Here, the parameter $\\alpha$ is assumed to be strictly greater than $0$, and we show that this assumption is necessary provided that the", "bibtex": "@InProceedings{pmlr-v97-radanovic19a,\n title = \t {Learning to Collaborate in {M}arkov Decision Processes},\n author = {Radanovic, Goran and Devidze, Rati and Parkes, David and Singla, Adish},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5261--5270},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/radanovic19a/radanovic19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/radanovic19a.html},\n abstract = \t {We consider a two-agent MDP framework where agents repeatedly solve a task in a collaborative setting. We study the problem of designing a learning algorithm for the first agent (A1) that facilitates a successful collaboration even in cases when the second agent (A2) is adapting its policy in an unknown way. The key challenge in our setting is that the first agent faces non-stationarity in rewards and transitions because of the adaptive behavior of the second agent. We design novel online learning algorithms for agent A1 whose regret decays as $O(T^{1-\\frac{3}{7} \\cdot \\alpha})$ with $T$ learning episodes provided that the magnitude of agent A2\u2019s policy changes between any two consecutive episodes are upper bounded by $O(T^{-\\alpha})$. Here, the parameter $\\alpha$ is assumed to be strictly greater than $0$, and we show that this assumption is necessary provided that the", "pdf": "http://proceedings.mlr.press/v97/radanovic19a/radanovic19a.pdf", "supp": "", "pdf_size": 327762, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9309894194182964785&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Harvard University; Max Planck Institute for Software Systems (MPI-SWS); Harvard University; Max Planck Institute for Software Systems (MPI-SWS)", "aff_domain": "g.harvard.edu; ; ;mpi-sws.org", "email": "g.harvard.edu; ; ;mpi-sws.org", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/radanovic19a.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Harvard University;Max Planck Institute for Software Systems", "aff_unique_dep": ";", "aff_unique_url": "https://www.harvard.edu;https://www.mpi-sws.org", "aff_unique_abbr": "Harvard;MPI-SWS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "United States;Germany" }, { "title": "Learning to Convolve: A Generalized Weight-Tying Approach", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3761", "id": "3761", "author_site": "Nichita Diaconu, Daniel E Worrall", "author": "Nichita Diaconu; Daniel Worrall", "abstract": "Recent work (Cohen & Welling, 2016) has shown that generalizations of convolutions, based on group theory, provide powerful inductive biases for learning. In these generalizations, filters are not only translated but can also be rotated, flipped, etc. 
However, coming up with exact models of how to rotate a 3x3 filter on a square pixel-grid is difficult. In this paper, we learn how to transform filters for use in the group convolution, focussing on roto-translation. For this, we learn a filter basis and all rotated versions of that filter basis. Filters are then encoded by a set of rotation invariant coefficients. To rotate a filter, we switch the basis. We demonstrate we can produce feature maps with low sensitivity to input rotations, while achieving high performance on MNIST and CIFAR-10.", "bibtex": "@InProceedings{pmlr-v97-diaconu19a,\n title = \t {Learning to Convolve: A Generalized Weight-Tying Approach},\n author = {Diaconu, Nichita and Worrall, Daniel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1586--1595},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/diaconu19a/diaconu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/diaconu19a.html},\n abstract = \t {Recent work (Cohen & Welling, 2016) has shown that generalizations of convolutions, based on group theory, provide powerful inductive biases for learning. In these generalizations, filters are not only translated but can also be rotated, flipped, etc. However, coming up with exact models of how to rotate a 3x3 filter on a square pixel-grid is difficult. In this paper, we learn how to transform filters for use in the group convolution, focussing on roto-translation. For this, we learn a filter basis and all rotated versions of that filter basis. Filters are then encoded by a set of rotation invariant coefficients. To rotate a filter, we switch the basis. We demonstrate we can produce feature maps with low sensitivity to input rotations, while achieving high performance on MNIST and CIFAR-10.}\n}", "pdf": "http://proceedings.mlr.press/v97/diaconu19a/diaconu19a.pdf", "supp": "", "pdf_size": 2195438, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9868760729468254284&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Philips Lab, University of Amsterdam, Netherlands; Philips Lab, University of Amsterdam, Netherlands", "aff_domain": "uva.nl;uva.nl", "email": "uva.nl;uva.nl", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/diaconu19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Amsterdam", "aff_unique_dep": "Philips Lab", "aff_unique_url": "https://www.uva.nl", "aff_unique_abbr": "UvA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "title": "Learning to Exploit Long-term Relational Dependencies in Knowledge Graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3605", "id": "3605", "author_site": "Lingbing Guo, Zequn Sun, Wei Hu", "author": "Lingbing Guo; Zequn Sun; Wei Hu", "abstract": "We study the problem of knowledge graph (KG) embedding. A widely-established assumption to this problem is that similar entities are likely to have similar relational roles. However, existing related methods derive KG embeddings mainly based on triple-level learning, which lack the capability of capturing long-term relational dependencies of entities. 
Moreover, triple-level learning is insufficient for the propagation of semantic information among entities, especially for the case of cross-KG embedding. In this paper, we propose recurrent skipping networks (RSNs), which employ a skipping mechanism to bridge the gaps between entities. RSNs integrate recurrent neural networks (RNNs) with residual learning to efficiently capture the long-term relational dependencies within and between KGs. We design an end-to-end framework to support RSNs on different tasks. Our experimental results showed that RSNs outperformed state-of-the-art embedding-based methods for entity alignment and achieved competitive performance for KG completion.", "bibtex": "@InProceedings{pmlr-v97-guo19c,\n title = \t {Learning to Exploit Long-term Relational Dependencies in Knowledge Graphs},\n author = {Guo, Lingbing and Sun, Zequn and Hu, Wei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2505--2514},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/guo19c/guo19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/guo19c.html},\n abstract = \t {We study the problem of knowledge graph (KG) embedding. A widely-established assumption to this problem is that similar entities are likely to have similar relational roles. However, existing related methods derive KG embeddings mainly based on triple-level learning, which lack the capability of capturing long-term relational dependencies of entities. Moreover, triple-level learning is insufficient for the propagation of semantic information among entities, especially for the case of cross-KG embedding. In this paper, we propose recurrent skipping networks (RSNs), which employ a skipping mechanism to bridge the gaps between entities. RSNs integrate recurrent neural networks (RNNs) with residual learning to efficiently capture the long-term relational dependencies within and between KGs. We design an end-to-end framework to support RSNs on different tasks. 
Our experimental results showed that RSNs outperformed state-of-the-art embedding-based methods for entity alignment and achieved competitive performance for KG completion.}\n}", "pdf": "http://proceedings.mlr.press/v97/guo19c/guo19c.pdf", "supp": "", "pdf_size": 649284, "gs_citation": 343, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13843373750336430796&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, Jiangsu, China; State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, Jiangsu, China; State Key Laboratory for Novel Software Technology, Nanjing University, Nanjing, Jiangsu, China", "aff_domain": "nju.edu.cn; ; ", "email": "nju.edu.cn; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/guo19c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Nanjing University", "aff_unique_dep": "State Key Laboratory for Novel Software Technology", "aff_unique_url": "http://www.nju.edu.cn", "aff_unique_abbr": "Nanjing U", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Nanjing", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Learning to Generalize from Sparse and Underspecified Rewards", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4332", "id": "4332", "author_site": "Rishabh Agarwal, Chen Liang, Dale Schuurmans, Mohammad Norouzi", "author": "Rishabh Agarwal; Chen Liang; Dale Schuurmans; Mohammad Norouzi", "abstract": "We consider the problem of learning from sparse and underspecified rewards, where an agent receives a complex input, such as a natural language instruction, and needs to generate a complex response, such as an action sequence, while only receiving binary success-failure feedback. Such success-failure rewards are often underspecified: they do not distinguish between purposeful and accidental success. Generalization from underspecified rewards hinges on discounting spurious trajectories that attain accidental success, while learning from sparse feedback requires effective exploration. We address exploration by using a mode covering direction of KL divergence to collect a diverse set of successful trajectories, followed by a mode seeking KL divergence to train a robust policy. We propose Meta Reward Learning (MeRL) to construct an auxiliary reward function that provides more refined feedback for learning. The parameters of the auxiliary reward function are optimized with respect to the validation performance of a trained policy. The MeRL approach outperforms an alternative method for reward learning based on Bayesian Optimization, and achieves the state-of-the-art on weakly-supervised semantic parsing. 
It improves previous work by 1.2% and 2.4% on WikiTableQuestions and WikiSQL datasets respectively.", "bibtex": "@InProceedings{pmlr-v97-agarwal19e,\n title = \t {Learning to Generalize from Sparse and Underspecified Rewards},\n author = {Agarwal, Rishabh and Liang, Chen and Schuurmans, Dale and Norouzi, Mohammad},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {130--140},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/agarwal19e/agarwal19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/agarwal19e.html},\n abstract = \t {We consider the problem of learning from sparse and underspecified rewards, where an agent receives a complex input, such as a natural language instruction, and needs to generate a complex response, such as an action sequence, while only receiving binary success-failure feedback. Such success-failure rewards are often underspecified: they do not distinguish between purposeful and accidental success. Generalization from underspecified rewards hinges on discounting spurious trajectories that attain accidental success, while learning from sparse feedback requires effective exploration. We address exploration by using a mode covering direction of KL divergence to collect a diverse set of successful trajectories, followed by a mode seeking KL divergence to train a robust policy. We propose Meta Reward Learning (MeRL) to construct an auxiliary reward function that provides more refined feedback for learning. The parameters of the auxiliary reward function are optimized with respect to the validation performance of a trained policy. The MeRL approach outperforms an alternative method for reward learning based on Bayesian Optimization, and achieves the state-of-the-art on weakly-supervised semantic parsing. 
It improves previous work by 1.2% and 2.4% on WikiTableQuestions and WikiSQL datasets respectively.}\n}", "pdf": "http://proceedings.mlr.press/v97/agarwal19e/agarwal19e.pdf", "supp": "", "pdf_size": 671963, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13736275324943891850&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Google Research, Brain Team; Google Research, Brain Team; Google Research, Brain Team + University of Alberta; Google Research, Brain Team", "aff_domain": "google.com; ;google.com;google.com", "email": "google.com; ;google.com;google.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/agarwal19e.html", "aff_unique_index": "0;0;0+1;0", "aff_unique_norm": "Google;University of Alberta", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://www.ualberta.ca", "aff_unique_abbr": "Google;UAlberta", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0+1;0", "aff_country_unique": "United States;Canada" }, { "title": "Learning to Groove with Inverse Sequence Transformations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3931", "id": "3931", "author_site": "Jon Gillick, Adam Roberts, Jesse Engel, Douglas Eck, David Bamman", "author": "Jon Gillick; Adam Roberts; Jesse Engel; Douglas Eck; David Bamman", "abstract": "We explore models for translating abstract musical ideas (scores, rhythms) into expressive performances using seq2seq and recurrent variational information bottleneck (VIB) models. Though seq2seq models usually require painstakingly aligned corpora, we show that it is possible to adapt an approach from the Generative Adversarial Network (GAN) literature (e.g. Pix2Pix, Vid2Vid) to sequences, creating large volumes of paired data by performing simple transformations and training generative models to plausibly invert these transformations. Music, and drumming in particular, provides a strong test case for this approach because many common transformations (quantization, removing voices) have clear semantics, and learning to invert them has real-world applications. Focusing on the case of drum set players, we create and release a new dataset for this purpose, containing over 13 hours of recordings by professional drummers aligned with fine-grained timing and dynamics information. We also explore some of the creative potential of these models, demonstrating improvements on state-of-the-art methods for Humanization (instantiating a performance from a musical score).", "bibtex": "@InProceedings{pmlr-v97-gillick19a,\n title = \t {Learning to Groove with Inverse Sequence Transformations},\n author = {Gillick, Jon and Roberts, Adam and Engel, Jesse and Eck, Douglas and Bamman, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2269--2279},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gillick19a/gillick19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gillick19a.html},\n abstract = \t {We explore models for translating abstract musical ideas (scores, rhythms) into expressive performances using seq2seq and recurrent variational information bottleneck (VIB) models. 
Though seq2seq models usually require painstakingly aligned corpora, we show that it is possible to adapt an approach from the Generative Adversarial Network (GAN) literature (e.g. Pix2Pix, Vid2Vid) to sequences, creating large volumes of paired data by performing simple transformations and training generative models to plausibly invert these transformations. Music, and drumming in particular, provides a strong test case for this approach because many common transformations (quantization, removing voices) have clear semantics, and learning to invert them has real-world applications. Focusing on the case of drum set players, we create and release a new dataset for this purpose, containing over 13 hours of recordings by professional drummers aligned with fine-grained timing and dynamics information. We also explore some of the creative potential of these models, demonstrating improvements on state-of-the-art methods for Humanization (instantiating a performance from a musical score).}\n}", "pdf": "http://proceedings.mlr.press/v97/gillick19a/gillick19a.pdf", "supp": "", "pdf_size": 1421351, "gs_citation": 136, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11938158807597584701&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Information, University of California, Berkeley, CA, U.S.A + Google AI, Mountain View, CA, U.S.A; Google AI, Mountain View, CA, U.S.A; Google AI, Mountain View, CA, U.S.A; Google AI, Mountain View, CA, U.S.A; School of Information, University of California, Berkeley, CA, U.S.A", "aff_domain": "berkeley.edu; ; ; ; ", "email": "berkeley.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/gillick19a.html", "aff_unique_index": "0+1;1;1;1;0", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": "School of Information;Google AI", "aff_unique_url": "https://www.berkeley.edu;https://ai.google", "aff_unique_abbr": "UC Berkeley;Google AI", "aff_campus_unique_index": "0+1;1;1;1;0", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0+0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Learning to Infer Program Sketches", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4065", "id": "4065", "author_site": "Maxwell Nye, Luke Hewitt, Josh Tenenbaum, Armando Solar-Lezama", "author": "Maxwell Nye; Luke Hewitt; Joshua Tenenbaum; Armando Solar-Lezama", "abstract": "Our goal is to build systems which write code automatically from the kinds of specifications humans can most easily provide, such as examples and natural language instruction. The key idea of this work is that a flexible combination of pattern recognition and explicit reasoning can be used to solve these complex programming problems. We propose a method for dynamically integrating these types of information. Our novel intermediate representation and training algorithm allow a program synthesis system to learn, without direct supervision, when to rely on pattern recognition and when to perform symbolic search. 
Our model matches the memorization and generalization performance of neural synthesis and symbolic search, respectively, and achieves state-of-the-art performance on a dataset of simple English description-to-code programming problems.", "bibtex": "@InProceedings{pmlr-v97-nye19a,\n title = \t {Learning to Infer Program Sketches},\n author = {Nye, Maxwell and Hewitt, Luke and Tenenbaum, Joshua and Solar-Lezama, Armando},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4861--4870},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nye19a/nye19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nye19a.html},\n abstract = \t {Our goal is to build systems which write code automatically from the kinds of specifications humans can most easily provide, such as examples and natural language instruction. The key idea of this work is that a flexible combination of pattern recognition and explicit reasoning can be used to solve these complex programming problems. We propose a method for dynamically integrating these types of information. Our novel intermediate representation and training algorithm allow a program synthesis system to learn, without direct supervision, when to rely on pattern recognition and when to perform symbolic search. Our model matches the memorization and generalization performance of neural synthesis and symbolic search, respectively, and achieves state-of-the-art performance on a dataset of simple English description-to-code programming problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/nye19a/nye19a.pdf", "supp": "", "pdf_size": 803436, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17303764643585588375&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff": "MIT Brain and Cognitive Sciences+MIT CSAIL; MIT Brain and Cognitive Sciences+MIT CSAIL+MIT-IBM AI Lab; MIT Brain and Cognitive Sciences+MIT CSAIL+Center for Brains, Minds and Machines (CBMM); MIT CSAIL", "aff_domain": "mit.edu; ; ; ", "email": "mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/nye19a.html", "aff_unique_index": "0+0;0+0+0;0+0+1;0", "aff_unique_norm": "Massachusetts Institute of Technology;Center for Brains, Minds and Machines", "aff_unique_dep": "Department of Brain and Cognitive Sciences;", "aff_unique_url": "https://www.mit.edu;http://cbmm.mit.edu/", "aff_unique_abbr": "MIT;CBMM", "aff_campus_unique_index": "0+0;0+0;0+0;0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0+0;0+0+0;0+0+0;0", "aff_country_unique": "United States" }, { "title": "Learning to Optimize Multigrid PDE Solvers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3877", "id": "3877", "author_site": "Daniel Greenfeld, Meirav Galun, Ronen Basri, Irad Yavneh, Ron Kimmel", "author": "Daniel Greenfeld; Meirav Galun; Ronen Basri; Irad Yavneh; Ron Kimmel", "abstract": "Constructing fast numerical solvers for partial differential equations (PDEs) is crucial for many scientific disciplines. A leading technique for solving large-scale PDEs is using multigrid methods. At the core of a multigrid solver is the prolongation matrix, which relates between different scales of the problem. 
This matrix is strongly problem-dependent, and its optimal construction is critical to the efficiency of the solver. In practice, however, devising multigrid algorithms for new problems often poses formidable challenges. In this paper we propose a framework for learning multigrid solvers. Our method learns a (single) mapping from discretized PDEs to prolongation operators for a broad class of 2D diffusion problems. We train a neural network once for the entire class of PDEs, using an efficient and unsupervised loss function. Our tests demonstrate improved convergence rates compared to the widely used Black-Box multigrid scheme, suggesting that our method successfully learned rules for constructing prolongation matrices.", "bibtex": "@InProceedings{pmlr-v97-greenfeld19a,\n title = \t {Learning to Optimize Multigrid {PDE} Solvers},\n author = {Greenfeld, Daniel and Galun, Meirav and Basri, Ronen and Yavneh, Irad and Kimmel, Ron},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2415--2423},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/greenfeld19a/greenfeld19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/greenfeld19a.html},\n abstract = \t {Constructing fast numerical solvers for partial differential equations (PDEs) is crucial for many scientific disciplines. A leading technique for solving large-scale PDEs is using multigrid methods. At the core of a multigrid solver is the prolongation matrix, which relates between different scales of the problem. This matrix is strongly problem-dependent, and its optimal construction is critical to the efficiency of the solver. In practice, however, devising multigrid algorithms for new problems often poses formidable challenges. In this paper we propose a framework for learning multigrid solvers. Our method learns a (single) mapping from discretized PDEs to prolongation operators for a broad class of 2D diffusion problems. We train a neural network once for the entire class of PDEs, using an efficient and unsupervised loss function. 
Our tests demonstrate improved convergence rates compared to the widely used Black-Box multigrid scheme, suggesting that our method successfully learned rules for constructing prolongation matrices.}\n}", "pdf": "http://proceedings.mlr.press/v97/greenfeld19a/greenfeld19a.pdf", "supp": "", "pdf_size": 3835322, "gs_citation": 154, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6923556660343216997&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 16, "aff": "Weizmann Institute of Science; Weizmann Institute of Science; Technion, Israel Institute of Technology; Technion, Israel Institute of Technology; Weizmann Institute of Science", "aff_domain": "weizmann.ac.il; ; ; ; ", "email": "weizmann.ac.il; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/greenfeld19a.html", "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Weizmann Institute of Science;Israel Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.weizmann.org.il;https://www.technion.ac.il/en/", "aff_unique_abbr": "Weizmann;Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Learning to Prove Theorems via Interacting with Proof Assistants", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3658", "id": "3658", "author_site": "Kaiyu Yang, Jia Deng", "author": "Kaiyu Yang; Jia Deng", "abstract": "Humans prove theorems by relying on substantial high-level reasoning and problem-specific insights. Proof assistants offer a formalism that resembles human mathematical reasoning, representing theorems in higher-order logic and proofs as high-level tactics. However, human experts have to construct proofs manually by entering tactics into the proof assistant. In this paper, we study the problem of using machine learning to automate the interaction with proof assistants. We construct CoqGym, a large-scale dataset and learning environment containing 71K human-written proofs from 123 projects developed with the Coq proof assistant. We develop ASTactic, a deep learning-based model that generates tactics as programs in the form of abstract syntax trees (ASTs). Experiments show that ASTactic trained on CoqGym can generate effective tactics and can be used to prove new theorems not previously provable by automated methods. Code is available at https://github.com/princeton-vl/CoqGym.", "bibtex": "@InProceedings{pmlr-v97-yang19a,\n title = \t {Learning to Prove Theorems via Interacting with Proof Assistants},\n author = {Yang, Kaiyu and Deng, Jia},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6984--6994},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yang19a/yang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yang19a.html},\n abstract = \t {Humans prove theorems by relying on substantial high-level reasoning and problem-specific insights. Proof assistants offer a formalism that resembles human mathematical reasoning, representing theorems in higher-order logic and proofs as high-level tactics. However, human experts have to construct proofs manually by entering tactics into the proof assistant. 
In this paper, we study the problem of using machine learning to automate the interaction with proof assistants. We construct CoqGym, a large-scale dataset and learning environment containing 71K human-written proofs from 123 projects developed with the Coq proof assistant. We develop ASTactic, a deep learning-based model that generates tactics as programs in the form of abstract syntax trees (ASTs). Experiments show that ASTactic trained on CoqGym can generate effective tactics and can be used to prove new theorems not previously provable by automated methods. Code is available at https://github.com/princeton-vl/CoqGym.}\n}", "pdf": "http://proceedings.mlr.press/v97/yang19a/yang19a.pdf", "supp": "", "pdf_size": 729025, "gs_citation": 183, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14925207938076962028&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14, "aff": "Department of Computer Science, Princeton University; Department of Computer Science, Princeton University", "aff_domain": "cs.princeton.edu;cs.princeton.edu", "email": "cs.princeton.edu;cs.princeton.edu", "github": "https://github.com/princeton-vl/CoqGym", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/yang19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning to Route in Similarity Graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4084", "id": "4084", "author_site": "Dmitry Baranchuk, Dmitry Persiyanov, Anton Sinitsin, Artem Babenko", "author": "Dmitry Baranchuk; Dmitry Persiyanov; Anton Sinitsin; Artem Babenko", "abstract": "Recently similarity graphs became the leading paradigm for efficient nearest neighbor search, outperforming traditional tree-based and LSH-based methods. Similarity graphs perform the search via greedy routing: a query traverses the graph and in each vertex moves to the adjacent vertex that is the closest to this query. In practice, similarity graphs are often susceptible to local minima, when queries do not reach its nearest neighbors, getting stuck in suboptimal vertices. In this paper we propose to learn the routing function that overcomes local minima via incorporating information about the graph global structure. In particular, we augment the vertices of a given graph with additional representations that are learned to provide the optimal routing from the start vertex to the query nearest neighbor. 
By thorough experiments, we demonstrate that the proposed learnable routing successfully diminishes the local minima problem and significantly improves the overall search performance.", "bibtex": "@InProceedings{pmlr-v97-baranchuk19a,\n title = \t {Learning to Route in Similarity Graphs},\n author = {Baranchuk, Dmitry and Persiyanov, Dmitry and Sinitsin, Anton and Babenko, Artem},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {475--484},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/baranchuk19a/baranchuk19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/baranchuk19a.html},\n abstract = \t {Recently similarity graphs became the leading paradigm for efficient nearest neighbor search, outperforming traditional tree-based and LSH-based methods. Similarity graphs perform the search via greedy routing: a query traverses the graph and in each vertex moves to the adjacent vertex that is the closest to this query. In practice, similarity graphs are often susceptible to local minima, when queries do not reach its nearest neighbors, getting stuck in suboptimal vertices. In this paper we propose to learn the routing function that overcomes local minima via incorporating information about the graph global structure. In particular, we augment the vertices of a given graph with additional representations that are learned to provide the optimal routing from the start vertex to the query nearest neighbor. By thorough experiments, we demonstrate that the proposed learnable routing successfully diminishes the local minima problem and significantly improves the overall search performance.}\n}", "pdf": "http://proceedings.mlr.press/v97/baranchuk19a/baranchuk19a.pdf", "supp": "", "pdf_size": 918498, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=381431972230740194&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Yandex, Russia+Lomonosov Moscow State University, Russia; Moscow Institute of Physics and Technology, Russia; Yandex, Russia+National Research University Higher School of Economics, Russia+Lomonosov Moscow State University, Russia; Yandex, Russia+National Research University Higher School of Economics, Russia", "aff_domain": "graphics.cs.msu.ru; ; ; ", "email": "graphics.cs.msu.ru; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/baranchuk19a.html", "aff_unique_index": "0+1;2;0+3+1;0+3", "aff_unique_norm": "Yandex;Lomonosov Moscow State University;Moscow Institute of Physics and Technology;National Research University Higher School of Economics", "aff_unique_dep": ";;;", "aff_unique_url": "https://yandex.com;https://www.msu.ru;https://www.mipt.ru/en;https://hse.ru", "aff_unique_abbr": "Yandex;MSU;MIPT;HSE", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+0+0;0+0", "aff_country_unique": "Russian Federation" }, { "title": "Learning to bid in revenue-maximizing auctions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4090", "id": "4090", "author_site": "Thomas Nedelec, Noureddine El Karoui, Vianney Perchet", "author": "Thomas Nedelec; Noureddine El Karoui; Vianney Perchet", "abstract": "We consider the problem of the optimization of bidding strategies in prior-dependent 
revenue-maximizing auctions, when the seller fixes the reserve prices based on the bid distributions. Our study is done in the setting where one bidder is strategic. Using a variational approach, we study the complexity of the original objective and we introduce a relaxation of the objective functional in order to use gradient descent methods. Our approach is simple, general and can be applied to various value distributions and revenue-maximizing mechanisms. The new strategies we derive yield massive uplifts compared to the traditional truthfully bidding strategy.", "bibtex": "@InProceedings{pmlr-v97-nedelec19a,\n title = \t {Learning to bid in revenue-maximizing auctions},\n author = {Nedelec, Thomas and Karoui, Noureddine El and Perchet, Vianney},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4781--4789},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nedelec19a/nedelec19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nedelec19a.html},\n abstract = \t {We consider the problem of the optimization of bidding strategies in prior-dependent revenue-maximizing auctions, when the seller fixes the reserve prices based on the bid distributions. Our study is done in the setting where one bidder is strategic. Using a variational approach, we study the complexity of the original objective and we introduce a relaxation of the objective functional in order to use gradient descent methods. Our approach is simple, general and can be applied to various value distributions and revenue-maximizing mechanisms. The new strategies we derive yield massive uplifts compared to the traditional truthfully bidding strategy.}\n}", "pdf": "http://proceedings.mlr.press/v97/nedelec19a/nedelec19a.pdf", "supp": "", "pdf_size": 728495, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11919580122796819740&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Criteo AI Lab + CMLA, ENS Paris Saclay; UC, Berkeley; Criteo AI Lab + CMLA, ENS Paris Saclay", "aff_domain": "cmla.ens-cachan.fr; ; ", "email": "cmla.ens-cachan.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/nedelec19a.html", "aff_unique_index": "0+1;2;0+1", "aff_unique_norm": "Criteo;\u00c9cole Normale Sup\u00e9rieure Paris Saclay;University of California, Berkeley", "aff_unique_dep": "Criteo AI Lab;CMLA;", "aff_unique_url": "https://www.criteo.com;https://www.ens-paris-saclay.fr;https://www.berkeley.edu", "aff_unique_abbr": "Criteo;ENS Paris Saclay;UC Berkeley", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Paris Saclay;Berkeley", "aff_country_unique_index": "0+0;1;0+0", "aff_country_unique": "France;United States" }, { "title": "Learning to select for a predefined ranking", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3797", "id": "3797", "author_site": "Aleksei Ustimenko, Aleksandr Vorobev, Gleb Gusev, Pavel Serdyukov", "author": "Aleksei Ustimenko; Aleksandr Vorobev; Gleb Gusev; Pavel Serdyukov", "abstract": "In this paper, we formulate a novel problem of learning to select a set of items maximizing the quality of their ordered list, where the order is predefined by some explicit rule. 
Unlike the classic information retrieval problem, in our setting, the predefined order of items in the list may not correspond to their quality in general. For example, this is a dominant scenario in personalized news and social media feeds, where items are ordered by publication time in a user interface. We propose new theoretically grounded algorithms based on direct optimization of the resulting list quality. Our offline and online experiments with a large-scale product search engine demonstrate the overwhelming advantage of our methods over the baselines in terms of all key quality metrics.", "bibtex": "@InProceedings{pmlr-v97-vorobev19a,\n title = \t {Learning to select for a predefined ranking},\n author = {Ustimenko, Aleksei and Vorobev, Aleksandr and Gusev, Gleb and Serdyukov, Pavel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6477--6486},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/vorobev19a/vorobev19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/vorobev19a.html},\n abstract = \t {In this paper, we formulate a novel problem of learning to select a set of items maximizing the quality of their ordered list, where the order is predefined by some explicit rule. Unlike the classic information retrieval problem, in our setting, the predefined order of items in the list may not correspond to their quality in general. For example, this is a dominant scenario in personalized news and social media feeds, where items are ordered by publication time in a user interface. We propose new theoretically grounded algorithms based on direct optimization of the resulting list quality. 
Our offline and online experiments with a large-scale product search engine demonstrate the overwhelming advantage of our methods over the baselines in terms of all key quality metrics.}\n}", "pdf": "http://proceedings.mlr.press/v97/vorobev19a/vorobev19a.pdf", "supp": "", "pdf_size": 345891, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15475336222894273136&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Yandex, Moscow, Russia + Skoltech University, Moscow, Russia + Faculty of Computer Science, Higher School of Economics, Moscow, Russia; Yandex, Moscow, Russia; Yandex, Moscow, Russia + Department of Innovation and High Technology, Moscow Institute of Physics and Technology, Dolgoprudny, Russia; Yandex, Moscow, Russia", "aff_domain": "yandex-team.ru;gmail.com;gmail.com;yandex-team.ru", "email": "yandex-team.ru;gmail.com;gmail.com;yandex-team.ru", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/vorobev19a.html", "aff_unique_index": "0+1+2;0;0+3;0", "aff_unique_norm": "Yandex;Skoltech University;Higher School of Economics;Moscow Institute of Physics and Technology", "aff_unique_dep": ";;Faculty of Computer Science;Department of Innovation and High Technology", "aff_unique_url": "https://yandex.com;https://www.skoltech.ru;https://www.hse.ru;https://www.mipt.ru", "aff_unique_abbr": "Yandex;Skoltech;HSE;MIPT", "aff_campus_unique_index": "0+0+0;0;0+1;0", "aff_campus_unique": "Moscow;Dolgoprudny", "aff_country_unique_index": "0+0+0;0;0+0;0", "aff_country_unique": "Russian Federation" }, { "title": "Learning with Bad Training Data via Iterative Trimmed Loss Minimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4165", "id": "4165", "author_site": "Yanyao Shen, Sujay Sanghavi", "author": "Yanyao Shen; Sujay Sanghavi", "abstract": "In this paper, we study a simple and generic framework to tackle the problem of learning model parameters when a fraction of the training samples are corrupted. Our approach is motivated by a simple observation: in a variety of such settings, the evolution of training accuracy (as a function of training epochs) is different for clean samples and bad samples. We propose to iteratively minimize the trimmed loss, by alternating between (a) selecting samples with lowest current loss, and (b) retraining a model on only these samples. Analytically, we characterize the statistical performance and convergence rate of the algorithm for simple and natural linear and non-linear models. Experimentally, we demonstrate its effectiveness in three settings: (a) deep image classifiers with errors only in labels, (b) generative adversarial networks with bad training images, and (c) deep image classifiers with adversarial (image, label) pairs (i.e., backdoor attacks). 
For the well-studied setting of random label noise, our algorithm achieves state-of-the-art performance without having access to any a-priori guaranteed clean samples.", "bibtex": "@InProceedings{pmlr-v97-shen19e,\n title = \t {Learning with Bad Training Data via Iterative Trimmed Loss Minimization},\n author = {Shen, Yanyao and Sanghavi, Sujay},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5739--5748},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shen19e/shen19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/shen19e.html},\n abstract = \t {In this paper, we study a simple and generic framework to tackle the problem of learning model parameters when a fraction of the training samples are corrupted. Our approach is motivated by a simple observation: in a variety of such settings, the evolution of training accuracy (as a function of training epochs) is different for clean samples and bad samples. We propose to iteratively minimize the trimmed loss, by alternating between (a) selecting samples with lowest current loss, and (b) retraining a model on only these samples. Analytically, we characterize the statistical performance and convergence rate of the algorithm for simple and natural linear and non-linear models. Experimentally, we demonstrate its effectiveness in three settings: (a) deep image classifiers with errors only in labels, (b) generative adversarial networks with bad training images, and (c) deep image classifiers with adversarial (image, label) pairs (i.e., backdoor attacks). For the well-studied setting of random label noise, our algorithm achieves state-of-the-art performance without having access to any a-priori guaranteed clean samples.}\n}", "pdf": "http://proceedings.mlr.press/v97/shen19e/shen19e.pdf", "supp": "", "pdf_size": 1980688, "gs_citation": 327, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17654591022475396219&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "ECE Department, University of Texas at Austin, TX, USA; ECE Department, University of Texas at Austin, TX, USA", "aff_domain": "utexas.edu;mail.utexas.edu", "email": "utexas.edu;mail.utexas.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/shen19e.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "ECE Department", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Learning-to-Learn Stochastic Gradient Descent with Biased Regularization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4116", "id": "4116", "author_site": "Giulia Denevi, Carlo Ciliberto, Riccardo Grazzi, Massimiliano Pontil", "author": "Giulia Denevi; Carlo Ciliberto; Riccardo Grazzi; Massimiliano Pontil", "abstract": "We study the problem of learning-to-learn: inferring a learning algorithm that works well on a family of tasks sampled from an unknown distribution. As a class of algorithms we consider Stochastic Gradient Descent (SGD) on the true risk regularized by the square Euclidean distance from a bias vector. 
We present an average excess risk bound for such a learning algorithm that quantifies the potential benefit of using a bias vector with respect to the unbiased case. We then propose a novel meta-algorithm to estimate the bias term online from a sequence of observed tasks. The small memory footprint and low time complexity of our approach make it appealing in practice while our theoretical analysis provides guarantees on the generalization properties of the meta-algorithm on new tasks. A key feature of our results is that, when the number of tasks grows and their variance is relatively small, our learning-to-learn approach has a significant advantage over learning each task in isolation by standard SGD without a bias term. Numerical experiments demonstrate the effectiveness of our approach in practice.", "bibtex": "@InProceedings{pmlr-v97-denevi19a,\n title = \t {Learning-to-Learn Stochastic Gradient Descent with Biased Regularization},\n author = {Denevi, Giulia and Ciliberto, Carlo and Grazzi, Riccardo and Pontil, Massimiliano},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1566--1575},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/denevi19a/denevi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/denevi19a.html},\n abstract = \t {We study the problem of learning-to-learn: inferring a learning algorithm that works well on a family of tasks sampled from an unknown distribution. As a class of algorithms we consider Stochastic Gradient Descent (SGD) on the true risk regularized by the square Euclidean distance from a bias vector. We present an average excess risk bound for such a learning algorithm that quantifies the potential benefit of using a bias vector with respect to the unbiased case. We then propose a novel meta-algorithm to estimate the bias term online from a sequence of observed tasks. The small memory footprint and low time complexity of our approach make it appealing in practice while our theoretical analysis provides guarantees on the generalization properties of the meta-algorithm on new tasks. A key feature of our results is that, when the number of tasks grows and their variance is relatively small, our learning-to-learn approach has a significant advantage over learning each task in isolation by standard SGD without a bias term. 
Numerical experiments demonstrate the effectiveness of our approach in practice.}\n}", "pdf": "http://proceedings.mlr.press/v97/denevi19a/denevi19a.pdf", "supp": "", "pdf_size": 776756, "gs_citation": 143, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5491276692157599761&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Istituto Italiano di Tecnologia, Genoa, Italy+University of Genoa, Genoa, Italy; Imperial College of London, London, United Kingdom; Istituto Italiano di Tecnologia, Genoa, Italy+University College London, London, United Kingdom; University College London, London, United Kingdom", "aff_domain": "iit.it; ; ; ", "email": "iit.it; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/denevi19a.html", "aff_unique_index": "0+1;2;0+3;3", "aff_unique_norm": "Istituto Italiano di Tecnologia;University of Genoa;Imperial College London;University College London", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.iit.it;https://www.unige.it;https://www.imperial.ac.uk;https://www.ucl.ac.uk", "aff_unique_abbr": "IIT;UniGe;Imperial;UCL", "aff_campus_unique_index": "0+0;1;0+1;1", "aff_campus_unique": "Genoa;London", "aff_country_unique_index": "0+0;1;0+1;1", "aff_country_unique": "Italy;United Kingdom" }, { "title": "LegoNet: Efficient Convolutional Neural Networks with Lego Filters", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3712", "id": "3712", "author_site": "Zhaohui Yang, Yunhe Wang, Chuanjian Liu, Hanting Chen, Chunjing Xu, Boxin Shi, Chao Xu, Chang Xu", "author": "Zhaohui Yang; Yunhe Wang; Chuanjian Liu; Hanting Chen; Chunjing Xu; Boxin Shi; Chao Xu; Chang Xu", "abstract": "This paper aims to build efficient convolutional neural networks using a set of Lego filters. Many successful building blocks, e.g., inception and residual modules, have been designed to refresh state-of-the-art records of CNNs on visual recognition tasks. Beyond these high-level modules, we suggest that an ordinary filter in the neural network can be upgraded to a sophisticated module as well. Filter modules are established by assembling a shared set of Lego filters that are often of much lower dimensions. Weights in Lego filters and binary masks to stack Lego filters for these filter modules can be simultaneously optimized in an end-to-end manner as usual. Inspired by network engineering, we develop a split-transform-merge strategy for an efficient convolution by exploiting intermediate Lego feature maps. The compression and acceleration achieved by Lego Networks using the proposed Lego filters have been theoretically discussed. 
Experimental results on benchmark datasets and deep models demonstrate the advantages of the proposed Lego filters and their potential real-world applications on mobile devices.", "bibtex": "@InProceedings{pmlr-v97-yang19c,\n title = \t {{L}ego{N}et: Efficient Convolutional Neural Networks with Lego Filters},\n author = {Yang, Zhaohui and Wang, Yunhe and Liu, Chuanjian and Chen, Hanting and Xu, Chunjing and Shi, Boxin and Xu, Chao and Xu, Chang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7005--7014},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yang19c/yang19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/yang19c.html},\n abstract = \t {This paper aims to build efficient convolutional neural networks using a set of Lego filters. Many successful building blocks, e.g., inception and residual modules, have been designed to refresh state-of-the-art records of CNNs on visual recognition tasks. Beyond these high-level modules, we suggest that an ordinary filter in the neural network can be upgraded to a sophisticated module as well. Filter modules are established by assembling a shared set of Lego filters that are often of much lower dimensions. Weights in Lego filters and binary masks to stack Lego filters for these filter modules can be simultaneously optimized in an end-to-end manner as usual. Inspired by network engineering, we develop a split-transform-merge strategy for an efficient convolution by exploiting intermediate Lego feature maps. The compression and acceleration achieved by Lego Networks using the proposed Lego filters have been theoretically discussed. 
Experimental results on benchmark datasets and deep models demonstrate the advantages of the proposed Lego filters and their potential real-world applications on mobile devices.}\n}", "pdf": "http://proceedings.mlr.press/v97/yang19c/yang19c.pdf", "supp": "", "pdf_size": 1160384, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2539006001613884151&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Key Laboratory of Machine Perception (Ministry of Education), Peking University + Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; Key Laboratory of Machine Perception (Ministry of Education), Peking University + Huawei Noah\u2019s Ark Lab; Huawei Noah\u2019s Ark Lab; National Engineering Laboratory for Video Technology, Peking University + Peng Cheng Laboratory; Key Laboratory of Machine Perception (Ministry of Education), Peking University; Huawei Noah\u2019s Ark Lab; School of Computer Science, University of Sydney", "aff_domain": "pku.edu.cn;pku.edu.cn; ; ; ; ; ;sydney.edu.au", "email": "pku.edu.cn;pku.edu.cn; ; ; ; ; ;sydney.edu.au", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/yang19c.html", "aff_unique_index": "0+1;1;0+1;1;0+2;0;1;3", "aff_unique_norm": "Peking University;Huawei;Pengcheng Laboratory;University of Sydney", "aff_unique_dep": "Key Laboratory of Machine Perception;Noah\u2019s Ark Lab;Peng Cheng Laboratory;School of Computer Science", "aff_unique_url": "http://www.pku.edu.cn;https://www.huawei.com;http://www.pcl.ac.cn;https://www.sydney.edu.au", "aff_unique_abbr": "PKU;Huawei;PCL;USYD", "aff_campus_unique_index": ";;;1", "aff_campus_unique": ";Sydney", "aff_country_unique_index": "0+0;0;0+0;0;0+0;0;0;1", "aff_country_unique": "China;Australia" }, { "title": "Leveraging Low-Rank Relations Between Surrogate Tasks in Structured Prediction", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4011", "id": "4011", "author_site": "Giulia Luise, Dimitrios Stamos, Massimiliano Pontil, Carlo Ciliberto", "author": "Giulia Luise; Dimitrios Stamos; Massimiliano Pontil; Carlo Ciliberto", "abstract": "We study the interplay between surrogate methods for structured prediction and techniques from multitask learning designed to leverage relationships between surrogate outputs. We propose an efficient algorithm based on trace norm regularization which, differently from previous methods, does not require explicit knowledge of the coding/decoding functions of the surrogate framework. As a result, our algorithm can be applied to the broad class of problems in which the surrogate space is large or even infinite dimensional. We study excess risk bounds for trace norm regularized structured prediction proving the consistency and learning rates for our estimator. We also identify relevant regimes in which our approach can enjoy better generalization performance than previous methods. 
Numerical experiments on ranking problems indicate that enforcing low-rank relations among surrogate outputs may indeed provide a significant advantage in practice.", "bibtex": "@InProceedings{pmlr-v97-luise19a,\n title = \t {Leveraging Low-Rank Relations Between Surrogate Tasks in Structured Prediction},\n author = {Luise, Giulia and Stamos, Dimitrios and Pontil, Massimiliano and Ciliberto, Carlo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4193--4202},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/luise19a/luise19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/luise19a.html},\n abstract = \t {We study the interplay between surrogate methods for structured prediction and techniques from multitask learning designed to leverage relationships between surrogate outputs. We propose an efficient algorithm based on trace norm regularization which, differently from previous methods, does not require explicit knowledge of the coding/decoding functions of the surrogate framework. As a result, our algorithm can be applied to the broad class of problems in which the surrogate space is large or even infinite dimensional. We study excess risk bounds for trace norm regularized structured prediction proving the consistency and learning rates for our estimator. We also identify relevant regimes in which our approach can enjoy better generalization performance than previous methods. Numerical experiments on ranking problems indicate that enforcing low-rank relations among surrogate outputs may indeed provide a significant advantage in practice.}\n}", "pdf": "http://proceedings.mlr.press/v97/luise19a/luise19a.pdf", "supp": "", "pdf_size": 977476, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7021548697258012467&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, University College London, London, UK+Computational Statistics and Machine Learning, Istituto Italiano di Tecnologia, Genoa+Department of Electrical and Electronic Engineering,Imperial College London, London, UK; Department of Computer Science, University College London, London, UK; Department of Computer Science, University College London, London, UK+Computational Statistics and Machine Learning, Istituto Italiano di Tecnologia, Genoa; Department of Computer Science, University College London, London, UK+Department of Electrical and Electronic Engineering,Imperial College London, London, UK", "aff_domain": "ucl.ac.uk; ; ; ", "email": "ucl.ac.uk; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/luise19a.html", "aff_unique_index": "0+1+2;0;0+1;0+2", "aff_unique_norm": "University College London;Istituto Italiano di Tecnologia;Imperial College London", "aff_unique_dep": "Department of Computer Science;Computational Statistics and Machine Learning;Department of Electrical and Electronic Engineering", "aff_unique_url": "https://www.ucl.ac.uk;https://www.iit.it;https://www.imperial.ac.uk", "aff_unique_abbr": "UCL;IIT;ICL", "aff_campus_unique_index": "0+1+0;0;0+1;0+0", "aff_campus_unique": "London;Genoa", "aff_country_unique_index": "0+1+0;0;0+1;0+0", "aff_country_unique": "United Kingdom;Italy" }, { "title": "Lexicographic and Depth-Sensitive Margins in 
Homogeneous and Non-Homogeneous Deep Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4224", "id": "4224", "author_site": "Mor Shpigel Nacson, Suriya Gunasekar, Jason Lee, Nati Srebro, Daniel Soudry", "author": "Mor Shpigel Nacson; Suriya Gunasekar; Jason Lee; Nathan Srebro; Daniel Soudry", "abstract": "With an eye toward understanding complexity control in deep learning, we study how infinitesimal regularization or gradient descent optimization lead to margin maximizing solutions in both homogeneous and non homogeneous models, extending previous work that focused on infinitesimal regularization only in homogeneous models. To this end we study the limit of loss minimization with a diverging norm constraint (the \u201cconstrained path\u201d), relate it to the limit of a \u201cmargin path\u201d and characterize the resulting solution. For non-homogeneous ensemble models, which output is a sum of homogeneous sub-models, we show that this solution discards the shallowest sub-models if they are unnecessary. For homogeneous models, we show convergence to a \u201clexicographic max-margin solution\u201d, and provide conditions under which max-margin solutions are also attained as the limit of unconstrained gradient descent.", "bibtex": "@InProceedings{pmlr-v97-nacson19a,\n title = \t {Lexicographic and Depth-Sensitive Margins in Homogeneous and Non-Homogeneous Deep Models},\n author = {Nacson, Mor Shpigel and Gunasekar, Suriya and Lee, Jason and Srebro, Nathan and Soudry, Daniel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4683--4692},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nacson19a/nacson19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nacson19a.html},\n abstract = \t {With an eye toward understanding complexity control in deep learning, we study how infinitesimal regularization or gradient descent optimization lead to margin maximizing solutions in both homogeneous and non homogeneous models, extending previous work that focused on infinitesimal regularization only in homogeneous models. To this end we study the limit of loss minimization with a diverging norm constraint (the \u201cconstrained path\u201d), relate it to the limit of a \u201cmargin path\u201d and characterize the resulting solution. For non-homogeneous ensemble models, which output is a sum of homogeneous sub-models, we show that this solution discards the shallowest sub-models if they are unnecessary. 
For homogeneous models, we show convergence to a \u201clexicographic max-margin solution\u201d, and provide conditions under which max-margin solutions are also attained as the limit of unconstrained gradient descent.}\n}", "pdf": "http://proceedings.mlr.press/v97/nacson19a/nacson19a.pdf", "supp": "", "pdf_size": 320943, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14932776100895282750&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Technion, Israel; TTI Chicago, USA; USC Los Angeles, USA; TTI Chicago, USA; Technion, Israel", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/nacson19a.html", "aff_unique_index": "0;1;2;1;0", "aff_unique_norm": "Technion - Israel Institute of Technology;Toyota Technological Institute at Chicago;University of Southern California", "aff_unique_dep": ";;", "aff_unique_url": "https://www.technion.ac.il/en/;https://tti-chicago.org;https://www.usc.edu", "aff_unique_abbr": "Technion;TTI Chicago;USC", "aff_campus_unique_index": "1;2;1", "aff_campus_unique": ";Chicago;Los Angeles", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "Israel;United States" }, { "title": "Linear-Complexity Data-Parallel Earth Mover\u2019s Distance Approximations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3568", "id": "3568", "author_site": "Kubilay Atasu, Thomas Mittelholzer", "author": "Kubilay Atasu; Thomas Mittelholzer", "abstract": "The Earth Mover\u2019s Distance (EMD) is a state-of-the art metric for comparing discrete probability distributions, but its high distinguishability comes at a high cost in computational complexity. Even though linear-complexity approximation algorithms have been proposed to improve its scalability, these algorithms are either limited to vector spaces with only a few dimensions or they become ineffective when the degree of overlap between the probability distributions is high. We propose novel approximation algorithms that overcome both of these limitations, yet still achieve linear time complexity. All our algorithms are data parallel, and therefore, we can take advantage of massively parallel computing engines, such as Graphics Processing Units (GPUs). On the popular text-based 20 Newsgroups dataset, the new algorithms are four orders of magnitude faster than a multi-threaded CPU implementation of Word Mover\u2019s Distance and match its search accuracy. 
On MNIST images, the new algorithms are four orders of magnitude faster than Cuturi\u2019s GPU implementation of the Sinkhorn\u2019s algorithm while offering a slightly higher search accuracy.", "bibtex": "@InProceedings{pmlr-v97-atasu19a,\n title = \t {Linear-Complexity Data-Parallel Earth Mover\u2019s Distance Approximations},\n author = {Atasu, Kubilay and Mittelholzer, Thomas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {364--373},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/atasu19a/atasu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/atasu19a.html},\n abstract = \t {The Earth Mover\u2019s Distance (EMD) is a state-of-the art metric for comparing discrete probability distributions, but its high distinguishability comes at a high cost in computational complexity. Even though linear-complexity approximation algorithms have been proposed to improve its scalability, these algorithms are either limited to vector spaces with only a few dimensions or they become ineffective when the degree of overlap between the probability distributions is high. We propose novel approximation algorithms that overcome both of these limitations, yet still achieve linear time complexity. All our algorithms are data parallel, and therefore, we can take advantage of massively parallel computing engines, such as Graphics Processing Units (GPUs). On the popular text-based 20 Newsgroups dataset, the new algorithms are four orders of magnitude faster than a multi-threaded CPU implementation of Word Mover\u2019s Distance and match its search accuracy. On MNIST images, the new algorithms are four orders of magnitude faster than Cuturi\u2019s GPU implementation of the Sinkhorn\u2019s algorithm while offering a slightly higher search accuracy.}\n}", "pdf": "http://proceedings.mlr.press/v97/atasu19a/atasu19a.pdf", "supp": "", "pdf_size": 858691, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14762827563471854709&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "IBM Research - Zurich; HSR Hochschule f\u00fcr Technik, Rapperswil", "aff_domain": "zurich.ibm.com; ", "email": "zurich.ibm.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/atasu19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "IBM;HSR Hochschule fur Technik", "aff_unique_dep": "Research;", "aff_unique_url": "https://www.ibm.com/research;https://www.hsr.ch", "aff_unique_abbr": "IBM;HSR", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Zurich;Rapperswil", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Lipschitz Generative Adversarial Nets", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3698", "id": "3698", "author_site": "Zhiming Zhou, Jiadong Liang, Yuxuan Song, Lantao Yu, Hongwei Wang, Weinan Zhang, Yong Yu, Zhihua Zhang", "author": "Zhiming Zhou; Jiadong Liang; Yuxuan Song; Lantao Yu; Hongwei Wang; Weinan Zhang; Yong Yu; Zhihua Zhang", "abstract": "In this paper we show that generative adversarial networks (GANs) without restriction on the discriminative function space commonly suffer from the problem that the gradient produced by the discriminator is uninformative to guide the generator. 
By contrast, Wasserstein GAN (WGAN), where the discriminative function is restricted to 1-Lipschitz, does not suffer from such a gradient uninformativeness problem. We further show in the paper that the model with a compact dual form of Wasserstein distance, where the Lipschitz condition is relaxed, may also theoretically suffer from this issue. This implies the importance of Lipschitz condition and motivates us to study the general formulation of GANs with Lipschitz constraint, which leads to a new family of GANs that we call Lipschitz GANs (LGANs). We show that LGANs guarantee the existence and uniqueness of the optimal discriminative function as well as the existence of a unique Nash equilibrium. We prove that LGANs are generally capable of eliminating the gradient uninformativeness problem. According to our empirical analysis, LGANs are more stable and generate consistently higher quality samples compared with WGAN.", "bibtex": "@InProceedings{pmlr-v97-zhou19c,\n title = \t {{L}ipschitz Generative Adversarial Nets},\n author = {Zhou, Zhiming and Liang, Jiadong and Song, Yuxuan and Yu, Lantao and Wang, Hongwei and Zhang, Weinan and Yu, Yong and Zhang, Zhihua},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7584--7593},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhou19c/zhou19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhou19c.html},\n abstract = \t {In this paper we show that generative adversarial networks (GANs) without restriction on the discriminative function space commonly suffer from the problem that the gradient produced by the discriminator is uninformative to guide the generator. By contrast, Wasserstein GAN (WGAN), where the discriminative function is restricted to 1-Lipschitz, does not suffer from such a gradient uninformativeness problem. We further show in the paper that the model with a compact dual form of Wasserstein distance, where the Lipschitz condition is relaxed, may also theoretically suffer from this issue. This implies the importance of Lipschitz condition and motivates us to study the general formulation of GANs with Lipschitz constraint, which leads to a new family of GANs that we call Lipschitz GANs (LGANs). We show that LGANs guarantee the existence and uniqueness of the optimal discriminative function as well as the existence of a unique Nash equilibrium. We prove that LGANs are generally capable of eliminating the gradient uninformativeness problem. 
According to our empirical analysis, LGANs are more stable and generate consistently higher quality samples compared with WGAN.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhou19c/zhou19c.pdf", "supp": "", "pdf_size": 1930389, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17592312956170121478&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Shanghai Jiao Tong University; Peking University; Shanghai Jiao Tong University; Stanford University; Stanford University; Shanghai Jiao Tong University; Shanghai Jiao Tong University; Peking University", "aff_domain": "apex.sjtu.edu.cn; ; ; ; ; ; ; ", "email": "apex.sjtu.edu.cn; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/zhou19c.html", "aff_unique_index": "0;1;0;2;2;0;0;1", "aff_unique_norm": "Shanghai Jiao Tong University;Peking University;Stanford University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;http://www.pku.edu.cn;https://www.stanford.edu", "aff_unique_abbr": "SJTU;Peking U;Stanford", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;1;1;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Locally Private Bayesian Inference for Count Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4109", "id": "4109", "author_site": "Aaron Schein, Steven Wu, Alexandra Schofield, Mingyuan Zhou, Hanna Wallach", "author": "Aaron Schein; Zhiwei Steven Wu; Alexandra Schofield; Mingyuan Zhou; Hanna Wallach", "abstract": "We present a general and modular method for privacy-preserving Bayesian inference for Poisson factorization, a broad class of models that includes some of the most widely used models in the social sciences. Our method satisfies limited-precision local privacy, a generalization of local differential privacy that we introduce to formulate appropriate privacy guarantees for sparse count data. We present an MCMC algorithm that approximates the posterior distribution over the latent variables conditioned on data that has been locally privatized by the geometric mechanism. Our method is based on two insights: 1) a novel reinterpretation of the geometric mechanism in terms of the Skellam distribution and 2) a general theorem that relates the Skellam and Bessel distributions. We demonstrate our method\u2019s utility using two case studies that involve real-world email data. 
We show that our method consistently outperforms the commonly used naive approach, wherein inference proceeds as usual, treating the locally privatized data as if it were not privatized.", "bibtex": "@InProceedings{pmlr-v97-schein19a,\n title = \t {Locally Private {B}ayesian Inference for Count Models},\n author = {Schein, Aaron and Wu, Zhiwei Steven and Schofield, Alexandra and Zhou, Mingyuan and Wallach, Hanna},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5638--5648},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/schein19a/schein19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/schein19a.html},\n abstract = \t {We present a general and modular method for privacy-preserving Bayesian inference for Poisson factorization, a broad class of models that includes some of the most widely used models in the social sciences. Our method satisfies limited-precision local privacy, a generalization of local differential privacy that we introduce to formulate appropriate privacy guarantees for sparse count data. We present an MCMC algorithm that approximates the posterior distribution over the latent variables conditioned on data that has been locally privatized by the geometric mechanism. Our method is based on two insights: 1) a novel reinterpretation of the geometric mechanism in terms of the Skellam distribution and 2) a general theorem that relates the Skellam and Bessel distributions. We demonstrate our method\u2019s utility using two case studies that involve real-world email data. We show that our method consistently outperforms the commonly used naive approach, wherein inference proceeds as usual, treating the locally privatized data as if it were not privatized.}\n}", "pdf": "http://proceedings.mlr.press/v97/schein19a/schein19a.pdf", "supp": "", "pdf_size": 3131627, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16686616783132449314&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "University of Massachusetts Amherst; University of Minnesota; Cornell University; University of Texas at Austin; Microsoft", "aff_domain": "cs.umass.edu; ; ; ; ", "email": "cs.umass.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/schein19a.html", "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "University of Massachusetts Amherst;University of Minnesota;Cornell University;University of Texas at Austin;Microsoft", "aff_unique_dep": ";;;;Microsoft Corporation", "aff_unique_url": "https://www.umass.edu;https://www.minnesota.edu;https://www.cornell.edu;https://www.utexas.edu;https://www.microsoft.com", "aff_unique_abbr": "UMass Amherst;UMN;Cornell;UT Austin;Microsoft", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Amherst;;Austin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Look Ma, No Latent Variables: Accurate Cutset Networks via Compilation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3939", "id": "3939", "author_site": "Tahrima Rahman, Shasha Jin, Vibhav Gogate", "author": "Tahrima Rahman; Shasha Jin; Vibhav Gogate", "abstract": "Tractable probabilistic models obviate the need for unreliable approximate inference approaches and as a result often 
yield accurate query answers in practice. However, most tractable models that achieve state-of-the-art generalization performance (measured using test set likelihood score) use latent variables. Such models admit poly-time marginal (MAR) inference but do not admit poly-time (full) maximum-a-posteriori (MAP) inference. To address this problem, in this paper, we propose a novel approach for inducing cutset networks, a well-known tractable, highly interpretable representation that does not use latent variables and admits linear time MAR as well as MAP inference. Our approach addresses a major limitation of existing techniques that learn cutset networks from data in that their accuracy is quite low as compared to latent variable models such as ensembles of cutset networks and sum-product networks. The key idea in our approach is to construct deep cutset networks by not only learning them from data but also compiling them from a more accurate latent tractable model. We show experimentally that our new approach yields more accurate MAP estimates as compared with existing approaches and significantly improves the test set log-likelihood score of cutset networks bringing them closer in terms of generalization performance to latent variable models.", "bibtex": "@InProceedings{pmlr-v97-rahman19a,\n title = \t {Look Ma, No Latent Variables: Accurate Cutset Networks via Compilation},\n author = {Rahman, Tahrima and Jin, Shasha and Gogate, Vibhav},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5311--5320},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rahman19a/rahman19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rahman19a.html},\n abstract = \t {Tractable probabilistic models obviate the need for unreliable approximate inference approaches and as a result often yield accurate query answers in practice. However, most tractable models that achieve state-of-the-art generalization performance (measured using test set likelihood score) use latent variables. Such models admit poly-time marginal (MAR) inference but do not admit poly-time (full) maximum-a-posteriori (MAP) inference. To address this problem, in this paper, we propose a novel approach for inducing cutset networks, a well-known tractable, highly interpretable representation that does not use latent variables and admits linear time MAR as well as MAP inference. Our approach addresses a major limitation of existing techniques that learn cutset networks from data in that their accuracy is quite low as compared to latent variable models such as ensembles of cutset networks and sum-product networks. The key idea in our approach is to construct deep cutset networks by not only learning them from data but also compiling them from a more accurate latent tractable model. 
We show experimentally that our new approach yields more accurate MAP estimates as compared with existing approaches and significantly improves the test set log-likelihood score of cutset networks bringing them closer in terms of generalization performance to latent variable models.}\n}", "pdf": "http://proceedings.mlr.press/v97/rahman19a/rahman19a.pdf", "supp": "", "pdf_size": 4513293, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13090738231260644675&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Department of Computer Science, The University of Texas at Dallas, United States; Department of Computer Science, The University of Texas at Dallas, United States; Department of Computer Science, The University of Texas at Dallas, United States", "aff_domain": "utdallas.edu; ; ", "email": "utdallas.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/rahman19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Dallas", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.utdallas.edu", "aff_unique_abbr": "UT Dallas", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Dallas", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Lorentzian Distance Learning for Hyperbolic Representations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3914", "id": "3914", "author_site": "Marc Law, Renjie Liao, Jake Snell, Richard Zemel", "author": "Marc Law; Renjie Liao; Jake Snell; Richard Zemel", "abstract": "We introduce an approach to learn representations based on the Lorentzian distance in hyperbolic geometry. Hyperbolic geometry is especially suited to hierarchically-structured datasets, which are prevalent in the real world. Current hyperbolic representation learning methods compare examples with the Poincar\u00e9 distance. They try to minimize the distance of each node in a hierarchy with its descendants while maximizing its distance with other nodes. This formulation produces node representations close to the centroid of their descendants. To obtain efficient and interpretable algorithms, we exploit the fact that the centroid w.r.t the squared Lorentzian distance can be written in closed-form. We show that the Euclidean norm of such a centroid decreases as the curvature of the hyperbolic space decreases. This property makes it appropriate to represent hierarchies where parent nodes minimize the distances to their descendants and have smaller Euclidean norm than their children. Our approach obtains state-of-the-art results in retrieval and classification tasks on different datasets.", "bibtex": "@InProceedings{pmlr-v97-law19a,\n title = \t {Lorentzian Distance Learning for Hyperbolic Representations},\n author = {Law, Marc and Liao, Renjie and Snell, Jake and Zemel, Richard},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3672--3681},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/law19a/law19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/law19a.html},\n abstract = \t {We introduce an approach to learn representations based on the Lorentzian distance in hyperbolic geometry. 
Hyperbolic geometry is especially suited to hierarchically-structured datasets, which are prevalent in the real world. Current hyperbolic representation learning methods compare examples with the Poincar\u00e9 distance. They try to minimize the distance of each node in a hierarchy with its descendants while maximizing its distance with other nodes. This formulation produces node representations close to the centroid of their descendants. To obtain efficient and interpretable algorithms, we exploit the fact that the centroid w.r.t the squared Lorentzian distance can be written in closed-form. We show that the Euclidean norm of such a centroid decreases as the curvature of the hyperbolic space decreases. This property makes it appropriate to represent hierarchies where parent nodes minimize the distances to their descendants and have smaller Euclidean norm than their children. Our approach obtains state-of-the-art results in retrieval and classification tasks on different datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/law19a/law19a.pdf", "supp": "", "pdf_size": 1061632, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18031661823590377058&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Toronto + Vector Institute + NVIDIA; University of Toronto + Vector Institute; University of Toronto + Vector Institute; University of Toronto + Vector Institute", "aff_domain": "cs.toronto.edu; ; ; ", "email": "cs.toronto.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/law19a.html", "aff_unique_index": "0+1+2;0+1;0+1;0+1", "aff_unique_norm": "University of Toronto;Vector Institute;NVIDIA", "aff_unique_dep": ";;NVIDIA Corporation", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai/;https://www.nvidia.com", "aff_unique_abbr": "U of T;Vector Institute;NVIDIA", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+1;0+0;0+0;0+0", "aff_country_unique": "Canada;United States" }, { "title": "Loss Landscapes of Regularized Linear Autoencoders", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4260", "id": "4260", "author_site": "Daniel Kunin, Jonathan Bloom, Aleksandrina Goeva, Cotton Seed", "author": "Daniel Kunin; Jonathan Bloom; Aleksandrina Goeva; Cotton Seed", "abstract": "Autoencoders are a deep learning model for representation learning. When trained to minimize the distance between the data and its reconstruction, linear autoencoders (LAEs) learn the subspace spanned by the top principal directions but cannot learn the principal directions themselves. In this paper, we prove that $L_2$-regularized LAEs are symmetric at all critical points and learn the principal directions as the left singular vectors of the decoder. We smoothly parameterize the critical manifold and relate the minima to the MAP estimate of probabilistic PCA. 
We illustrate these results empirically and consider implications for PCA algorithms, computational neuroscience, and the algebraic topology of learning.", "bibtex": "@InProceedings{pmlr-v97-kunin19a,\n title = \t {Loss Landscapes of Regularized Linear Autoencoders},\n author = {Kunin, Daniel and Bloom, Jonathan and Goeva, Aleksandrina and Seed, Cotton},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3560--3569},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kunin19a/kunin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kunin19a.html},\n abstract = \t {Autoencoders are a deep learning model for representation learning. When trained to minimize the distance between the data and its reconstruction, linear autoencoders (LAEs) learn the subspace spanned by the top principal directions but cannot learn the principal directions themselves. In this paper, we prove that $L_2$-regularized LAEs are symmetric at all critical points and learn the principal directions as the left singular vectors of the decoder. We smoothly parameterize the critical manifold and relate the minima to the MAP estimate of probabilistic PCA. We illustrate these results empirically and consider implications for PCA algorithms, computational neuroscience, and the algebraic topology of learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/kunin19a/kunin19a.pdf", "supp": "", "pdf_size": 1580104, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15048938764743692524&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Institute for Computational and Mathematical Engineering, Stanford University; Broad Institute of MIT and Harvard; Broad Institute of MIT and Harvard; Broad Institute of MIT and Harvard", "aff_domain": "stanford.edu;broadinstitute.org; ; ", "email": "stanford.edu;broadinstitute.org; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/kunin19a.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Stanford University;Broad Institute", "aff_unique_dep": "Institute for Computational and Mathematical Engineering;", "aff_unique_url": "https://www.stanford.edu;https://www.broadinstitute.org", "aff_unique_abbr": "Stanford;Broad", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Lossless or Quantized Boosting with Integer Arithmetic", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3592", "id": "3592", "author_site": "Richard Nock, Robert C Williamson", "author": "Richard Nock; Robert Williamson", "abstract": "In supervised learning, efficiency often starts with the choice of a good loss: support vector machines popularised Hinge loss, Adaboost popularised the exponential loss, etc. Recent trends in machine learning have highlighted the necessity for training routines to meet tight requirements on communication, bandwidth, energy, operations, encoding, among others. Fitting the often decades-old state of the art training routines into these new constraints does not go without pain and uncertainty or reduction in the original guarantees. 
Our paper starts with the design of a new strictly proper canonical, twice differentiable loss called the Q-loss. Importantly, its mirror update over (arbitrary) rational inputs uses only integer arithmetics \u2013 more precisely, the sole use of $+, -, /, \\times, |.|$. We build a learning algorithm which is able, under mild assumptions, to achieve a lossless boosting-compliant training. We give conditions for a quantization of its main memory footprint, weights, to be done while keeping the whole algorithm boosting-compliant. Experiments display that the algorithm can achieve a fast convergence during the early boosting rounds compared to AdaBoost, even with a weight storage that can be 30+ times smaller. Lastly, we show that the Bayes risk of the Q-loss can be used as node splitting criterion for decision trees and guarantees optimal boosting convergence.", "bibtex": "@InProceedings{pmlr-v97-nock19a,\n title = \t {Lossless or Quantized Boosting with Integer Arithmetic},\n author = {Nock, Richard and Williamson, Robert},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4829--4838},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nock19a/nock19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nock19a.html},\n abstract = \t {In supervised learning, efficiency often starts with the choice of a good loss: support vector machines popularised Hinge loss, Adaboost popularised the exponential loss, etc. Recent trends in machine learning have highlighted the necessity for training routines to meet tight requirements on communication, bandwidth, energy, operations, encoding, among others. Fitting the often decades-old state of the art training routines into these new constraints does not go without pain and uncertainty or reduction in the original guarantees. Our paper starts with the design of a new strictly proper canonical, twice differentiable loss called the Q-loss. Importantly, its mirror update over (arbitrary) rational inputs uses only integer arithmetics \u2013 more precisely, the sole use of $+, -, /, \\times, |.|$. We build a learning algorithm which is able, under mild assumptions, to achieve a lossless boosting-compliant training. We give conditions for a quantization of its main memory footprint, weights, to be done while keeping the whole algorithm boosting-compliant. Experiments display that the algorithm can achieve a fast convergence during the early boosting rounds compared to AdaBoost, even with a weight storage that can be 30+ times smaller. 
Lastly, we show that the Bayes risk of the Q-loss can be used as node splitting criterion for decision trees and guarantees optimal boosting convergence.}\n}", "pdf": "http://proceedings.mlr.press/v97/nock19a/nock19a.pdf", "supp": "", "pdf_size": 447421, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12697723213247026932&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Data61+The Australian National University+The University of Sydney; The Australian National University+Data61", "aff_domain": "data61.csiro.au;anu.edu.au", "email": "data61.csiro.au;anu.edu.au", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/nock19a.html", "aff_unique_index": "0+1+2;1+0", "aff_unique_norm": "Data61;Australian National University;University of Sydney", "aff_unique_dep": ";;", "aff_unique_url": "https://data61.csiro.au;https://www.anu.edu.au;https://www.sydney.edu.au", "aff_unique_abbr": "Data61;ANU;USYD", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0", "aff_country_unique": "Australia" }, { "title": "Low Latency Privacy Preserving Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3801", "id": "3801", "author_site": "Alon Brutzkus, Ran Gilad-Bachrach, Oren Elisha", "author": "Alon Brutzkus; Ran Gilad-Bachrach; Oren Elisha", "abstract": "When applying machine learning to sensitive data, one has to find a balance between accuracy, information security, and computational-complexity. Recent studies combined Homomorphic Encryption with neural networks to make inferences while protecting against information leakage. However, these methods are limited by the width and depth of neural networks that can be used (and hence the accuracy) and exhibit high latency even for relatively simple networks. In this study we provide two solutions that address these limitations. In the first solution, we present more than 10\\times improvement in latency and enable inference on wider networks compared to prior attempts with the same level of security. The improved performance is achieved by novel methods to represent the data during the computation. In the second solution, we apply the method of transfer learning to provide private inference services using deep networks with latency of \\sim0.16 seconds. We demonstrate the efficacy of our methods on several computer vision tasks.", "bibtex": "@InProceedings{pmlr-v97-brutzkus19a,\n title = \t {Low Latency Privacy Preserving Inference},\n author = {Brutzkus, Alon and Gilad-Bachrach, Ran and Elisha, Oren},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {812--821},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/brutzkus19a/brutzkus19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/brutzkus19a.html},\n abstract = \t {When applying machine learning to sensitive data, one has to find a balance between accuracy, information security, and computational-complexity. Recent studies combined Homomorphic Encryption with neural networks to make inferences while protecting against information leakage. 
However, these methods are limited by the width and depth of neural networks that can be used (and hence the accuracy) and exhibit high latency even for relatively simple networks. In this study we provide two solutions that address these limitations. In the first solution, we present more than 10\\times improvement in latency and enable inference on wider networks compared to prior attempts with the same level of security. The improved performance is achieved by novel methods to represent the data during the computation. In the second solution, we apply the method of transfer learning to provide private inference services using deep networks with latency of \\sim0.16 seconds. We demonstrate the efficacy of our methods on several computer vision tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/brutzkus19a/brutzkus19a.pdf", "supp": "", "pdf_size": 413189, "gs_citation": 325, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=86142108232916247&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Microsoft Research and Tel Aviv University, Israel; Microsoft, Israel; Microsoft Research, Israel", "aff_domain": "microsoft.com; ;microsoft.com", "email": "microsoft.com; ;microsoft.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/brutzkus19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;Israel" }, { "title": "Lower Bounds for Smooth Nonconvex Finite-Sum Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4168", "id": "4168", "author_site": "Dongruo Zhou, Quanquan Gu", "author": "Dongruo Zhou; Quanquan Gu", "abstract": "Smooth finite-sum optimization has been widely studied in both convex and nonconvex settings. However, existing lower bounds for finite-sum optimization are mostly limited to the setting where each component function is (strongly) convex, while the lower bounds for nonconvex finite-sum optimization remain largely unsolved. In this paper, we study the lower bounds for smooth nonconvex finite-sum optimization, where the objective function is the average of $n$ nonconvex component functions. We prove tight lower bounds for the complexity of finding $\\epsilon$-suboptimal point and $\\epsilon$-approximate stationary point in different settings, for a wide regime of the smallest eigenvalue of the Hessian of the objective function (or each component function). Given our lower bounds, we can show that existing algorithms including {KatyushaX} \\citep{allen2018katyushax}, {Natasha} \\citep{allen2017natasha} and {StagewiseKatyusha} \\citep{yang2018does} have achieved optimal {Incremental First-order Oracle} (IFO) complexity (i.e., number of IFO calls) up to logarithm factors for nonconvex finite-sum optimization. 
We also point out potential ways to further improve these complexity results, in terms of making stronger assumptions or by a different convergence analysis.", "bibtex": "@InProceedings{pmlr-v97-zhou19b,\n title = \t {Lower Bounds for Smooth Nonconvex Finite-Sum Optimization},\n author = {Zhou, Dongruo and Gu, Quanquan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7574--7583},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhou19b/zhou19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhou19b.html},\n abstract = \t {Smooth finite-sum optimization has been widely studied in both convex and nonconvex settings. However, existing lower bounds for finite-sum optimization are mostly limited to the setting where each component function is (strongly) convex, while the lower bounds for nonconvex finite-sum optimization remain largely unsolved. In this paper, we study the lower bounds for smooth nonconvex finite-sum optimization, where the objective function is the average of $n$ nonconvex component functions. We prove tight lower bounds for the complexity of finding $\\epsilon$-suboptimal point and $\\epsilon$-approximate stationary point in different settings, for a wide regime of the smallest eigenvalue of the Hessian of the objective function (or each component function). Given our lower bounds, we can show that existing algorithms including {KatyushaX} \\citep{allen2018katyushax}, {Natasha} \\citep{allen2017natasha} and {StagewiseKatyusha} \\citep{yang2018does} have achieved optimal {Incremental First-order Oracle} (IFO) complexity (i.e., number of IFO calls) up to logarithm factors for nonconvex finite-sum optimization. 
We also point out potential ways to further improve these complexity results, in terms of making stronger assumptions or by a different convergence analysis.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhou19b/zhou19b.pdf", "supp": "", "pdf_size": 345107, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11984382499376541143&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, University of California, Los Angeles; Department of Computer Science, University of California, Los Angeles", "aff_domain": "cs.ucla.edu;cs.ucla.edu", "email": "cs.ucla.edu;cs.ucla.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/zhou19b.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "MASS: Masked Sequence to Sequence Pre-training for Language Generation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3802", "id": "3802", "author_site": "Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu", "author": "Kaitao Song; Xu Tan; Tao Qin; Jianfeng Lu; Tie-Yan Liu", "abstract": "Pre-training and fine-tuning, e.g., BERT\u00a0\\citep{devlin2018bert}, have achieved great success in language understanding by transferring knowledge from rich-resource pre-training task to the low/zero-resource downstream tasks. Inspired by the success of BERT, we propose MAsked Sequence to Sequence pre-training (MASS) for the encoder-decoder based language generation tasks. MASS adopts the encoder-decoder framework to reconstruct a sentence fragment given the remaining part of the sentence: its encoder takes a sentence with randomly masked fragment (several consecutive tokens) as input, and its decoder tries to predict this masked fragment. In this way, MASS can jointly train the encoder and decoder to develop the capability of representation extraction and language modeling. By further fine-tuning on a variety of zero/low-resource language generation tasks, including neural machine translation, text summarization and conversational response generation (3 tasks and totally 8 datasets), MASS achieves significant improvements over the baselines without pre-training or with other pre-training methods. 
Especially, we achieve the state-of-the-art accuracy (30.02 in terms of BLEU score) on the unsupervised English-French translation, even beating the early attention-based supervised model\u00a0\\citep{bahdanau2015neural}.", "bibtex": "@InProceedings{pmlr-v97-song19d,\n title = \t {{MASS}: Masked Sequence to Sequence Pre-training for Language Generation},\n author = {Song, Kaitao and Tan, Xu and Qin, Tao and Lu, Jianfeng and Liu, Tie-Yan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5926--5936},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/song19d/song19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/song19d.html},\n abstract = \t {Pre-training and fine-tuning, e.g., BERT\u00a0\\citep{devlin2018bert}, have achieved great success in language understanding by transferring knowledge from rich-resource pre-training task to the low/zero-resource downstream tasks. Inspired by the success of BERT, we propose MAsked Sequence to Sequence pre-training (MASS) for the encoder-decoder based language generation tasks. MASS adopts the encoder-decoder framework to reconstruct a sentence fragment given the remaining part of the sentence: its encoder takes a sentence with randomly masked fragment (several consecutive tokens) as input, and its decoder tries to predict this masked fragment. In this way, MASS can jointly train the encoder and decoder to develop the capability of representation extraction and language modeling. By further fine-tuning on a variety of zero/low-resource language generation tasks, including neural machine translation, text summarization and conversational response generation (3 tasks and totally 8 datasets), MASS achieves significant improvements over the baselines without pre-training or with other pre-training methods. 
Especially, we achieve the state-of-the-art accuracy (30.02 in terms of BLEU score) on the unsupervised English-French translation, even beating the early attention-based supervised model\u00a0\\citep{bahdanau2015neural}.}\n}", "pdf": "http://proceedings.mlr.press/v97/song19d/song19d.pdf", "supp": "", "pdf_size": 7842140, "gs_citation": 1237, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9265562426073523323&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Key Laboratory of Intelligent Perception and Systems for High-Dimensional Information of Ministry of Education, Nanjing University of Science and Technology+Microsoft Research; Microsoft Research; Microsoft Research; Key Laboratory of Intelligent Perception and Systems for High-Dimensional Information of Ministry of Education, Nanjing University of Science and Technology; Microsoft Research", "aff_domain": "njust.edu.cn;microsoft.com;microsoft.com;njust.edu.cn;microsoft.com", "email": "njust.edu.cn;microsoft.com;microsoft.com;njust.edu.cn;microsoft.com", "github": "https://github.com/microsoft/MASS", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/song19d.html", "aff_unique_index": "0+1;1;1;0;1", "aff_unique_norm": "Nanjing University of Science and Technology;Microsoft", "aff_unique_dep": "Key Laboratory of Intelligent Perception and Systems for High-Dimensional Information;Microsoft Research", "aff_unique_url": "http://www.nust.edu.cn/;https://www.microsoft.com/en-us/research", "aff_unique_abbr": ";MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;1;0;1", "aff_country_unique": "China;United States" }, { "title": "ME-Net: Towards Effective Adversarial Robustness with Matrix Estimation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4204", "id": "4204", "author_site": "Yuzhe Yang, GUO ZHANG, Zhi Xu, Dina Katabi", "author": "Yuzhe Yang; Guo Zhang; Dina Katabi; Zhi Xu", "abstract": "Deep neural networks are vulnerable to adversarial attacks. The literature is rich with algorithms that can easily craft successful adversarial examples. In contrast, the performance of defense techniques still lags behind. This paper proposes ME-Net, a defense method that leverages matrix estimation (ME). In ME-Net, images are preprocessed using two steps: first pixels are randomly dropped from the image; then, the image is reconstructed using ME. We show that this process destroys the adversarial structure of the noise, while re-enforcing the global structure in the original image. Since humans typically rely on such global structures in classifying images, the process makes the network mode compatible with human perception. We conduct comprehensive experiments on prevailing benchmarks such as MNIST, CIFAR-10, SVHN, and Tiny-ImageNet. 
Comparing ME-Net with state-of-the-art defense mechanisms shows that ME-Net consistently outperforms prior techniques, improving robustness against both black-box and white-box attacks.", "bibtex": "@InProceedings{pmlr-v97-yang19e,\n title = \t {{ME}-Net: Towards Effective Adversarial Robustness with Matrix Estimation},\n author = {Yang, Yuzhe and Zhang, Guo and Katabi, Dina and Xu, Zhi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7025--7034},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yang19e/yang19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/yang19e.html},\n abstract = \t {Deep neural networks are vulnerable to adversarial attacks. The literature is rich with algorithms that can easily craft successful adversarial examples. In contrast, the performance of defense techniques still lags behind. This paper proposes ME-Net, a defense method that leverages matrix estimation (ME). In ME-Net, images are preprocessed using two steps: first pixels are randomly dropped from the image; then, the image is reconstructed using ME. We show that this process destroys the adversarial structure of the noise, while re-enforcing the global structure in the original image. Since humans typically rely on such global structures in classifying images, the process makes the network mode compatible with human perception. We conduct comprehensive experiments on prevailing benchmarks such as MNIST, CIFAR-10, SVHN, and Tiny-ImageNet. Comparing ME-Net with state-of-the-art defense mechanisms shows that ME-Net consistently outperforms prior techniques, improving robustness against both black-box and white-box attacks.}\n}", "pdf": "http://proceedings.mlr.press/v97/yang19e/yang19e.pdf", "supp": "", "pdf_size": 8094858, "gs_citation": 222, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15543482510654180189&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "MIT CSAIL; MIT CSAIL; MIT CSAIL; MIT CSAIL", "aff_domain": "mit.edu; ; ;mit.edu", "email": "mit.edu; ; ;mit.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/yang19e.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://www.csail.mit.edu", "aff_unique_abbr": "MIT CSAIL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "MIWAE: Deep Generative Modelling and Imputation of Incomplete Data Sets", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4140", "id": "4140", "author_site": "Pierre-Alexandre Mattei, Jes Frellsen", "author": "Pierre-Alexandre Mattei; Jes Frellsen", "abstract": "We consider the problem of handling missing data with deep latent variable models (DLVMs). First, we present a simple technique to train DLVMs when the training set contains missing-at-random data. Our approach, called MIWAE, is based on the importance-weighted autoencoder (IWAE), and maximises a potentially tight lower bound of the log-likelihood of the observed data. 
Compared to the original IWAE, our algorithm does not induce any additional computational overhead due to the missing data. We also develop Monte Carlo techniques for single and multiple imputation using a DLVM trained on an incomplete data set. We illustrate our approach by training a convolutional DLVM on incomplete static binarisations of MNIST. Moreover, on various continuous data sets, we show that MIWAE provides extremely accurate single imputations, and is highly competitive with state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v97-mattei19a,\n title = \t {{MIWAE}: Deep Generative Modelling and Imputation of Incomplete Data Sets},\n author = {Mattei, Pierre-Alexandre and Frellsen, Jes},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4413--4423},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mattei19a/mattei19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mattei19a.html},\n abstract = \t {We consider the problem of handling missing data with deep latent variable models (DLVMs). First, we present a simple technique to train DLVMs when the training set contains missing-at-random data. Our approach, called MIWAE, is based on the importance-weighted autoencoder (IWAE), and maximises a potentially tight lower bound of the log-likelihood of the observed data. Compared to the original IWAE, our algorithm does not induce any additional computational overhead due to the missing data. We also develop Monte Carlo techniques for single and multiple imputation using a DLVM trained on an incomplete data set. We illustrate our approach by training a convolutional DLVM on incomplete static binarisations of MNIST. 
Moreover, on various continuous data sets, we show that MIWAE provides extremely accurate single imputations, and is highly competitive with state-of-the-art methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/mattei19a/mattei19a.pdf", "supp": "", "pdf_size": 384330, "gs_citation": 356, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8190422968566808439&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Computer Science, IT University of Copenhagen, Denmark; Department of Computer Science, IT University of Copenhagen, Denmark", "aff_domain": "itu.dk;itu.dk", "email": "itu.dk;itu.dk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/mattei19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "IT University of Copenhagen", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://itu.dk", "aff_unique_abbr": "ITU Copenhagen", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Denmark" }, { "title": "MONK Outlier-Robust Mean Embedding Estimation by Median-of-Means", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3874", "id": "3874", "author_site": "Matthieu Lerasle, Zoltan Szabo, Timoth\u00e9e Mathieu, Guillaume Lecue", "author": "Matthieu Lerasle; Zoltan Szabo; Timoth\u00e9e Mathieu; Guillaume Lecue", "abstract": "Mean embeddings provide an extremely flexible and powerful tool in machine learning and statistics to represent probability distributions and define a semi-metric (MMD, maximum mean discrepancy; also called N-distance or energy distance), with numerous successful applications. The representation is constructed as the expectation of the feature map defined by a kernel. As a mean, its classical empirical estimator, however, can be arbitrary severely affected even by a single outlier in case of unbounded features. To the best of our knowledge, unfortunately even the consistency of the existing few techniques trying to alleviate this serious sensitivity bottleneck is unknown. In this paper, we show how the recently emerged principle of median-of-means can be used to design estimators for kernel mean embedding and MMD with excessive resistance properties to outliers, and optimal sub-Gaussian deviation bounds under mild assumptions.", "bibtex": "@InProceedings{pmlr-v97-lerasle19a,\n title = \t {{MONK} Outlier-Robust Mean Embedding Estimation by Median-of-Means},\n author = {Lerasle, Matthieu and Szabo, Zoltan and Mathieu, Timoth{\\'e}e and Lecue, Guillaume},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3782--3793},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lerasle19a/lerasle19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lerasle19a.html},\n abstract = \t {Mean embeddings provide an extremely flexible and powerful tool in machine learning and statistics to represent probability distributions and define a semi-metric (MMD, maximum mean discrepancy; also called N-distance or energy distance), with numerous successful applications. The representation is constructed as the expectation of the feature map defined by a kernel. 
As a mean, its classical empirical estimator, however, can be arbitrary severely affected even by a single outlier in case of unbounded features. To the best of our knowledge, unfortunately even the consistency of the existing few techniques trying to alleviate this serious sensitivity bottleneck is unknown. In this paper, we show how the recently emerged principle of median-of-means can be used to design estimators for kernel mean embedding and MMD with excessive resistance properties to outliers, and optimal sub-Gaussian deviation bounds under mild assumptions.}\n}", "pdf": "http://proceedings.mlr.press/v97/lerasle19a/lerasle19a.pdf", "supp": "", "pdf_size": 750754, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1294367033291625299&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 15, "aff": "Laboratoire de Math\u00e9matiques d\u2019Orsay, Univ. Paris-Sud, France+CNRS, Universit\u00e9 Paris Saclay, France; CMAP, \u00c9cole Polytechnique, Palaiseau, France; Laboratoire de Math\u00e9matiques d\u2019Orsay, Univ. Paris-Sud, France; CREST ENSAE ParisTech, France", "aff_domain": "math.u-psud.fr;polytechnique.edu; ; ", "email": "math.u-psud.fr;polytechnique.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/lerasle19a.html", "aff_unique_index": "0+1;2;0;3", "aff_unique_norm": "University of Paris-Sud;Universit\u00e9 Paris Saclay;Ecole Polytechnique;CREST ENSAE ParisTech", "aff_unique_dep": "Laboratoire de Math\u00e9matiques d\u2019Orsay;;CMAP;CREST", "aff_unique_url": "https://www.universite-paris-sud.fr;https://www.universite-paris-saclay.fr;https://www.ecp.fr;https://www.ensae.fr", "aff_unique_abbr": "Paris-Sud;UPS;\u00c9cole Polytechnique;ENSAE", "aff_campus_unique_index": "0;2;0;3", "aff_campus_unique": "Orsay;;Palaiseau;Paris", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "France" }, { "title": "Making Convolutional Networks Shift-Invariant Again", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4179", "id": "4179", "author": "Richard Zhang", "abstract": "Modern convolutional networks are not shift-invariant, as small input shifts or translations can cause drastic changes in the output. Commonly used downsampling methods, such as max-pooling, strided-convolution, and average-pooling, ignore the sampling theorem. The well-known signal processing fix is anti-aliasing by low-pass filtering before downsampling. However, simply inserting this module into deep networks leads to performance degradation; as a result, it is seldomly used today. We show that when integrated correctly, it is compatible with existing architectural components, such as max-pooling. The technique is general and can be incorporated across layer types and applications, such as image classification and conditional image generation. In addition to increased shift-invariance, we also observe, surprisingly, that anti-aliasing boosts accuracy in ImageNet classification, across several commonly-used architectures. This indicates that anti-aliasing serves as effective regularization. 
Our results demonstrate that this classical signal processing technique has been undeservingly overlooked in modern deep networks.", "bibtex": "@InProceedings{pmlr-v97-zhang19a,\n title = \t {Making Convolutional Networks Shift-Invariant Again},\n author = {Zhang, Richard},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7324--7334},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19a/zhang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19a.html},\n abstract = \t {Modern convolutional networks are not shift-invariant, as small input shifts or translations can cause drastic changes in the output. Commonly used downsampling methods, such as max-pooling, strided-convolution, and average-pooling, ignore the sampling theorem. The well-known signal processing fix is anti-aliasing by low-pass filtering before downsampling. However, simply inserting this module into deep networks leads to performance degradation; as a result, it is seldomly used today. We show that when integrated correctly, it is compatible with existing architectural components, such as max-pooling. The technique is general and can be incorporated across layer types and applications, such as image classification and conditional image generation. In addition to increased shift-invariance, we also observe, surprisingly, that anti-aliasing boosts accuracy in ImageNet classification, across several commonly-used architectures. This indicates that anti-aliasing serves as effective regularization. Our results demonstrate that this classical signal processing technique has been undeservingly overlooked in modern deep networks.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19a/zhang19a.pdf", "supp": "", "pdf_size": 2076956, "gs_citation": 1056, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6405795848737680233&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Adobe Research, San Francisco, CA", "aff_domain": "adobe.com", "email": "adobe.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/zhang19a.html", "aff_unique_index": "0", "aff_unique_norm": "Adobe", "aff_unique_dep": "Adobe Research", "aff_unique_url": "https://research.adobe.com", "aff_unique_abbr": "Adobe", "aff_campus_unique_index": "0", "aff_campus_unique": "San Francisco", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Making Decisions that Reduce Discriminatory Impacts", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4198", "id": "4198", "author_site": "Matt J. Kusner, Chris Russell, Joshua Loftus, Ricardo Silva", "author": "Matt Kusner; Chris Russell; Joshua Loftus; Ricardo Silva", "abstract": "As machine learning algorithms move into real-world settings, it is crucial to ensure they are aligned with societal values. There has been much work on one aspect of this, namely the discriminatory prediction problem: How can we reduce discrimination in the predictions themselves? While an important question, solutions to this problem only apply in a restricted setting, as we have full control over the predictions. Often we care about the non-discrimination of quantities we do not have full control over. 
Thus, we describe another key aspect of this challenge, the discriminatory impact problem: How can we reduce discrimination arising from the real-world impact of decisions? To address this, we describe causal methods that model the relevant parts of the real-world system in which the decisions are made. Unlike previous approaches, these models not only allow us to map the causal pathway of a single decision, but also to model the effect of interference\u2013how the impact on an individual depends on decisions made about other people. Often, the goal of decision policies is to maximize a beneficial impact overall. To reduce the discrimination of these benefits, we devise a constraint inspired by recent work in counterfactual fairness, and give an efficient procedure to solve the constrained optimization problem. We demonstrate our approach with an example: how to increase students taking college entrance exams in New York City public schools.", "bibtex": "@InProceedings{pmlr-v97-kusner19a,\n title = \t {Making Decisions that Reduce Discriminatory Impacts},\n author = {Kusner, Matt and Russell, Chris and Loftus, Joshua and Silva, Ricardo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3591--3600},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kusner19a/kusner19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kusner19a.html},\n abstract = \t {As machine learning algorithms move into real-world settings, it is crucial to ensure they are aligned with societal values. There has been much work on one aspect of this, namely the discriminatory prediction problem: How can we reduce discrimination in the predictions themselves? While an important question, solutions to this problem only apply in a restricted setting, as we have full control over the predictions. Often we care about the non-discrimination of quantities we do not have full control over. Thus, we describe another key aspect of this challenge, the discriminatory impact problem: How can we reduce discrimination arising from the real-world impact of decisions? To address this, we describe causal methods that model the relevant parts of the real-world system in which the decisions are made. Unlike previous approaches, these models not only allow us to map the causal pathway of a single decision, but also to model the effect of interference\u2013how the impact on an individual depends on decisions made about other people. Often, the goal of decision policies is to maximize a beneficial impact overall. To reduce the discrimination of these benefits, we devise a constraint inspired by recent work in counterfactual fairness, and give an efficient procedure to solve the constrained optimization problem. 
We demonstrate our approach with an example: how to increase students taking college entrance exams in New York City public schools.}\n}", "pdf": "http://proceedings.mlr.press/v97/kusner19a/kusner19a.pdf", "supp": "", "pdf_size": 5412293, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17313334050897020325&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "The Alan Turing Institute+University of Oxford; The Alan Turing Institute+University of Surrey; New York University; University College London", "aff_domain": "cs.ox.ac.uk;turing.ac.uk; ; ", "email": "cs.ox.ac.uk;turing.ac.uk; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/kusner19a.html", "aff_unique_index": "0+1;0+2;3;4", "aff_unique_norm": "Alan Turing Institute;University of Oxford;University of Surrey;New York University;University College London", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.turing.ac.uk;https://www.ox.ac.uk;https://www.surrey.ac.uk;https://www.nyu.edu;https://www.ucl.ac.uk", "aff_unique_abbr": "ATI;Oxford;Surrey;NYU;UCL", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Making Deep Q-learning methods robust to time discretization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4112", "id": "4112", "author_site": "Corentin Tallec, Leonard Blier, Yann Ollivier", "author": "Corentin Tallec; L\u00e9onard Blier; Yann Ollivier", "abstract": "Despite remarkable successes, Deep Reinforcement Learning (DRL) is not robust to hyperparameterization, implementation details, or small environment changes (Henderson et al. 2017, Zhang et al. 2018). Overcoming such sensitivity is key to making DRL applicable to real world problems. In this paper, we identify sensitivity to time discretization in near continuous-time environments as a critical factor; this covers, e.g., changing the number of frames per second, or the action frequency of the controller. Empirically, we find that Q-learning-based approaches such as Deep Q-learning (Mnih et al., 2015) and Deep Deterministic Policy Gradient (Lillicrap et al., 2015) collapse with small time steps. Formally, we prove that Q-learning does not exist in continuous time. We detail a principled way to build an off-policy RL algorithm that yields similar performances over a wide range of time discretizations, and confirm this robustness empirically.", "bibtex": "@InProceedings{pmlr-v97-tallec19a,\n title = \t {Making Deep Q-learning methods robust to time discretization},\n author = {Tallec, Corentin and Blier, L{\\'e}onard and Ollivier, Yann},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6096--6104},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tallec19a/tallec19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tallec19a.html},\n abstract = \t {Despite remarkable successes, Deep Reinforcement Learning (DRL) is not robust to hyperparameterization, implementation details, or small environment changes (Henderson et al. 2017, Zhang et al. 2018). Overcoming such sensitivity is key to making DRL applicable to real world problems. 
In this paper, we identify sensitivity to time discretization in near continuous-time environments as a critical factor; this covers, e.g., changing the number of frames per second, or the action frequency of the controller. Empirically, we find that Q-learning-based approaches such as Deep Q-learning (Mnih et al., 2015) and Deep Deterministic Policy Gradient (Lillicrap et al., 2015) collapse with small time steps. Formally, we prove that Q-learning does not exist in continuous time. We detail a principled way to build an off-policy RL algorithm that yields similar performances over a wide range of time discretizations, and confirm this robustness empirically.}\n}", "pdf": "http://proceedings.mlr.press/v97/tallec19a/tallec19a.pdf", "supp": "", "pdf_size": 1234553, "gs_citation": 114, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8552260394006938416&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "TAckling the Underspecified, Universit\u00e9 Paris Sud; TAckling the Underspecified, Universit\u00e9 Paris Sud + Facebook Artificial Intelligence Research; Facebook Artificial Intelligence Research", "aff_domain": "inria.fr; ; ", "email": "inria.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/tallec19a.html", "aff_unique_index": "0;0+1;1", "aff_unique_norm": "Universit\u00e9 Paris Sud;Meta", "aff_unique_dep": ";Artificial Intelligence Research", "aff_unique_url": "https://www.universite-paris-sud.fr;https://research.facebook.com", "aff_unique_abbr": "UPS;FAIR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0+1;1", "aff_country_unique": "France;United States" }, { "title": "Mallows ranking models: maximum likelihood estimate and regeneration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3913", "id": "3913", "author": "Wenpin Tang", "abstract": "This paper is concerned with various Mallows ranking models. We study the statistical properties of the MLE of Mallows\u2019 $\\phi$ model. We also make connections of various Mallows ranking models, encompassing recent progress in mathematics. Motivated by the infinite top-$t$ ranking model, we propose an algorithm to select the model size $t$ automatically. The key idea relies on the renewal property of such an infinite random permutation. Our algorithm shows good performance on several data sets.", "bibtex": "@InProceedings{pmlr-v97-tang19a,\n title = \t {Mallows ranking models: maximum likelihood estimate and regeneration},\n author = {Tang, Wenpin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6125--6134},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tang19a/tang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tang19a.html},\n abstract = \t {This paper is concerned with various Mallows ranking models. We study the statistical properties of the MLE of Mallows\u2019 $\\phi$ model. We also make connections of various Mallows ranking models, encompassing recent progress in mathematics. Motivated by the infinite top-$t$ ranking model, we propose an algorithm to select the model size $t$ automatically. The key idea relies on the renewal property of such an infinite random permutation. 
Our algorithm shows good performance on several data sets.}\n}", "pdf": "http://proceedings.mlr.press/v97/tang19a/tang19a.pdf", "supp": "", "pdf_size": 342497, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5348335039259360210&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Mathematics, University of California, Los Angeles, USA", "aff_domain": "math.ucla.edu", "email": "math.ucla.edu", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/tang19a.html", "aff_unique_index": "0", "aff_unique_norm": "University of California, Los Angeles", "aff_unique_dep": "Department of Mathematics", "aff_unique_url": "https://www.ucla.edu", "aff_unique_abbr": "UCLA", "aff_campus_unique_index": "0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Manifold Mixup: Better Representations by Interpolating Hidden States", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3776", "id": "3776", "author_site": "Vikas Verma, Alex Lamb, Christopher Beckham, Amir Najafi, Ioannis Mitliagkas, David Lopez-Paz, Yoshua Bengio", "author": "Vikas Verma; Alex Lamb; Christopher Beckham; Amir Najafi; Ioannis Mitliagkas; David Lopez-Paz; Yoshua Bengio", "abstract": "Deep neural networks excel at learning the training data, but often provide incorrect and confident predictions when evaluated on slightly different test examples. This includes distribution shifts, outliers, and adversarial examples. To address these issues, we propose \\manifoldmixup{}, a simple regularizer that encourages neural networks to predict less confidently on interpolations of hidden representations. \\manifoldmixup{} leverages semantic interpolations as additional training signal, obtaining neural networks with smoother decision boundaries at multiple levels of representation. As a result, neural networks trained with \\manifoldmixup{} learn flatter class-representations, that is, with fewer directions of variance. We prove theory on why this flattening happens under ideal conditions, validate it empirically on practical situations, and connect it to the previous works on information theory and generalization. In spite of incurring no significant computation and being implemented in a few lines of code, \\manifoldmixup{} improves strong baselines in supervised learning, robustness to single-step adversarial attacks, and test log-likelihood.", "bibtex": "@InProceedings{pmlr-v97-verma19a,\n title = \t {Manifold Mixup: Better Representations by Interpolating Hidden States},\n author = {Verma, Vikas and Lamb, Alex and Beckham, Christopher and Najafi, Amir and Mitliagkas, Ioannis and Lopez-Paz, David and Bengio, Yoshua},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6438--6447},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/verma19a/verma19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/verma19a.html},\n abstract = \t {Deep neural networks excel at learning the training data, but often provide incorrect and confident predictions when evaluated on slightly different test examples. This includes distribution shifts, outliers, and adversarial examples. 
To address these issues, we propose \\manifoldmixup{}, a simple regularizer that encourages neural networks to predict less confidently on interpolations of hidden representations. \\manifoldmixup{} leverages semantic interpolations as additional training signal, obtaining neural networks with smoother decision boundaries at multiple levels of representation. As a result, neural networks trained with \\manifoldmixup{} learn flatter class-representations, that is, with fewer directions of variance. We prove theory on why this flattening happens under ideal conditions, validate it empirically on practical situations, and connect it to the previous works on information theory and generalization. In spite of incurring no significant computation and being implemented in a few lines of code, \\manifoldmixup{} improves strong baselines in supervised learning, robustness to single-step adversarial attacks, and test log-likelihood.}\n}", "pdf": "http://proceedings.mlr.press/v97/verma19a/verma19a.pdf", "supp": "", "pdf_size": 849687, "gs_citation": 1641, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5005853392111011711&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Aalto University, Finland+Montr\u00e9al Institute for Learning Algorithms (MILA); Montr\u00e9al Institute for Learning Algorithms (MILA); Montr\u00e9al Institute for Learning Algorithms (MILA); Sharif University of Technology; Montr\u00e9al Institute for Learning Algorithms (MILA); Facebook Research; Montr\u00e9al Institute for Learning Algorithms (MILA)", "aff_domain": "gmail.com;aalto.fi;iro.umontreal.ca; ; ; ; ", "email": "gmail.com;aalto.fi;iro.umontreal.ca; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/verma19a.html", "aff_unique_index": "0+1;1;1;2;1;3;1", "aff_unique_norm": "Aalto University;Montr\u00e9al Institute for Learning Algorithms;Sharif University of Technology;Meta", "aff_unique_dep": ";Computer Science;;Facebook Research", "aff_unique_url": "https://www.aalto.fi;https://mila.quebec;https://www.sharif.edu;https://research.facebook.com", "aff_unique_abbr": "Aalto;MILA;SUT;FB Research", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Montr\u00e9al", "aff_country_unique_index": "0+1;1;1;2;1;3;1", "aff_country_unique": "Finland;Canada;Iran;United States" }, { "title": "Matrix-Free Preconditioning in Online Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3934", "id": "3934", "author_site": "Ashok Cutkosky, Tamas Sarlos", "author": "Ashok Cutkosky; Tamas Sarlos", "abstract": "We provide an online convex optimization algorithm with regret that interpolates between the regret of an algorithm using an optimal preconditioning matrix and one using a diagonal preconditioning matrix. Our regret bound is never worse than that obtained by diagonal preconditioning, and in certain setting even surpasses that of algorithms with full-matrix preconditioning. Importantly, our algorithm runs in the same time and space complexity as online gradient descent. Along the way we incorporate new techniques that mildly streamline and improve logarithmic factors in prior regret analyses. 
We conclude by benchmarking our algorithm on synthetic data and deep learning tasks.", "bibtex": "@InProceedings{pmlr-v97-cutkosky19b,\n title = \t {Matrix-Free Preconditioning in Online Learning},\n author = {Cutkosky, Ashok and Sarlos, Tamas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1455--1464},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cutkosky19b/cutkosky19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/cutkosky19b.html},\n abstract = \t {We provide an online convex optimization algorithm with regret that interpolates between the regret of an algorithm using an optimal preconditioning matrix and one using a diagonal preconditioning matrix. Our regret bound is never worse than that obtained by diagonal preconditioning, and in certain setting even surpasses that of algorithms with full-matrix preconditioning. Importantly, our algorithm runs in the same time and space complexity as online gradient descent. Along the way we incorporate new techniques that mildly streamline and improve logarithmic factors in prior regret analyses. We conclude by benchmarking our algorithm on synthetic data and deep learning tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/cutkosky19b/cutkosky19b.pdf", "supp": "", "pdf_size": 1824235, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6039480237146880900&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Google Research, California, USA; Google Research, California, USA", "aff_domain": "google.com; ", "email": "google.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/cutkosky19b.html", "aff_unique_index": "0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google Research", "aff_campus_unique_index": "0;0", "aff_campus_unique": "California", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Maximum Entropy-Regularized Multi-Goal Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3833", "id": "3833", "author_site": "Rui Zhao, Xudong Sun, Volker Tresp", "author": "Rui Zhao; Xudong Sun; Volker Tresp", "abstract": "In Multi-Goal Reinforcement Learning, an agent learns to achieve multiple goals with a goal-conditioned policy. During learning, the agent first collects the trajectories into a replay buffer, and later these trajectories are selected randomly for replay. However, the achieved goals in the replay buffer are often biased towards the behavior policies. From a Bayesian perspective, when there is no prior knowledge about the target goal distribution, the agent should learn uniformly from diverse achieved goals. Therefore, we first propose a novel multi-goal RL objective based on weighted entropy. This objective encourages the agent to maximize the expected return, as well as to achieve more diverse goals. Secondly, we developed a maximum entropy-based prioritization framework to optimize the proposed objective. For evaluation of this framework, we combine it with Deep Deterministic Policy Gradient, both with or without Hindsight Experience Replay. 
On a set of multi-goal robotic tasks of OpenAI Gym, we compare our method with other baselines and show promising improvements in both performance and sample-efficiency.", "bibtex": "@InProceedings{pmlr-v97-zhao19d,\n title = \t {Maximum Entropy-Regularized Multi-Goal Reinforcement Learning},\n author = {Zhao, Rui and Sun, Xudong and Tresp, Volker},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7553--7562},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhao19d/zhao19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhao19d.html},\n abstract = \t {In Multi-Goal Reinforcement Learning, an agent learns to achieve multiple goals with a goal-conditioned policy. During learning, the agent first collects the trajectories into a replay buffer, and later these trajectories are selected randomly for replay. However, the achieved goals in the replay buffer are often biased towards the behavior policies. From a Bayesian perspective, when there is no prior knowledge about the target goal distribution, the agent should learn uniformly from diverse achieved goals. Therefore, we first propose a novel multi-goal RL objective based on weighted entropy. This objective encourages the agent to maximize the expected return, as well as to achieve more diverse goals. Secondly, we developed a maximum entropy-based prioritization framework to optimize the proposed objective. For evaluation of this framework, we combine it with Deep Deterministic Policy Gradient, both with or without Hindsight Experience Replay. 
On a set of multi-goal robotic tasks of OpenAI Gym, we compare our method with other baselines and show promising improvements in both performance and sample-efficiency.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhao19d/zhao19d.pdf", "supp": "", "pdf_size": 3583244, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12004531622883216435&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Faculty of Mathematics, Informatics and Statistics, Ludwig Maximilian University of Munich, Munich, Bavaria, Germany+Siemens AG, Munich, Bavaria, Germany; Faculty of Mathematics, Informatics and Statistics, Ludwig Maximilian University of Munich, Munich, Bavaria, Germany; Faculty of Mathematics, Informatics and Statistics, Ludwig Maximilian University of Munich, Munich, Bavaria, Germany+Siemens AG, Munich, Bavaria, Germany", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zhao19d.html", "aff_unique_index": "0+1;0;0+1", "aff_unique_norm": "Ludwig Maximilian University of Munich;Siemens AG", "aff_unique_dep": "Faculty of Mathematics, Informatics and Statistics;", "aff_unique_url": "https://www.lmu.de;https://www.siemens.com", "aff_unique_abbr": "LMU;Siemens", "aff_campus_unique_index": "0+0;0;0+0", "aff_campus_unique": "Munich", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "Germany" }, { "title": "Maximum Likelihood Estimation for Learning Populations of Parameters", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4310", "id": "4310", "author_site": "Ramya Korlakai Vinayak, Weihao Kong, Gregory Valiant, Sham Kakade", "author": "Ramya Korlakai Vinayak; Weihao Kong; Gregory Valiant; Sham Kakade", "abstract": "Consider a setting with $N$ independent individuals, each with an unknown parameter, $p_i \\in [0, 1]$ drawn from some unknown distribution $P^\\star$. After observing the outcomes of $t$ independent Bernoulli trials, i.e., $X_i \\sim \\text{Binomial}(t, p_i)$ per individual, our objective is to accurately estimate $P^\\star$ in the sparse regime, namely when $t \\ll N$. This problem arises in numerous domains, including the social sciences, psychology, health-care, and biology, where the size of the population under study is usually large yet the number of observations per individual is often limited. Our main result shows that, in this sparse regime where $t \\ll N$, the maximum likelihood estimator (MLE) is both statistically minimax optimal and efficiently computable. Precisely, for sufficiently large $N$, the MLE achieves the information theoretic optimal error bound of $\\mathcal{O}(\\frac{1}{t})$ for $t < c\\log{N}$, with regards to the earth mover\u2019s distance (between the estimated and true distributions). More generally, in an exponentially large interval of $t$ beyond $c \\log{N}$, the MLE achieves the minimax error bound of $\\mathcal{O}(\\frac{1}{\\sqrt{t\\log N}})$. In contrast, regardless of how large $N$ is, the naive \"plug-in\" estimator for this problem only achieves the sub-optimal error of $\\Theta(\\frac{1}{\\sqrt{t}})$. 
Empirically, we also demonstrate the MLE performs well on both synthetic as well as real datasets.", "bibtex": "@InProceedings{pmlr-v97-vinayak19a,\n title = \t {Maximum Likelihood Estimation for Learning Populations of Parameters},\n author = {Vinayak, Ramya Korlakai and Kong, Weihao and Valiant, Gregory and Kakade, Sham},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6448--6457},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/vinayak19a/vinayak19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/vinayak19a.html},\n abstract = \t {Consider a setting with $N$ independent individuals, each with an unknown parameter, $p_i \\in [0, 1]$ drawn from some unknown distribution $P^\\star$. After observing the outcomes of $t$ independent Bernoulli trials, i.e., $X_i \\sim \\text{Binomial}(t, p_i)$ per individual, our objective is to accurately estimate $P^\\star$ in the sparse regime, namely when $t \\ll N$. This problem arises in numerous domains, including the social sciences, psychology, health-care, and biology, where the size of the population under study is usually large yet the number of observations per individual is often limited. Our main result shows that, in this sparse regime where $t \\ll N$, the maximum likelihood estimator (MLE) is both statistically minimax optimal and efficiently computable. Precisely, for sufficiently large $N$, the MLE achieves the information theoretic optimal error bound of $\\mathcal{O}(\\frac{1}{t})$ for $t < c\\log{N}$, with regards to the earth mover\u2019s distance (between the estimated and true distributions). More generally, in an exponentially large interval of $t$ beyond $c \\log{N}$, the MLE achieves the minimax error bound of $\\mathcal{O}(\\frac{1}{\\sqrt{t\\log N}})$. In contrast, regardless of how large $N$ is, the naive \"plug-in\" estimator for this problem only achieves the sub-optimal error of $\\Theta(\\frac{1}{\\sqrt{t}})$. Empirically, we also demonstrate the MLE performs well on both synthetic as well as real datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/vinayak19a/vinayak19a.pdf", "supp": "", "pdf_size": 461110, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11923953414406157851&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle; Department of Computer Science, Stanford University, Stanford; Department of Computer Science, Stanford University, Stanford; Paul G. Allen School of Computer Science and Engineering, University of Washington, Seattle", "aff_domain": "cs.washington.edu; ; ; ", "email": "cs.washington.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/vinayak19a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of Washington;Stanford University", "aff_unique_dep": "Paul G. 
Allen School of Computer Science and Engineering;Department of Computer Science", "aff_unique_url": "https://www.washington.edu;https://www.stanford.edu", "aff_unique_abbr": "UW;Stanford", "aff_campus_unique_index": "0;1;1;0", "aff_campus_unique": "Seattle;Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "MeanSum: A Neural Model for Unsupervised Multi-Document Abstractive Summarization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3891", "id": "3891", "author_site": "Eric Chu, Peter Liu", "author": "Eric Chu; Peter Liu", "abstract": "Abstractive summarization has been studied using neural sequence transduction methods with datasets of large, paired document-summary examples. However, such datasets are rare and the models trained from them do not generalize to other domains. Recently, some progress has been made in learning sequence-to-sequence mappings with only unpaired examples. In our work, we consider the setting where there are only documents (product or business reviews) with no summaries provided, and propose an end-to-end, neural model architecture to perform unsupervised abstractive summarization. Our proposed model consists of an auto-encoder where the mean of the representations of the input reviews decodes to a reasonable summary-review. We consider variants of the proposed architecture and perform an ablation study to show the importance of specific components. We show through metrics and human evaluation that the generated summaries are highly abstractive, fluent, relevant, and representative of the average sentiment of the input reviews. Finally, we collect a ground-truth evaluation dataset and show that our model outperforms a strong extractive baseline.", "bibtex": "@InProceedings{pmlr-v97-chu19b,\n title = \t {{M}ean{S}um: A Neural Model for Unsupervised Multi-Document Abstractive Summarization},\n author = {Chu, Eric and Liu, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1223--1232},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chu19b/chu19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/chu19b.html},\n abstract = \t {Abstractive summarization has been studied using neural sequence transduction methods with datasets of large, paired document-summary examples. However, such datasets are rare and the models trained from them do not generalize to other domains. Recently, some progress has been made in learning sequence-to-sequence mappings with only unpaired examples. In our work, we consider the setting where there are only documents (product or business reviews) with no summaries provided, and propose an end-to-end, neural model architecture to perform unsupervised abstractive summarization. Our proposed model consists of an auto-encoder where the mean of the representations of the input reviews decodes to a reasonable summary-review. We consider variants of the proposed architecture and perform an ablation study to show the importance of specific components. We show through metrics and human evaluation that the generated summaries are highly abstractive, fluent, relevant, and representative of the average sentiment of the input reviews. 
Finally, we collect a ground-truth evaluation dataset and show that our model outperforms a strong extractive baseline.}\n}", "pdf": "http://proceedings.mlr.press/v97/chu19b/chu19b.pdf", "supp": "", "pdf_size": 550032, "gs_citation": 254, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11126017598001925179&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "MIT Media Lab+Google Brain; Google Brain", "aff_domain": "mit.edu;google.com", "email": "mit.edu;google.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/chu19b.html", "aff_unique_index": "0+1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Google", "aff_unique_dep": "Media Lab;Google Brain", "aff_unique_url": "http://www.media.mit.edu/;https://brain.google.com", "aff_unique_abbr": "MIT;Google Brain", "aff_campus_unique_index": "0+1;1", "aff_campus_unique": "Cambridge;Mountain View", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United States" }, { "title": "Measurements of Three-Level Hierarchical Structure in the Outliers in the Spectrum of Deepnet Hessians", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3758", "id": "3758", "author": "Vardan Papyan", "abstract": "We expose a structure in deep classifying neural networks in the derivative of the logits with respect to the parameters of the model, which is used to explain the existence of outliers in the spectrum of the Hessian. Previous works decomposed the Hessian into two components, attributing the outliers to one of them, the so-called Covariance of gradients. We show this term is not a Covariance but a second moment matrix, i.e., it is influenced by means of gradients. These means possess an additive two-way structure that is the source of the outliers in the spectrum. This structure can be used to approximate the principal subspace of the Hessian using certain \"averaging\" operations, avoiding the need for high-dimensional eigenanalysis. We corroborate this claim across different datasets, architectures and sample sizes.", "bibtex": "@InProceedings{pmlr-v97-papyan19a,\n title = \t {Measurements of Three-Level Hierarchical Structure in the Outliers in the Spectrum of Deepnet Hessians},\n author = {Papyan, Vardan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5012--5021},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/papyan19a/papyan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/papyan19a.html},\n abstract = \t {We expose a structure in deep classifying neural networks in the derivative of the logits with respect to the parameters of the model, which is used to explain the existence of outliers in the spectrum of the Hessian. Previous works decomposed the Hessian into two components, attributing the outliers to one of them, the so-called Covariance of gradients. We show this term is not a Covariance but a second moment matrix, i.e., it is influenced by means of gradients. These means possess an additive two-way structure that is the source of the outliers in the spectrum. This structure can be used to approximate the principal subspace of the Hessian using certain \"averaging\" operations, avoiding the need for high-dimensional eigenanalysis. 
We corroborate this claim across different datasets, architectures and sample sizes.}\n}", "pdf": "http://proceedings.mlr.press/v97/papyan19a/papyan19a.pdf", "supp": "", "pdf_size": 9576874, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14622188729818532803&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Statistics, Stanford University, Stanford, CA 94305", "aff_domain": "stanford.edu", "email": "stanford.edu", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/papyan19a.html", "aff_unique_index": "0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Memory-Optimal Direct Convolutions for Maximizing Classification Accuracy in Embedded Applications", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4308", "id": "4308", "author_site": "Albert Gural, Boris Murmann", "author": "Albert Gural; Boris Murmann", "abstract": "In the age of Internet of Things (IoT), embedded devices ranging from ARM Cortex M0s with hundreds of KB of RAM to Arduinos with 2KB RAM are expected to perform increasingly sophisticated classification tasks, such as voice and gesture recognition, activity tracking, and biometric security. While convolutional neural networks (CNNs), together with spectrogram preprocessing, are a natural solution to many of these classification tasks, storage of the network\u2019s activations often exceeds the hard memory constraints of embedded platforms. This paper presents memory-optimal direct convolutions as a way to push classification accuracy as high as possible given strict hardware memory constraints at the expense of extra compute. We therefore explore the opposite end of the compute-memory trade-off curve from standard approaches that minimize latency. We validate the memory-optimal CNN technique with an Arduino implementation of the 10-class MNIST classification task, fitting the network specification, weights, and activations entirely within 2KB SRAM and achieving a state-of-the-art classification accuracy for small-scale embedded systems of 99.15%.", "bibtex": "@InProceedings{pmlr-v97-gural19a,\n title = \t {Memory-Optimal Direct Convolutions for Maximizing Classification Accuracy in Embedded Applications},\n author = {Gural, Albert and Murmann, Boris},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2515--2524},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gural19a/gural19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gural19a.html},\n abstract = \t {In the age of Internet of Things (IoT), embedded devices ranging from ARM Cortex M0s with hundreds of KB of RAM to Arduinos with 2KB RAM are expected to perform increasingly sophisticated classification tasks, such as voice and gesture recognition, activity tracking, and biometric security. 
While convolutional neural networks (CNNs), together with spectrogram preprocessing, are a natural solution to many of these classification tasks, storage of the network\u2019s activations often exceeds the hard memory constraints of embedded platforms. This paper presents memory-optimal direct convolutions as a way to push classification accuracy as high as possible given strict hardware memory constraints at the expense of extra compute. We therefore explore the opposite end of the compute-memory trade-off curve from standard approaches that minimize latency. We validate the memory-optimal CNN technique with an Arduino implementation of the 10-class MNIST classification task, fitting the network specification, weights, and activations entirely within 2KB SRAM and achieving a state-of-the-art classification accuracy for small-scale embedded systems of 99.15%.}\n}", "pdf": "http://proceedings.mlr.press/v97/gural19a/gural19a.pdf", "supp": "", "pdf_size": 7788654, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17168781053968674929&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Department of Electrical Engineering, Stanford University, Stanford, USA; Department of Electrical Engineering, Stanford University, Stanford, USA", "aff_domain": "stanford.edu;stanford.edu", "email": "stanford.edu;stanford.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/gural19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Electrical Engineering", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Meta-Learning Neural Bloom Filters", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3846", "id": "3846", "author_site": "Jack Rae, Sergey Bartunov, Timothy Lillicrap", "author": "Jack Rae; Sergey Bartunov; Timothy Lillicrap", "abstract": "There has been a recent trend in training neural networks to replace data structures that have been crafted by hand, with an aim for faster execution, better accuracy, or greater compression. In this setting, a neural data structure is instantiated by training a network over many epochs of its inputs until convergence. In applications where inputs arrive at high throughput, or are ephemeral, training a network from scratch is not practical. This motivates the need for few-shot neural data structures. In this paper we explore the learning of approximate set membership over a set of data in one-shot via meta-learning. 
We propose a novel memory architecture, the Neural Bloom Filter, which is able to achieve significant compression gains over classical Bloom Filters and existing memory-augmented neural networks.", "bibtex": "@InProceedings{pmlr-v97-rae19a,\n title = \t {Meta-Learning Neural Bloom Filters},\n author = {Rae, Jack and Bartunov, Sergey and Lillicrap, Timothy},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5271--5280},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rae19a/rae19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rae19a.html},\n abstract = \t {There has been a recent trend in training neural networks to replace data structures that have been crafted by hand, with an aim for faster execution, better accuracy, or greater compression. In this setting, a neural data structure is instantiated by training a network over many epochs of its inputs until convergence. In applications where inputs arrive at high throughput, or are ephemeral, training a network from scratch is not practical. This motivates the need for few-shot neural data structures. In this paper we explore the learning of approximate set membership over a set of data in one-shot via meta-learning. We propose a novel memory architecture, the Neural Bloom Filter, which is able to achieve significant compression gains over classical Bloom Filters and existing memory-augmented neural networks.}\n}", "pdf": "http://proceedings.mlr.press/v97/rae19a/rae19a.pdf", "supp": "", "pdf_size": 783191, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5792515027365493091&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "DeepMind, London, UK+CoMPLEX, Computer Science, University College London, London, UK; DeepMind, London, UK; DeepMind, London, UK+CoMPLEX, Computer Science, University College London, London, UK", "aff_domain": "google.com; ; ", "email": "google.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/rae19a.html", "aff_unique_index": "0+1;0;0+1", "aff_unique_norm": "DeepMind;University College London", "aff_unique_dep": ";Computer Science", "aff_unique_url": "https://deepmind.com;https://www.ucl.ac.uk", "aff_unique_abbr": "DeepMind;UCL", "aff_campus_unique_index": "0+0;0;0+0", "aff_campus_unique": "London", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "United Kingdom" }, { "title": "Metric-Optimized Example Weights", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3932", "id": "3932", "author_site": "Sen Zhao, Mahdi Milani Fard, Harikrishna Narasimhan, Maya Gupta", "author": "Sen Zhao; Mahdi Milani Fard; Harikrishna Narasimhan; Maya Gupta", "abstract": "Real-world machine learning applications often have complex test metrics, and may have training and test data that are not identically distributed. Motivated by known connections between complex test metrics and cost-weighted learning, we propose addressing these issues by using a weighted loss function with a standard loss, where the weights on the training examples are learned to optimize the test metric on a validation set. These metric-optimized example weights can be learned for any test metric, including black box and customized ones for specific applications. 
We illustrate the performance of the proposed method on diverse public benchmark datasets and real-world applications. We also provide a generalization bound for the method.", "bibtex": "@InProceedings{pmlr-v97-zhao19b,\n title = \t {Metric-Optimized Example Weights},\n author = {Zhao, Sen and Fard, Mahdi Milani and Narasimhan, Harikrishna and Gupta, Maya},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7533--7542},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhao19b/zhao19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhao19b.html},\n abstract = \t {Real-world machine learning applications often have complex test metrics, and may have training and test data that are not identically distributed. Motivated by known connections between complex test metrics and cost-weighted learning, we propose addressing these issues by using a weighted loss function with a standard loss, where the weights on the training examples are learned to optimize the test metric on a validation set. These metric-optimized example weights can be learned for any test metric, including black box and customized ones for specific applications. We illustrate the performance of the proposed method on diverse public benchmark datasets and real-world applications. We also provide a generalization bound for the method.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhao19b/zhao19b.pdf", "supp": "", "pdf_size": 915296, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5876377887875144782&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Google AI; Google AI; Google AI; Google AI", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/zhao19b.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google AI", "aff_unique_url": "https://ai.google", "aff_unique_abbr": "Google AI", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "MetricGAN: Generative Adversarial Networks based Black-box Metric Scores Optimization for Speech Enhancement", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3820", "id": "3820", "author_site": "Szu-Wei Fu, Chien-Feng Liao, Yu Tsao, Shou-De Lin", "author": "Szu-Wei Fu; Chien-Feng Liao; Yu Tsao; Shou-De Lin", "abstract": "Adversarial loss in a conditional generative adversarial network (GAN) is not designed to directly optimize evaluation metrics of a target task, and thus, may not always guide the generator in a GAN to generate data with improved metric scores. To overcome this issue, we propose a novel MetricGAN approach with an aim to optimize the generator with respect to one or multiple evaluation metrics. Moreover, based on MetricGAN, the metric scores of the generated data can also be arbitrarily specified by users. We tested the proposed MetricGAN on a speech enhancement task, which is particularly suitable to verify the proposed approach because there are multiple metrics measuring different aspects of speech signals. 
Moreover, these metrics are generally complex and could not be fully optimized by Lp or conventional adversarial losses.", "bibtex": "@InProceedings{pmlr-v97-fu19b,\n title = \t {{M}etric{GAN}: Generative Adversarial Networks based Black-box Metric Scores Optimization for Speech Enhancement},\n author = {Fu, Szu-Wei and Liao, Chien-Feng and Tsao, Yu and Lin, Shou-De},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2031--2041},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/fu19b/fu19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/fu19b.html},\n abstract = \t {Adversarial loss in a conditional generative adversarial network (GAN) is not designed to directly optimize evaluation metrics of a target task, and thus, may not always guide the generator in a GAN to generate data with improved metric scores. To overcome this issue, we propose a novel MetricGAN approach with an aim to optimize the generator with respect to one or multiple evaluation metrics. Moreover, based on MetricGAN, the metric scores of the generated data can also be arbitrarily specified by users. We tested the proposed MetricGAN on a speech enhancement task, which is particularly suitable to verify the proposed approach because there are multiple metrics measuring different aspects of speech signals. Moreover, these metrics are generally complex and could not be fully optimized by Lp or conventional adversarial losses.}\n}", "pdf": "http://proceedings.mlr.press/v97/fu19b/fu19b.pdf", "supp": "", "pdf_size": 3020335, "gs_citation": 413, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10740262477107408585&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "National Taiwan University, Taiwan+Academia Sinica, Taiwan; National Taiwan University, Taiwan+Academia Sinica, Taiwan; Academia Sinica, Taiwan; National Taiwan University, Taiwan", "aff_domain": "ntu.edu.tw;ntu.edu.tw;citi.sinica.edu.tw;csie.ntu.edu.tw", "email": "ntu.edu.tw;ntu.edu.tw;citi.sinica.edu.tw;csie.ntu.edu.tw", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/fu19b.html", "aff_unique_index": "0+1;0+1;1;0", "aff_unique_norm": "National Taiwan University;Academia Sinica", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.tw;https://www.sinica.edu.tw", "aff_unique_abbr": "NTU;Academia Sinica", "aff_campus_unique_index": "0+0;0+0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0+0;0+0;0;0", "aff_country_unique": "China" }, { "title": "Metropolis-Hastings Generative Adversarial Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3693", "id": "3693", "author_site": "Ryan Turner, Jane Hung, Eric Frank, Yunus Saatchi, Jason Yosinski", "author": "Ryan Turner; Jane Hung; Eric Frank; Yunus Saatchi; Jason Yosinski", "abstract": "We introduce the Metropolis-Hastings generative adversarial network (MH-GAN), which combines aspects of Markov chain Monte Carlo and GANs. The MH-GAN draws samples from the distribution implicitly defined by a GAN\u2019s discriminator-generator pair, as opposed to standard GANs which draw samples from the distribution defined only by the generator. It uses the discriminator from GAN training to build a wrapper around the generator for improved sampling. 
With a perfect discriminator, this wrapped generator samples from the true distribution on the data exactly even when the generator is imperfect. We demonstrate the benefits of the improved generator on multiple benchmark datasets, including CIFAR-10 and CelebA, using the DCGAN, WGAN, and progressive GAN.", "bibtex": "@InProceedings{pmlr-v97-turner19a,\n title = \t {{M}etropolis-{H}astings Generative Adversarial Networks},\n author = {Turner, Ryan and Hung, Jane and Frank, Eric and Saatchi, Yunus and Yosinski, Jason},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6345--6353},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/turner19a/turner19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/turner19a.html},\n abstract = \t {We introduce the Metropolis-Hastings generative adversarial network (MH-GAN), which combines aspects of Markov chain Monte Carlo and GANs. The MH-GAN draws samples from the distribution implicitly defined by a GAN\u2019s discriminator-generator pair, as opposed to standard GANs which draw samples from the distribution defined only by the generator. It uses the discriminator from GAN training to build a wrapper around the generator for improved sampling. With a perfect discriminator, this wrapped generator samples from the true distribution on the data exactly even when the generator is imperfect. We demonstrate the benefits of the improved generator on multiple benchmark datasets, including CIFAR-10 and CelebA, using the DCGAN, WGAN, and progressive GAN.}\n}", "pdf": "http://proceedings.mlr.press/v97/turner19a/turner19a.pdf", "supp": "", "pdf_size": 5935790, "gs_citation": 127, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18080915212804537296&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Uber AI Labs; Uber AI Labs; Uber AI Labs; Uber AI Labs; Uber AI Labs", "aff_domain": "uber.com; ; ; ; ", "email": "uber.com; ; ; ; ", "github": "github.com/uber-research/metropolis-hastings-gans", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/turner19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Uber", "aff_unique_dep": "Uber AI Labs", "aff_unique_url": "https://www.uber.com", "aff_unique_abbr": "Uber AI Labs", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Minimal Achievable Sufficient Statistic Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4189", "id": "4189", "author_site": "Milan Cvitkovic, G\u00fcnther Koliander", "author": "Milan Cvitkovic; G\u00fcnther Koliander", "abstract": "We introduce Minimal Achievable Sufficient Statistic (MASS) Learning, a machine learning training objective for which the minima are minimal sufficient statistics with respect to a class of functions being optimized over (e.g., deep networks). In deriving MASS Learning, we also introduce Conserved Differential Information (CDI), an information-theoretic quantity that {\u2014} unlike standard mutual information {\u2014} can be usefully applied to deterministically-dependent continuous random variables like the input and output of a deep network. 
In a series of experiments, we show that deep networks trained with MASS Learning achieve competitive performance on supervised learning, regularization, and uncertainty quantification benchmarks.", "bibtex": "@InProceedings{pmlr-v97-cvitkovic19a,\n title = \t {Minimal Achievable Sufficient Statistic Learning},\n author = {Cvitkovic, Milan and Koliander, G{\\\"u}nther},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1465--1474},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cvitkovic19a/cvitkovic19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cvitkovic19a.html},\n abstract = \t {We introduce Minimal Achievable Sufficient Statistic (MASS) Learning, a machine learning training objective for which the minima are minimal sufficient statistics with respect to a class of functions being optimized over (e.g., deep networks). In deriving MASS Learning, we also introduce Conserved Differential Information (CDI), an information-theoretic quantity that {\u2014} unlike standard mutual information {\u2014} can be usefully applied to deterministically-dependent continuous random variables like the input and output of a deep network. In a series of experiments, we show that deep networks trained with MASS Learning achieve competitive performance on supervised learning, regularization, and uncertainty quantification benchmarks.}\n}", "pdf": "http://proceedings.mlr.press/v97/cvitkovic19a/cvitkovic19a.pdf", "supp": "", "pdf_size": 602871, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16216829176165913924&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computing and Mathematical Sciences, California Institute of Technology, Pasadena, California, USA; Acoustics Research Institute, Austrian Academy of Sciences, Vienna, Austria", "aff_domain": "caltech.edu; ", "email": "caltech.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/cvitkovic19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "California Institute of Technology;Austrian Academy of Sciences", "aff_unique_dep": "Department of Computing and Mathematical Sciences;Acoustics Research Institute", "aff_unique_url": "https://www.caltech.edu;https://www.oeaw.ac.at", "aff_unique_abbr": "Caltech;OEAW", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Pasadena;Vienna", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;Austria" }, { "title": "MixHop: Higher-Order Graph Convolutional Architectures via Sparsified Neighborhood Mixing", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3702", "id": "3702", "author_site": "Sami Abu-El-Haija, Bryan Perozzi, Amol Kapoor, Nazanin Alipourfard, Kristina Lerman, Hrayr Harutyunyan, Greg Ver Steeg, Aram Galstyan", "author": "Sami Abu-El-Haija; Bryan Perozzi; Amol Kapoor; Nazanin Alipourfard; Kristina Lerman; Hrayr Harutyunyan; Greg Ver Steeg; Aram Galstyan", "abstract": "Existing popular methods for semi-supervised learning with Graph Neural Networks (such as the Graph Convolutional Network) provably cannot learn a general class of neighborhood mixing relationships. 
To address this weakness, we propose a new model, MixHop, that can learn these relationships, including difference operators, by repeatedly mixing feature representations of neighbors at various distances. MixHop requires no additional memory or computational complexity, and outperforms on challenging baselines. In addition, we propose sparsity regularization that allows us to visualize how the network prioritizes neighborhood information across different graph datasets. Our analysis of the learned architectures reveals that neighborhood mixing varies per datasets.", "bibtex": "@InProceedings{pmlr-v97-abu-el-haija19a,\n title = \t {{M}ix{H}op: Higher-Order Graph Convolutional Architectures via Sparsified Neighborhood Mixing},\n author = {Abu-El-Haija, Sami and Perozzi, Bryan and Kapoor, Amol and Alipourfard, Nazanin and Lerman, Kristina and Harutyunyan, Hrayr and Steeg, Greg Ver and Galstyan, Aram},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {21--29},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/abu-el-haija19a/abu-el-haija19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/abu-el-haija19a.html},\n abstract = \t {Existing popular methods for semi-supervised learning with Graph Neural Networks (such as the Graph Convolutional Network) provably cannot learn a general class of neighborhood mixing relationships. To address this weakness, we propose a new model, MixHop, that can learn these relationships, including difference operators, by repeatedly mixing feature representations of neighbors at various distances. MixHop requires no additional memory or computational complexity, and outperforms on challenging baselines. In addition, we propose sparsity regularization that allows us to visualize how the network prioritizes neighborhood information across different graph datasets. 
Our analysis of the learned architectures reveals that neighborhood mixing varies per datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/abu-el-haija19a/abu-el-haija19a.pdf", "supp": "", "pdf_size": 364650, "gs_citation": 1185, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8927230189965016671&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": "Information Sciences Institute, University of Southern California; Google AI, New York; Google AI, New York; Information Sciences Institute, University of Southern California; Information Sciences Institute, University of Southern California; Information Sciences Institute, University of Southern California; Information Sciences Institute, University of Southern California; Information Sciences Institute, University of Southern California", "aff_domain": "haija.org;acm.org; ; ; ; ; ; ", "email": "haija.org;acm.org; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/abu-el-haija19a.html", "aff_unique_index": "0;1;1;0;0;0;0;0", "aff_unique_norm": "University of Southern California;Google", "aff_unique_dep": "Information Sciences Institute;Google AI", "aff_unique_url": "https://www.usc.edu;https://ai.google", "aff_unique_abbr": "USC;Google AI", "aff_campus_unique_index": "0;1;1;0;0;0;0;0", "aff_campus_unique": "Los Angeles;New York", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Mixture Models for Diverse Machine Translation: Tricks of the Trade", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4319", "id": "4319", "author_site": "Tianxiao Shen, Myle Ott, Michael Auli, Marc'Aurelio Ranzato", "author": "Tianxiao Shen; Myle Ott; Michael Auli; Marc\u2019Aurelio Ranzato", "abstract": "Mixture models trained via EM are among the simplest, most widely used and well understood latent variable models in the machine learning literature. Surprisingly, these models have been hardly explored in text generation applications such as machine translation. In principle, they provide a latent variable to control generation and produce a diverse set of hypotheses. In practice, however, mixture models are prone to degeneracies\u2014often only one component gets trained or the latent variable is simply ignored. We find that disabling dropout noise in responsibility computation is critical to successful training. In addition, the design choices of parameterization, prior distribution, hard versus soft EM and online versus offline assignment can dramatically affect model performance. We develop an evaluation protocol to assess both quality and diversity of generations against multiple references, and provide an extensive empirical study of several mixture model variants. 
Our analysis shows that certain types of mixture models are more robust and offer the best trade-off between translation quality and diversity compared to variational models and diverse decoding approaches.\\footnote{Code to reproduce the results in this paper is available at \\url{https://github.com/pytorch/fairseq}}", "bibtex": "@InProceedings{pmlr-v97-shen19c,\n title = \t {Mixture Models for Diverse Machine Translation: Tricks of the Trade},\n author = {Shen, Tianxiao and Ott, Myle and Auli, Michael and Ranzato, Marc'Aurelio},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5719--5728},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shen19c/shen19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/shen19c.html},\n abstract = \t {Mixture models trained via EM are among the simplest, most widely used and well understood latent variable models in the machine learning literature. Surprisingly, these models have been hardly explored in text generation applications such as machine translation. In principle, they provide a latent variable to control generation and produce a diverse set of hypotheses. In practice, however, mixture models are prone to degeneracies\u2014often only one component gets trained or the latent variable is simply ignored. We find that disabling dropout noise in responsibility computation is critical to successful training. In addition, the design choices of parameterization, prior distribution, hard versus soft EM and online versus offline assignment can dramatically affect model performance. We develop an evaluation protocol to assess both quality and diversity of generations against multiple references, and provide an extensive empirical study of several mixture model variants. 
Our analysis shows that certain types of mixture models are more robust and offer the best trade-off between translation quality and diversity compared to variational models and diverse decoding approaches.\\footnote{Code to reproduce the results in this paper is available at \\url{https://github.com/pytorch/fairseq}}}\n}", "pdf": "http://proceedings.mlr.press/v97/shen19c/shen19c.pdf", "supp": "", "pdf_size": 1255533, "gs_citation": 153, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10713606322116851955&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "MIT CSAIL; Facebook AI Research; Facebook AI Research; Facebook AI Research", "aff_domain": "csail.mit.edu; ; ; ", "email": "csail.mit.edu; ; ; ", "github": "https://github.com/pytorch/fairseq", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/shen19c.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Meta", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;Facebook AI Research", "aff_unique_url": "https://www.csail.mit.edu;https://research.facebook.com", "aff_unique_abbr": "MIT CSAIL;FAIR", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Model Comparison for Semantic Grouping", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3714", "id": "3714", "author_site": "Francisco Vargas, Kamen Brestnichki, Nils Hammerla", "author": "Francisco Vargas; Kamen Brestnichki; Nils Hammerla", "abstract": "We introduce a probabilistic framework for quantifying the semantic similarity between two groups of embeddings. We formulate the task of semantic similarity as a model comparison task in which we contrast a generative model which jointly models two sentences versus one that does not. We illustrate how this framework can be used for the Semantic Textual Similarity tasks using clear assumptions about how the embeddings of words are generated. We apply model comparison that utilises information criteria to address some of the shortcomings of Bayesian model comparison, whilst still penalising model complexity. We achieve competitive results by applying the proposed framework with an appropriate choice of likelihood on the STS datasets.", "bibtex": "@InProceedings{pmlr-v97-vargas19a,\n title = \t {Model Comparison for Semantic Grouping},\n author = {Vargas, Francisco and Brestnichki, Kamen and Hammerla, Nils},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6410--6417},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/vargas19a/vargas19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/vargas19a.html},\n abstract = \t {We introduce a probabilistic framework for quantifying the semantic similarity between two groups of embeddings. We formulate the task of semantic similarity as a model comparison task in which we contrast a generative model which jointly models two sentences versus one that does not. We illustrate how this framework can be used for the Semantic Textual Similarity tasks using clear assumptions about how the embeddings of words are generated. 
We apply model comparison that utilises information criteria to address some of the shortcomings of Bayesian model comparison, whilst still penalising model complexity. We achieve competitive results by applying the proposed framework with an appropriate choice of likelihood on the STS datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/vargas19a/vargas19a.pdf", "supp": "", "pdf_size": 317887, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18345833118099808380&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Babylon Health; Babylon Health; Babylon Health", "aff_domain": "babylonhealth.com; ; ", "email": "babylonhealth.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/vargas19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Babylon Health", "aff_unique_dep": "", "aff_unique_url": "https://www.babylonhealth.com", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Model Function Based Conditional Gradient Method with Armijo-like Line Search", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4029", "id": "4029", "author_site": "Peter Ochs, Yura Malitsky", "author": "Peter Ochs; Yura Malitsky", "abstract": "The Conditional Gradient Method is generalized to a class of non-smooth non-convex optimization problems with many applications in machine learning. The proposed algorithm iterates by minimizing so-called model functions over the constraint set. Complemented with an Armijo line search procedure, we prove that subsequences converge to a stationary point. The abstract framework of model functions provides great flexibility in the design of concrete algorithms. As special cases, for example, we develop an algorithm for additive composite problems and an algorithm for non-linear composite problems which leads to a Gauss-Newton-type algorithm. Both instances are novel in non-smooth non-convex optimization and come with numerous applications in machine learning. We perform an experiment on a non-linear robust regression problem and discuss the flexibility of the proposed framework in several matrix factorization formulations.", "bibtex": "@InProceedings{pmlr-v97-ochs19a,\n title = \t {Model Function Based Conditional Gradient Method with Armijo-like Line Search},\n author = {Ochs, Peter and Malitsky, Yura},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4891--4900},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ochs19a/ochs19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ochs19a.html},\n abstract = \t {The Conditional Gradient Method is generalized to a class of non-smooth non-convex optimization problems with many applications in machine learning. The proposed algorithm iterates by minimizing so-called model functions over the constraint set. Complemented with an Armijo line search procedure, we prove that subsequences converge to a stationary point. The abstract framework of model functions provides great flexibility in the design of concrete algorithms. 
As special cases, for example, we develop an algorithm for additive composite problems and an algorithm for non-linear composite problems which leads to a Gauss-Newton-type algorithm. Both instances are novel in non-smooth non-convex optimization and come with numerous applications in machine learning. We perform an experiment on a non-linear robust regression problem and discuss the flexibility of the proposed framework in several matrix factorization formulations.}\n}", "pdf": "http://proceedings.mlr.press/v97/ochs19a/ochs19a.pdf", "supp": "", "pdf_size": 357556, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16933270718863846380&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "University of G\u00f6ttingen, G\u00f6ttingen, Germany; Saarland University, Saarbr\u00fccken, Germany", "aff_domain": "math.uni-goettingen.de;math.uni-sb.de", "email": "math.uni-goettingen.de;math.uni-sb.de", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/ochs19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of G\u00f6ttingen;Saarland University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-goettingen.de;https://www.uni-saarland.de", "aff_unique_abbr": "UG;UdS", "aff_campus_unique_index": "0;1", "aff_campus_unique": "G\u00f6ttingen;Saarbr\u00fccken", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Model-Based Active Exploration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4091", "id": "4091", "author_site": "Pranav Shyam, Wojciech Ja\u015bkowski, Faustino Gomez", "author": "Pranav Shyam; Wojciech Ja\u015bkowski; Faustino Gomez", "abstract": "Efficient exploration is an unsolved problem in Reinforcement Learning which is usually addressed by reactively rewarding the agent for fortuitously encountering novel situations. This paper introduces an efficient", "bibtex": "@InProceedings{pmlr-v97-shyam19a,\n title = \t {Model-Based Active Exploration},\n author = {Shyam, Pranav and Ja{\\'{s}}kowski, Wojciech and Gomez, Faustino},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5779--5788},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shyam19a/shyam19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/shyam19a.html},\n abstract = \t {Efficient exploration is an unsolved problem in Reinforcement Learning which is usually addressed by reactively rewarding the agent for fortuitously encountering novel situations. 
This paper introduces an efficient", "pdf": "http://proceedings.mlr.press/v97/shyam19a/shyam19a.pdf", "supp": "", "pdf_size": 3067796, "gs_citation": 242, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4949040749673510686&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "NNAISENSE, Lugano, Switzerland; NNAISENSE, Lugano, Switzerland; NNAISENSE, Lugano, Switzerland", "aff_domain": "nnaisense.com; ; ", "email": "nnaisense.com; ; ", "github": "https://github.com/nnaisense/max", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/shyam19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "NNAISENSE", "aff_unique_dep": "", "aff_unique_url": "", "aff_unique_abbr": "", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Lugano", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Molecular Hypergraph Grammar with Its Application to Molecular Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3670", "id": "3670", "author": "Hiroshi Kajino", "abstract": "Molecular optimization aims to discover novel molecules with desirable properties, and its two fundamental challenges are: (i) it is not trivial to generate valid molecules in a controllable way due to hard chemical constraints such as the valency conditions, and (ii) it is often costly to evaluate a property of a novel molecule, and therefore, the number of property evaluations is limited. These challenges are to some extent alleviated by a combination of a variational autoencoder (VAE) and Bayesian optimization (BO), where VAE converts a molecule into/from its latent continuous vector, and BO optimizes a latent continuous vector (and its corresponding molecule) within a limited number of property evaluations. While the most recent work, for the first time, achieved 100% validity, its architecture is rather complex due to auxiliary neural networks other than VAE, making it difficult to train. This paper presents a molecular hypergraph grammar variational autoencoder (MHG-VAE), which uses a single VAE to achieve 100% validity. Our idea is to develop a graph grammar encoding the hard chemical constraints, called molecular hypergraph grammar (MHG), which guides VAE to always generate valid molecules. We also present an algorithm to construct MHG from a set of molecules.", "bibtex": "@InProceedings{pmlr-v97-kajino19a,\n title = \t {Molecular Hypergraph Grammar with Its Application to Molecular Optimization},\n author = {Kajino, Hiroshi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3183--3191},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kajino19a/kajino19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kajino19a.html},\n abstract = \t {Molecular optimization aims to discover novel molecules with desirable properties, and its two fundamental challenges are: (i) it is not trivial to generate valid molecules in a controllable way due to hard chemical constraints such as the valency conditions, and (ii) it is often costly to evaluate a property of a novel molecule, and therefore, the number of property evaluations is limited. 
These challenges are to some extent alleviated by a combination of a variational autoencoder (VAE) and Bayesian optimization (BO), where VAE converts a molecule into/from its latent continuous vector, and BO optimizes a latent continuous vector (and its corresponding molecule) within a limited number of property evaluations. While the most recent work, for the first time, achieved 100% validity, its architecture is rather complex due to auxiliary neural networks other than VAE, making it difficult to train. This paper presents a molecular hypergraph grammar variational autoencoder (MHG-VAE), which uses a single VAE to achieve 100% validity. Our idea is to develop a graph grammar encoding the hard chemical constraints, called molecular hypergraph grammar (MHG), which guides VAE to always generate valid molecules. We also present an algorithm to construct MHG from a set of molecules.}\n}", "pdf": "http://proceedings.mlr.press/v97/kajino19a/kajino19a.pdf", "supp": "", "pdf_size": 449028, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5042267657829386252&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "", "aff_domain": "", "email": "", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/kajino19a.html" }, { "title": "Moment-Based Variational Inference for Markov Jump Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4135", "id": "4135", "author_site": "Christian Wildner, Heinz Koeppl", "author": "Christian Wildner; Heinz Koeppl", "abstract": "We propose moment-based variational inference as a flexible framework for approximate smoothing of latent Markov jump processes. The main ingredient of our approach is to partition the set of all transitions of the latent process into classes. This allows to express the Kullback-Leibler divergence from the approximate to the posterior process in terms of a set of moment functions that arise naturally from the chosen partition. To illustrate possible choices of the partition, we consider special classes of jump processes that frequently occur in applications. We then extend the results to latent parameter inference and demonstrate the method on several examples.", "bibtex": "@InProceedings{pmlr-v97-wildner19a,\n title = \t {Moment-Based Variational Inference for {M}arkov Jump Processes},\n author = {Wildner, Christian and Koeppl, Heinz},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6766--6775},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wildner19a/wildner19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/wildner19a.html},\n abstract = \t {We propose moment-based variational inference as a flexible framework for approximate smoothing of latent Markov jump processes. The main ingredient of our approach is to partition the set of all transitions of the latent process into classes. This allows to express the Kullback-Leibler divergence from the approximate to the posterior process in terms of a set of moment functions that arise naturally from the chosen partition. To illustrate possible choices of the partition, we consider special classes of jump processes that frequently occur in applications. 
We then extend the results to latent parameter inference and demonstrate the method on several examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/wildner19a/wildner19a.pdf", "supp": "", "pdf_size": 1736494, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4410930447046330066&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Electrical Engineering and Information Technology, Technische Universit\u00e4t Darmstadt, Germany; Department of Electrical Engineering and Information Technology, Technische Universit\u00e4t Darmstadt, Germany", "aff_domain": "bcs.tu-darmstadt.de;bcs.tu-darmstadt.de", "email": "bcs.tu-darmstadt.de;bcs.tu-darmstadt.de", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/wildner19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "Department of Electrical Engineering and Information Technology", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TUD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "Monge blunts Bayes: Hardness Results for Adversarial Training", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3589", "id": "3589", "author_site": "Zac Cranko, Aditya Menon, Richard Nock, Cheng Soon Ong, Zhan Shi, Christian Walder", "author": "Zac Cranko; Aditya Menon; Richard Nock; Cheng Soon Ong; Zhan Shi; Christian Walder", "abstract": "The last few years have seen a staggering number of empirical studies of the robustness of neural networks in a model of adversarial perturbations of their inputs. Most rely on an adversary which carries out local modifications within prescribed balls. None however has so far questioned the broader picture: how to frame a", "bibtex": "@InProceedings{pmlr-v97-cranko19a,\n title = \t {Monge blunts Bayes: Hardness Results for Adversarial Training},\n author = {Cranko, Zac and Menon, Aditya and Nock, Richard and Ong, Cheng Soon and Shi, Zhan and Walder, Christian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1406--1415},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cranko19a/cranko19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cranko19a.html},\n abstract = \t {The last few years have seen a staggering number of empirical studies of the robustness of neural networks in a model of adversarial perturbations of their inputs. Most rely on an adversary which carries out local modifications within prescribed balls. 
None however has so far questioned the broader picture: how to frame a", "pdf": "http://proceedings.mlr.press/v97/cranko19a/cranko19a.pdf", "supp": "", "pdf_size": 2100064, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7844607549534889&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14, "aff": "The Australian National University (Australia)+Data61 (Australia)+The University of Sydney (Australia); Google Research (USA); Data61 (Australia)+The Australian National University (Australia)+The University of Sydney (Australia); Data61 (Australia)+The Australian National University (Australia); University of Illinois at Chicago (USA); Data61 (Australia)+The Australian National University (Australia)", "aff_domain": "{data61.csiro.au,anu.edu.au};google.com;{data61.csiro.au,anu.edu.au};{data61.csiro.au,anu.edu.au};uic.edu;{data61.csiro.au,anu.edu.au}", "email": "{data61.csiro.au,anu.edu.au};google.com;{data61.csiro.au,anu.edu.au};{data61.csiro.au,anu.edu.au};uic.edu;{data61.csiro.au,anu.edu.au}", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/cranko19a.html", "aff_unique_index": "0+1+2;3;1+0+2;1+0;4;1+0", "aff_unique_norm": "Australian National University;Data61;University of Sydney;Google;University of Illinois at Chicago", "aff_unique_dep": ";;;Google Research;", "aff_unique_url": "https://www.anu.edu.au;https://www.data61.csiro.au;https://www.sydney.edu.au;https://research.google;https://www.uic.edu", "aff_unique_abbr": "ANU;Data61;USYD;Google Research;UIC", "aff_campus_unique_index": ";1;;;2;", "aff_campus_unique": ";Mountain View;Chicago", "aff_country_unique_index": "0+0+0;1;0+0+0;0+0;1;0+0", "aff_country_unique": "Australia;United States" }, { "title": "More Efficient Off-Policy Evaluation through Regularized Targeted Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4251", "id": "4251", "author_site": "Aurelien Bibaut, Ivana Malenica, Nikos Vlassis, Mark van der Laan", "author": "Aurelien Bibaut; Ivana Malenica; Nikos Vlassis; Mark Van Der Laan", "abstract": "We study the problem of off-policy evaluation (OPE) in Reinforcement Learning (RL), where the aim is to estimate the performance of a new policy given historical data that may have been generated by a different policy, or policies. In particular, we introduce a novel doubly-robust estimator for the OPE problem in RL, based on the Targeted Maximum Likelihood Estimation principle from the statistical causal inference literature. We also introduce several variance reduction techniques that lead to impressive performance gains in off-policy evaluation. We show empirically that our estimator uniformly wins over existing off-policy evaluation methods across multiple RL environments and various levels of model misspecification. 
Finally, we further the existing theoretical analysis of estimators for the RL off-policy estimation problem by showing their $O_P(1/\\sqrt{n})$ rate of convergence and characterizing their asymptotic distribution.", "bibtex": "@InProceedings{pmlr-v97-bibaut19a,\n title = \t {More Efficient Off-Policy Evaluation through Regularized Targeted Learning},\n author = {Bibaut, Aurelien and Malenica, Ivana and Vlassis, Nikos and Van Der Laan, Mark},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {654--663},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bibaut19a/bibaut19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bibaut19a.html},\n abstract = \t {We study the problem of off-policy evaluation (OPE) in Reinforcement Learning (RL), where the aim is to estimate the performance of a new policy given historical data that may have been generated by a different policy, or policies. In particular, we introduce a novel doubly-robust estimator for the OPE problem in RL, based on the Targeted Maximum Likelihood Estimation principle from the statistical causal inference literature. We also introduce several variance reduction techniques that lead to impressive performance gains in off-policy evaluation. We show empirically that our estimator uniformly wins over existing off-policy evaluation methods across multiple RL environments and various levels of model misspecification. Finally, we further the existing theoretical analysis of estimators for the RL off-policy estimation problem by showing their $O_P(1/\\sqrt{n})$ rate of convergence and characterizing their asymptotic distribution.}\n}", "pdf": "http://proceedings.mlr.press/v97/bibaut19a/bibaut19a.pdf", "supp": "", "pdf_size": 895591, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14049048585206676376&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "University of California, Berkeley; University of California, Berkeley; Net\ufb02ix, Los Gatos; University of California, Berkeley", "aff_domain": "berkeley.edu; ; ; ", "email": "berkeley.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/bibaut19a.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of California, Berkeley;Netflix", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.netflix.com", "aff_unique_abbr": "UC Berkeley;Netflix", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Berkeley;Los Gatos", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Multi-Agent Adversarial Inverse Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3622", "id": "3622", "author_site": "Lantao Yu, Jiaming Song, Stefano Ermon", "author": "Lantao Yu; Jiaming Song; Stefano Ermon", "abstract": "Reinforcement learning agents are prone to undesired behaviors due to reward mis-specification. Finding a set of reward functions to properly guide agent behaviors is particularly challenging in multi-agent scenarios. Inverse reinforcement learning provides a framework to automatically acquire suitable reward functions from expert demonstrations. 
Its extension to multi-agent settings, however, is difficult due to the more complex notions of rational behaviors. In this paper, we propose MA-AIRL, a new framework for multi-agent inverse reinforcement learning, which is effective and scalable for Markov games with high-dimensional state-action space and unknown dynamics. We derive our algorithm based on a new solution concept and maximum pseudolikelihood estimation within an adversarial reward learning framework. In the experiments, we demonstrate that MA-AIRL can recover reward functions that are highly correlated with the ground truth rewards, while significantly outperforms prior methods in terms of policy imitation.", "bibtex": "@InProceedings{pmlr-v97-yu19e,\n title = \t {Multi-Agent Adversarial Inverse Reinforcement Learning},\n author = {Yu, Lantao and Song, Jiaming and Ermon, Stefano},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7194--7201},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yu19e/yu19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/yu19e.html},\n abstract = \t {Reinforcement learning agents are prone to undesired behaviors due to reward mis-specification. Finding a set of reward functions to properly guide agent behaviors is particularly challenging in multi-agent scenarios. Inverse reinforcement learning provides a framework to automatically acquire suitable reward functions from expert demonstrations. Its extension to multi-agent settings, however, is difficult due to the more complex notions of rational behaviors. In this paper, we propose MA-AIRL, a new framework for multi-agent inverse reinforcement learning, which is effective and scalable for Markov games with high-dimensional state-action space and unknown dynamics. We derive our algorithm based on a new solution concept and maximum pseudolikelihood estimation within an adversarial reward learning framework. 
In the experiments, we demonstrate that MA-AIRL can recover reward functions that are highly correlated with the ground truth rewards, while significantly outperforms prior methods in terms of policy imitation.}\n}", "pdf": "http://proceedings.mlr.press/v97/yu19e/yu19e.pdf", "supp": "", "pdf_size": 536382, "gs_citation": 188, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13913946030309510400&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University", "aff_domain": "cs.stanford.edu; ;cs.stanford.edu", "email": "cs.stanford.edu; ;cs.stanford.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/yu19e.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Multi-Frequency Phase Synchronization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3893", "id": "3893", "author_site": "Tingran Gao, Zhizhen Zhao", "author": "Tingran Gao; Zhizhen Zhao", "abstract": "We propose a novel formulation for phase synchronization\u2014the statistical problem of jointly estimating alignment angles from noisy pairwise comparisons\u2014as a nonconvex optimization problem that enforces consistency among the pairwise comparisons in multiple frequency channels. Inspired by harmonic retrieval in signal processing, we develop a simple yet efficient two-stage algorithm that leverages the multi-frequency information. We demonstrate in theory and practice that the proposed algorithm significantly outperforms state-of-the-art phase synchronization algorithms, at a mild computational costs incurred by using the extra frequency channels. We also extend our algorithmic framework to general synchronization problems over compact Lie groups.", "bibtex": "@InProceedings{pmlr-v97-gao19f,\n title = \t {Multi-Frequency Phase Synchronization},\n author = {Gao, Tingran and Zhao, Zhizhen},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2132--2141},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gao19f/gao19f.pdf},\n url = \t {https://proceedings.mlr.press/v97/gao19f.html},\n abstract = \t {We propose a novel formulation for phase synchronization\u2014the statistical problem of jointly estimating alignment angles from noisy pairwise comparisons\u2014as a nonconvex optimization problem that enforces consistency among the pairwise comparisons in multiple frequency channels. Inspired by harmonic retrieval in signal processing, we develop a simple yet efficient two-stage algorithm that leverages the multi-frequency information. We demonstrate in theory and practice that the proposed algorithm significantly outperforms state-of-the-art phase synchronization algorithms, at a mild computational costs incurred by using the extra frequency channels. 
We also extend our algorithmic framework to general synchronization problems over compact Lie groups.}\n}", "pdf": "http://proceedings.mlr.press/v97/gao19f/gao19f.pdf", "supp": "", "pdf_size": 814814, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4057979684308066769&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Committee on Computational and Applied Mathematics, Department of Statistics, University of Chicago, Chicago IL, USA+Department of Electrical and Computer Engineering, Coordinated Science Laboratory, University of Illinois at Urbana-Champaign, Urbana IL, USA; Department of Electrical and Computer Engineering, Coordinated Science Laboratory, University of Illinois at Urbana-Champaign, Urbana IL, USA", "aff_domain": "galton.uchicago.edu;illinois.edu", "email": "galton.uchicago.edu;illinois.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/gao19f.html", "aff_unique_index": "0+1;1", "aff_unique_norm": "University of Chicago;University of Illinois Urbana-Champaign", "aff_unique_dep": "Department of Statistics;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.uchicago.edu;https://illinois.edu", "aff_unique_abbr": "UChicago;UIUC", "aff_campus_unique_index": "0+1;1", "aff_campus_unique": "Chicago;Urbana", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United States" }, { "title": "Multi-Frequency Vector Diffusion Maps", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3945", "id": "3945", "author_site": "Yifeng Fan, Zhizhen Zhao", "author": "Yifeng Fan; Zhizhen Zhao", "abstract": "We introduce multi-frequency vector diffusion maps (MFVDM), a new framework for organizing and analyzing high dimensional data sets. The new method is a mathematical and algorithmic generalization of vector diffusion maps (VDM) and other non-linear dimensionality reduction methods. The idea of MFVDM is to incorporates multiple unitary irreducible representations of the alignment group which introduces robustness to noise. We illustrate the efficacy of MFVDM on synthetic and cryo-EM image datasets, achieving better nearest neighbors search and alignment estimation than other baselines as VDM and diffusion maps (DM), especially on extremely noisy data.", "bibtex": "@InProceedings{pmlr-v97-fan19a,\n title = \t {Multi-Frequency Vector Diffusion Maps},\n author = {Fan, Yifeng and Zhao, Zhizhen},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1843--1852},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/fan19a/fan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/fan19a.html},\n abstract = \t {We introduce multi-frequency vector diffusion maps (MFVDM), a new framework for organizing and analyzing high dimensional data sets. The new method is a mathematical and algorithmic generalization of vector diffusion maps (VDM) and other non-linear dimensionality reduction methods. The idea of MFVDM is to incorporates multiple unitary irreducible representations of the alignment group which introduces robustness to noise. 
We illustrate the efficacy of MFVDM on synthetic and cryo-EM image datasets, achieving better nearest neighbors search and alignment estimation than other baselines as VDM and diffusion maps (DM), especially on extremely noisy data.}\n}", "pdf": "http://proceedings.mlr.press/v97/fan19a/fan19a.pdf", "supp": "", "pdf_size": 7371721, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9993659227107670106&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, Coordinated Science Laboratory, University of Illinois at Urbana-Champaign, Illinois, USA; Department of Electrical and Computer Engineering, Coordinated Science Laboratory, University of Illinois at Urbana-Champaign, Illinois, USA", "aff_domain": "illinois.edu; ", "email": "illinois.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/fan19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Multi-Object Representation Learning with Iterative Variational Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4327", "id": "4327", "author_site": "Klaus Greff, Raphael Lopez Kaufman, Rishabh Kabra, Nicholas Watters, Christopher Burgess, Daniel Zoran, Loic Matthey, Matthew Botvinick, Alexander Lerchner", "author": "Klaus Greff; Rapha\u00ebl Lopez Kaufman; Rishabh Kabra; Nick Watters; Christopher Burgess; Daniel Zoran; Loic Matthey; Matthew Botvinick; Alexander Lerchner", "abstract": "Human perception is structured around objects which form the basis for our higher-level cognition and impressive systematic generalization abilities. Yet most work on representation learning focuses on feature learning without even considering multiple objects, or treats segmentation as an (often supervised) preprocessing step. Instead, we argue for the importance of learning to segment and represent objects jointly. We demonstrate that, starting from the simple assumption that a scene is composed of multiple entities, it is possible to learn to segment images into interpretable objects with disentangled representations. Our method learns \u2013 without supervision \u2013 to inpaint occluded parts, and extrapolates to scenes with more objects and to unseen objects with novel feature combinations. 
We also show that, due to the use of iterative variational inference, our system is able to learn multi-modal posteriors for ambiguous inputs and extends naturally to sequences.", "bibtex": "@InProceedings{pmlr-v97-greff19a,\n title = \t {Multi-Object Representation Learning with Iterative Variational Inference},\n author = {Greff, Klaus and Kaufman, Rapha{\\\"e}l Lopez and Kabra, Rishabh and Watters, Nick and Burgess, Christopher and Zoran, Daniel and Matthey, Loic and Botvinick, Matthew and Lerchner, Alexander},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2424--2433},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/greff19a/greff19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/greff19a.html},\n abstract = \t {Human perception is structured around objects which form the basis for our higher-level cognition and impressive systematic generalization abilities. Yet most work on representation learning focuses on feature learning without even considering multiple objects, or treats segmentation as an (often supervised) preprocessing step. Instead, we argue for the importance of learning to segment and represent objects jointly. We demonstrate that, starting from the simple assumption that a scene is composed of multiple entities, it is possible to learn to segment images into interpretable objects with disentangled representations. Our method learns \u2013 without supervision \u2013 to inpaint occluded parts, and extrapolates to scenes with more objects and to unseen objects with novel feature combinations. 
We also show that, due to the use of iterative variational inference, our system is able to learn multi-modal posteriors for ambiguous inputs and extends naturally to sequences.}\n}", "pdf": "http://proceedings.mlr.press/v97/greff19a/greff19a.pdf", "supp": "", "pdf_size": 4939548, "gs_citation": 565, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=213712144958725221&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "The Swiss AI lab IDSIA, Lugano, Switzerland+DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK; DeepMind, London, UK", "aff_domain": "startmail.com; ; ; ; ; ; ; ; ", "email": "startmail.com; ; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 9, "oa": "https://proceedings.mlr.press/v97/greff19a.html", "aff_unique_index": "0+1;1;1;1;1;1;1;1;1", "aff_unique_norm": "IDSIA;DeepMind", "aff_unique_dep": "Swiss AI lab;", "aff_unique_url": "https://www.idsia.ch/;https://deepmind.com", "aff_unique_abbr": "IDSIA;DeepMind", "aff_campus_unique_index": "0+1;1;1;1;1;1;1;1;1", "aff_campus_unique": "Lugano;London", "aff_country_unique_index": "0+1;1;1;1;1;1;1;1;1", "aff_country_unique": "Switzerland;United Kingdom" }, { "title": "Multi-objective training of Generative Adversarial Networks with multiple discriminators", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4114", "id": "4114", "author_site": "Isabela Albuquerque, Joao Monteiro, Thang Doan, Breandan Considine, Tiago Falk, Ioannis Mitliagkas", "author": "Isabela Albuquerque; Joao Monteiro; Thang Doan; Breandan Considine; Tiago Falk; Ioannis Mitliagkas", "abstract": "Recent literature has demonstrated promising results for training Generative Adversarial Networks by employing a set of discriminators, in contrast to the traditional game involving one generator against a single adversary. Such methods perform single-objective optimization on some simple consolidation of the losses, e.g. an arithmetic average. In this work, we revisit the multiple-discriminator setting by framing the simultaneous minimization of losses provided by different models as a multi-objective optimization problem. Specifically, we evaluate the performance of multiple gradient descent and the hypervolume maximization algorithm on a number of different datasets. Moreover, we argue that the previously proposed methods and hypervolume maximization can all be seen as variations of multiple gradient descent in which the update direction can be computed efficiently. 
Our results indicate that hypervolume maximization presents a better compromise between sample quality and computational cost than previous methods.", "bibtex": "@InProceedings{pmlr-v97-albuquerque19a,\n title = \t {Multi-objective training of Generative Adversarial Networks with multiple discriminators},\n author = {Albuquerque, Isabela and Monteiro, Joao and Doan, Thang and Considine, Breandan and Falk, Tiago and Mitliagkas, Ioannis},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {202--211},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/albuquerque19a/albuquerque19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/albuquerque19a.html},\n abstract = \t {Recent literature has demonstrated promising results for training Generative Adversarial Networks by employing a set of discriminators, in contrast to the traditional game involving one generator against a single adversary. Such methods perform single-objective optimization on some simple consolidation of the losses, e.g. an arithmetic average. In this work, we revisit the multiple-discriminator setting by framing the simultaneous minimization of losses provided by different models as a multi-objective optimization problem. Specifically, we evaluate the performance of multiple gradient descent and the hypervolume maximization algorithm on a number of different datasets. Moreover, we argue that the previously proposed methods and hypervolume maximization can all be seen as variations of multiple gradient descent in which the update direction can be computed efficiently. 
Our results indicate that hypervolume maximization presents a better compromise between sample quality and computational cost than previous methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/albuquerque19a/albuquerque19a.pdf", "supp": "", "pdf_size": 581478, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15572645706146610128&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "INRS-EMT, Universit \u00b4e du Qu \u00b4ebec, Montreal, Canada; INRS-EMT, Universit \u00b4e du Qu \u00b4ebec, Montreal, Canada; Desautels Faculty of Management, McGill University, Montreal, Canada; Quebec Arti\ufb01cial Intelligence Institute, Universit \u00b4e de Montr \u00b4eal, Montreal, Canada; INRS-EMT, Universit \u00b4e du Qu \u00b4ebec, Montreal, Canada; Quebec Arti\ufb01cial Intelligence Institute, Universit \u00b4e de Montr \u00b4eal, Montreal, Canada", "aff_domain": "emt.inrs.ca;emt.inrs.ca; ; ;emt.inrs.ca; ", "email": "emt.inrs.ca;emt.inrs.ca; ; ;emt.inrs.ca; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/albuquerque19a.html", "aff_unique_index": "0;0;1;2;0;2", "aff_unique_norm": "Universit\u00e9 du Qu\u00e9bec;McGill University;Universit \u00b4e de Montr \u00b4eal", "aff_unique_dep": "INRS-EMT;Desautels Faculty of Management;Quebec Arti\ufb01cial Intelligence Institute", "aff_unique_url": "https://www.inrs.ca;https://www.mcgill.ca;https://www.usherbrooke.ca", "aff_unique_abbr": "INRS;McGill;UM", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Montreal", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Multiplicative Weights Updates as a distributed constrained optimization algorithm: Convergence to second-order stationary points almost always", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3976", "id": "3976", "author_site": "Ioannis Panageas, Georgios Piliouras, xiao wang", "author": "Ioannis Panageas; Georgios Piliouras; Xiao Wang", "abstract": "Non-concave maximization has been the subject of much recent study in the optimization and machine learning communities, specifically in deep learning. Recent papers ([Ge et al. 2015, Lee et al 2017] and references therein) indicate that first order methods work well and avoid saddles points. Results as in [Lee \\etal 2017], however, are limited to the", "bibtex": "@InProceedings{pmlr-v97-panageas19a,\n title = \t {Multiplicative Weights Updates as a distributed constrained optimization algorithm: Convergence to second-order stationary points almost always},\n author = {Panageas, Ioannis and Piliouras, Georgios and Wang, Xiao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4961--4969},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/panageas19a/panageas19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/panageas19a.html},\n abstract = \t {Non-concave maximization has been the subject of much recent study in the optimization and machine learning communities, specifically in deep learning. Recent papers ([Ge et al. 2015, Lee et al 2017] and references therein) indicate that first order methods work well and avoid saddles points. 
Results as in [Lee \\etal 2017], however, are limited to the", "pdf": "http://proceedings.mlr.press/v97/panageas19a/panageas19a.pdf", "supp": "", "pdf_size": 546448, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1914049408922695238&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Singapore University of Technology and Design; Singapore University of Technology and Design; Singapore University of Technology and Design", "aff_domain": "sutd.edu.sg;sutd.edu.sg;sutd.edu.sg", "email": "sutd.edu.sg;sutd.edu.sg;sutd.edu.sg", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/panageas19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Singapore University of Technology and Design", "aff_unique_dep": "", "aff_unique_url": "https://www.sutd.edu.sg", "aff_unique_abbr": "SUTD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Singapore" }, { "title": "Multivariate Submodular Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4248", "id": "4248", "author_site": "Richard Santiago, F. Bruce Shepherd", "author": "Richard Santiago; F. Bruce Shepherd", "abstract": "Submodular functions have found a wealth of new applications in data science and machine learning models in recent years. This has been coupled with many algorithmic advances in the area of submodular optimization: (SO) $\\min/\\max\u00a0f(S): S \\in \\mathcal{F}$, where $\\mathcal{F}$ is a given family of feasible sets over a ground set $V$ and $f:2^V \\rightarrow \\mathbb{R}$ is submodular. In this work we focus on a more general class of", "bibtex": "@InProceedings{pmlr-v97-santiago19a,\n title = \t {Multivariate Submodular Optimization},\n author = {Santiago, Richard and Shepherd, F. Bruce},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5599--5609},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/santiago19a/santiago19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/santiago19a.html},\n abstract = \t {Submodular functions have found a wealth of new applications in data science and machine learning models in recent years. This has been coupled with many algorithmic advances in the area of submodular optimization: (SO) $\\min/\\max\u00a0f(S): S \\in \\mathcal{F}$, where $\\mathcal{F}$ is a given family of feasible sets over a ground set $V$ and $f:2^V \\rightarrow \\mathbb{R}$ is submodular. 
In this work we focus on a more general class of", "pdf": "http://proceedings.mlr.press/v97/santiago19a/santiago19a.pdf", "supp": "", "pdf_size": 362832, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15050563892523548825&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Computer Science, McGill University, Montreal, Canada; Department of Computer Science, University of British Columbia, Vancouver, Canada", "aff_domain": "mail.mcgill.ca; ", "email": "mail.mcgill.ca; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/santiago19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "McGill University;University of British Columbia", "aff_unique_dep": "School of Computer Science;Department of Computer Science", "aff_unique_url": "https://www.mcgill.ca;https://www.ubc.ca", "aff_unique_abbr": "McGill;UBC", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Montreal;Vancouver", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "title": "Multivariate-Information Adversarial Ensemble for Scalable Joint Distribution Matching", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3858", "id": "3858", "author_site": "Ziliang Chen, ZHANFU YANG, Xiaoxi Wang, Xiaodan Liang, xiaopeng yan, Guanbin Li, Liang Lin", "author": "Ziliang Chen; Zhanfu Yang; Xiaoxi Wang; Xiaodan Liang; Xiaopeng Yan; Guanbin Li; Liang Lin", "abstract": "A broad range of cross-$m$-domain generation researches boil down to matching a joint distribution by deep generative models (DGMs). Hitherto algorithms excel in pairwise domains while as $m$ increases, remain struggling to scale themselves to \ufb01t a joint distribution. In this paper, we propose a domain-scalable DGM, i.e., MMI-ALI for $m$-domain joint distribution matching. As an $m$-domain ensemble model of ALIs (Dumoulin et al., 2016), MMI-ALI is adversarially trained with maximizing Multivariate Mutual Information (MMI) w.r.t. joint variables of each pair of domains and their shared feature. The negative MMIs are upper bounded by a series of feasible losses provably leading to matching $m$-domain joint distributions. MMI-ALI linearly scales as $m$ increases and thus, strikes a right balance between ef\ufb01cacy and scalability. We evaluate MMI-ALI in diverse challenging $m$-domain scenarios and verify its superiority.", "bibtex": "@InProceedings{pmlr-v97-chen19l,\n title = \t {Multivariate-Information Adversarial Ensemble for Scalable Joint Distribution Matching},\n author = {Chen, Ziliang and Yang, Zhanfu and Wang, Xiaoxi and Liang, Xiaodan and Yan, Xiaopeng and Li, Guanbin and Lin, Liang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1112--1121},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19l/chen19l.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19l.html},\n abstract = \t {A broad range of cross-$m$-domain generation researches boil down to matching a joint distribution by deep generative models (DGMs). Hitherto algorithms excel in pairwise domains while as $m$ increases, remain struggling to scale themselves to \ufb01t a joint distribution. In this paper, we propose a domain-scalable DGM, i.e., MMI-ALI for $m$-domain joint distribution matching. 
As an $m$-domain ensemble model of ALIs (Dumoulin et al., 2016), MMI-ALI is adversarially trained with maximizing Multivariate Mutual Information (MMI) w.r.t. joint variables of each pair of domains and their shared feature. The negative MMIs are upper bounded by a series of feasible losses provably leading to matching $m$-domain joint distributions. MMI-ALI linearly scales as $m$ increases and thus, strikes a right balance between ef\ufb01cacy and scalability. We evaluate MMI-ALI in diverse challenging $m$-domain scenarios and verify its superiority.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19l/chen19l.pdf", "supp": "", "pdf_size": 4741235, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2407036909986043494&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": "Sun Yat-sen University, China+Purdue University, USA; Purdue University, USA; Sun Yat-sen University, China; Sun Yat-sen University, China; Sun Yat-sen University, China; Sun Yat-sen University, China; Sun Yat-sen University, China", "aff_domain": "ieee.org;purdue.edu;ieee.org;ieee.org;ieee.org;ieee.org;ieee.org", "email": "ieee.org;purdue.edu;ieee.org;ieee.org;ieee.org;ieee.org;ieee.org", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/chen19l.html", "aff_unique_index": "0+1;1;0;0;0;0;0", "aff_unique_norm": "Sun Yat-sen University;Purdue University", "aff_unique_dep": ";", "aff_unique_url": "http://www.sysu.edu.cn;https://www.purdue.edu", "aff_unique_abbr": "SYSU;Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+1;1;0;0;0;0;0", "aff_country_unique": "China;United States" }, { "title": "Myopic Posterior Sampling for Adaptive Goal Oriented Design of Experiments", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3616", "id": "3616", "author_site": "Kirthevasan Kandasamy, Willie Neiswanger, Reed Zhang, Akshay Krishnamurthy, Jeff Schneider, Barnab\u00e1s P\u00f3czos", "author": "Kirthevasan Kandasamy; Willie Neiswanger; Reed Zhang; Akshay Krishnamurthy; Jeff Schneider; Barnabas Poczos", "abstract": "Bayesian methods for adaptive decision-making, such as Bayesian optimisation, active learning, and active search have seen great success in relevant applications. However, real world data collection tasks are more broad and complex, as we may need to achieve a combination of the above goals and/or application specific goals. In such scenarios, specialised methods have limited applicability. In this work, we design a new myopic strategy for a wide class of adaptive design of experiment (DOE) problems, where we wish to collect data in order to fulfil a given goal. Our approach, Myopic Posterior Sampling (MPS), which is inspired by the classical posterior sampling algorithm for multi-armed bandits, enables us to address a broad suite of DOE tasks where a practitioner may incorporate domain expertise about the system and specify her desired goal via a reward function. Empirically, this general-purpose strategy is competitive with more specialised methods in a wide array of synthetic and real world DOE tasks. More importantly, it enables addressing complex DOE goals where no existing method seems applicable. 
On the theoretical side, we leverage ideas from adaptive submodularity and reinforcement learning to derive conditions under which MPS achieves sublinear regret against natural benchmark policies.", "bibtex": "@InProceedings{pmlr-v97-kandasamy19a,\n title = \t {Myopic Posterior Sampling for Adaptive Goal Oriented Design of Experiments},\n author = {Kandasamy, Kirthevasan and Neiswanger, Willie and Zhang, Reed and Krishnamurthy, Akshay and Schneider, Jeff and Poczos, Barnabas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3222--3232},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kandasamy19a/kandasamy19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kandasamy19a.html},\n abstract = \t {Bayesian methods for adaptive decision-making, such as Bayesian optimisation, active learning, and active search have seen great success in relevant applications. However, real world data collection tasks are more broad and complex, as we may need to achieve a combination of the above goals and/or application specific goals. In such scenarios, specialised methods have limited applicability. In this work, we design a new myopic strategy for a wide class of adaptive design of experiment (DOE) problems, where we wish to collect data in order to fulfil a given goal. Our approach, Myopic Posterior Sampling (MPS), which is inspired by the classical posterior sampling algorithm for multi-armed bandits, enables us to address a broad suite of DOE tasks where a practitioner may incorporate domain expertise about the system and specify her desired goal via a reward function. Empirically, this general-purpose strategy is competitive with more specialised methods in a wide array of synthetic and real world DOE tasks. More importantly, it enables addressing complex DOE goals where no existing method seems applicable. 
On the theoretical side, we leverage ideas from adaptive submodularity and reinforcement learning to derive conditions under which MPS achieves sublinear regret against natural benchmark policies.}\n}", "pdf": "http://proceedings.mlr.press/v97/kandasamy19a/kandasamy19a.pdf", "supp": "", "pdf_size": 643115, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12693371106027835496&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Carnegie Mellon University; Carnegie Mellon University; Carnegie Mellon University; Microsoft Research; Carnegie Mellon University; Carnegie Mellon University", "aff_domain": "cs.cmu.edu; ; ; ; ; ", "email": "cs.cmu.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/kandasamy19a.html", "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Carnegie Mellon University;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.cmu.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "CMU;MSR", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "NAS-Bench-101: Towards Reproducible Neural Architecture Search", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4009", "id": "4009", "author_site": "Chris Ying, Aaron Klein, Eric Christiansen, Esteban Real, Kevin Murphy, Frank Hutter", "author": "Chris Ying; Aaron Klein; Eric Christiansen; Esteban Real; Kevin Murphy; Frank Hutter", "abstract": "Recent advances in neural architecture search (NAS) demand tremendous computational resources, which makes it difficult to reproduce experiments and imposes a barrier-to-entry to researchers without access to large-scale computation. We aim to ameliorate these problems by introducing NAS-Bench-101, the first public architecture dataset for NAS research. To build NAS-Bench-101, we carefully constructed a compact, yet expressive, search space, exploiting graph isomorphisms to identify 423k unique convolutional architectures. We trained and evaluated all of these architectures multiple times on CIFAR-10 and compiled the results into a large dataset of over 5 million trained models. This allows researchers to evaluate the quality of a diverse range of models in milliseconds by querying the pre-computed dataset. We demonstrate its utility by analyzing the dataset as a whole and by benchmarking a range of architecture optimization algorithms.", "bibtex": "@InProceedings{pmlr-v97-ying19a,\n title = \t {{NAS}-Bench-101: Towards Reproducible Neural Architecture Search},\n author = {Ying, Chris and Klein, Aaron and Christiansen, Eric and Real, Esteban and Murphy, Kevin and Hutter, Frank},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7105--7114},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ying19a/ying19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ying19a.html},\n abstract = \t {Recent advances in neural architecture search (NAS) demand tremendous computational resources, which makes it difficult to reproduce experiments and imposes a barrier-to-entry to researchers without access to large-scale computation. 
We aim to ameliorate these problems by introducing NAS-Bench-101, the first public architecture dataset for NAS research. To build NAS-Bench-101, we carefully constructed a compact, yet expressive, search space, exploiting graph isomorphisms to identify 423k unique convolutional architectures. We trained and evaluated all of these architectures multiple times on CIFAR-10 and compiled the results into a large dataset of over 5 million trained models. This allows researchers to evaluate the quality of a diverse range of models in milliseconds by querying the pre-computed dataset. We demonstrate its utility by analyzing the dataset as a whole and by benchmarking a range of architecture optimization algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/ying19a/ying19a.pdf", "supp": "", "pdf_size": 960525, "gs_citation": 905, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15320152819361688152&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Google Brain, Mountain View, California, USA+Department of Computer Science, University of Freiburg, Germany; Department of Computer Science, University of Freiburg, Germany; Google Brain, Mountain View, California, USA; Google Brain, Mountain View, California, USA; Google Brain, Mountain View, California, USA; Department of Computer Science, University of Freiburg, Germany", "aff_domain": "chrisying.net;cs.uni-freiburg.de;google.com; ; ; ", "email": "chrisying.net;cs.uni-freiburg.de;google.com; ; ; ", "github": "https://github.com/google-research/nasbench", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/ying19a.html", "aff_unique_index": "0+1;1;0;0;0;1", "aff_unique_norm": "Google;University of Freiburg", "aff_unique_dep": "Google Brain;Department of Computer Science", "aff_unique_url": "https://brain.google.com;https://www.uni-freiburg.de", "aff_unique_abbr": "Google Brain;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0+1;1;0;0;0;1", "aff_country_unique": "United States;Germany" }, { "title": "NATTACK: Learning the Distributions of Adversarial Examples for an Improved Black-Box Attack on Deep Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3662", "id": "3662", "author_site": "Yandong li, Lijun Li, Liqiang Wang, Tong Zhang, Boqing Gong", "author": "Yandong Li; Lijun Li; Liqiang Wang; Tong Zhang; Boqing Gong", "abstract": "Powerful adversarial attack methods are vital for understanding how to construct robust deep neural networks (DNNs) and for thoroughly testing defense techniques. In this paper, we propose a black-box adversarial attack algorithm that can defeat both vanilla DNNs and those generated by various defense techniques developed recently. Instead of searching for an \"optimal\" adversarial example for a benign input to a targeted DNN, our algorithm finds a probability density distribution over a small region centered around the input, such that a sample drawn from this distribution is likely an adversarial example, without the need of accessing the DNN\u2019s internal layers or weights. Our approach is universal as it can successfully attack different neural networks by a single algorithm. It is also strong; according to the testing against 2 vanilla DNNs and 13 defended ones, it outperforms state-of-the-art black-box or white-box attack methods for most test cases. 
Additionally, our results reveal that adversarial training remains one of the best defense techniques, and the adversarial examples are not as transferable across defended DNNs as them across vanilla DNNs.", "bibtex": "@InProceedings{pmlr-v97-li19g,\n title = \t {{NATTACK}: Learning the Distributions of Adversarial Examples for an Improved Black-Box Attack on Deep Neural Networks},\n author = {Li, Yandong and Li, Lijun and Wang, Liqiang and Zhang, Tong and Gong, Boqing},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3866--3876},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19g/li19g.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19g.html},\n abstract = \t {Powerful adversarial attack methods are vital for understanding how to construct robust deep neural networks (DNNs) and for thoroughly testing defense techniques. In this paper, we propose a black-box adversarial attack algorithm that can defeat both vanilla DNNs and those generated by various defense techniques developed recently. Instead of searching for an \"optimal\" adversarial example for a benign input to a targeted DNN, our algorithm finds a probability density distribution over a small region centered around the input, such that a sample drawn from this distribution is likely an adversarial example, without the need of accessing the DNN\u2019s internal layers or weights. Our approach is universal as it can successfully attack different neural networks by a single algorithm. It is also strong; according to the testing against 2 vanilla DNNs and 13 defended ones, it outperforms state-of-the-art black-box or white-box attack methods for most test cases. Additionally, our results reveal that adversarial training remains one of the best defense techniques, and the adversarial examples are not as transferable across defended DNNs as them across vanilla DNNs.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19g/li19g.pdf", "supp": "", "pdf_size": 611563, "gs_citation": 308, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1133340624710172210&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "University of Central Florida; University of Central Florida; University of Central Florida; Hong Kong University of Science and Technology; Google", "aff_domain": "outlook.com; ; ;outlook.com; ", "email": "outlook.com; ; ;outlook.com; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/li19g.html", "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Central Florida;Hong Kong University of Science and Technology;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.ucf.edu;https://www.ust.hk;https://www.google.com", "aff_unique_abbr": "UCF;HKUST;Google", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Hong Kong SAR;Mountain View", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Natural Analysts in Adaptive Data Analysis", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3916", "id": "3916", "author_site": "Tijana Zrnic, Moritz Hardt", "author": "Tijana Zrnic; Moritz Hardt", "abstract": "Adaptive data analysis is frequently criticized for its pessimistic generalization guarantees. 
The source of these pessimistic bounds is a model that permits arbitrary, possibly adversarial analysts that optimally use information to bias results. While being a central issue in the field, still lacking are notions of natural analysts that allow for more optimistic bounds faithful to the reality that typical analysts aren\u2019t adversarial. In this work, we propose notions of natural analysts that smoothly interpolate between the optimal non-adaptive bounds and the best-known adaptive generalization bounds. To accomplish this, we model the analyst\u2019s knowledge as evolving according to the rules of an unknown dynamical system that takes in revealed information and outputs new statistical queries to the data. This allows us to restrict the analyst through different natural control-theoretic notions. One such notion corresponds to a recency bias, formalizing an inability to arbitrarily use distant information. Another complementary notion formalizes an anchoring bias, a tendency to weight initial information more strongly. Both notions come with quantitative parameters that smoothly interpolate between the non-adaptive case and the fully adaptive case, allowing for a rich spectrum of intermediate analysts that are neither non-adaptive nor adversarial. Natural not only from a cognitive perspective, we show that our notions also capture standard optimization methods, like gradient descent in various settings. This gives a new interpretation to the fact that gradient descent tends to overfit much less than its adaptive nature might suggest.", "bibtex": "@InProceedings{pmlr-v97-zrnic19a,\n title = \t {Natural Analysts in Adaptive Data Analysis},\n author = {Zrnic, Tijana and Hardt, Moritz},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7703--7711},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zrnic19a/zrnic19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zrnic19a.html},\n abstract = \t {Adaptive data analysis is frequently criticized for its pessimistic generalization guarantees. The source of these pessimistic bounds is a model that permits arbitrary, possibly adversarial analysts that optimally use information to bias results. While being a central issue in the field, still lacking are notions of natural analysts that allow for more optimistic bounds faithful to the reality that typical analysts aren\u2019t adversarial. In this work, we propose notions of natural analysts that smoothly interpolate between the optimal non-adaptive bounds and the best-known adaptive generalization bounds. To accomplish this, we model the analyst\u2019s knowledge as evolving according to the rules of an unknown dynamical system that takes in revealed information and outputs new statistical queries to the data. This allows us to restrict the analyst through different natural control-theoretic notions. One such notion corresponds to a recency bias, formalizing an inability to arbitrarily use distant information. Another complementary notion formalizes an anchoring bias, a tendency to weight initial information more strongly. 
Both notions come with quantitative parameters that smoothly interpolate between the non-adaptive case and the fully adaptive case, allowing for a rich spectrum of intermediate analysts that are neither non-adaptive nor adversarial. Natural not only from a cognitive perspective, we show that our notions also capture standard optimization methods, like gradient descent in various settings. This gives a new interpretation to the fact that gradient descent tends to overfit much less than its adaptive nature might suggest.}\n}", "pdf": "http://proceedings.mlr.press/v97/zrnic19a/zrnic19a.pdf", "supp": "", "pdf_size": 305982, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6636389822427467509&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, Berkeley, USA; Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, Berkeley, USA", "aff_domain": "eecs.berkeley.edu; ", "email": "eecs.berkeley.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/zrnic19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Electrical Engineering and Computer Sciences", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Near optimal finite time identification of arbitrary linear dynamical systems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3634", "id": "3634", "author_site": "Tuhin Sarkar, Alexander Rakhlin", "author": "Tuhin Sarkar; Alexander Rakhlin", "abstract": "We derive finite time error bounds for estimating general linear time-invariant (LTI) systems from a single observed trajectory using the method of least squares. We provide the first analysis of the general case when eigenvalues of the LTI system are arbitrarily distributed in three regimes: stable, marginally stable, and explosive. Our analysis yields sharp upper bounds for each of these cases separately. We observe that although the underlying process behaves quite differently in each of these three regimes, the systematic analysis of a self\u2013normalized martingale difference term helps bound identification error up to logarithmic factors of the lower bound. On the other hand, we demonstrate that the least squares solution may be statistically inconsistent under certain conditions even when the signal-to-noise ratio is high.", "bibtex": "@InProceedings{pmlr-v97-sarkar19a,\n title = \t {Near optimal finite time identification of arbitrary linear dynamical systems},\n author = {Sarkar, Tuhin and Rakhlin, Alexander},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5610--5618},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/sarkar19a/sarkar19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/sarkar19a.html},\n abstract = \t {We derive finite time error bounds for estimating general linear time-invariant (LTI) systems from a single observed trajectory using the method of least squares. 
We provide the first analysis of the general case when eigenvalues of the LTI system are arbitrarily distributed in three regimes: stable, marginally stable, and explosive. Our analysis yields sharp upper bounds for each of these cases separately. We observe that although the underlying process behaves quite differently in each of these three regimes, the systematic analysis of a self\u2013normalized martingale difference term helps bound identification error up to logarithmic factors of the lower bound. On the other hand, we demonstrate that the least squares solution may be statistically inconsistent under certain conditions even when the signal-to-noise ratio is high.}\n}", "pdf": "http://proceedings.mlr.press/v97/sarkar19a/sarkar19a.pdf", "supp": "", "pdf_size": 791903, "gs_citation": 231, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18006932699411453959&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Electrical Engineering and Computer Sciences, MIT; Department of Brain and Cognitive Sciences, MIT", "aff_domain": "mit.edu; ", "email": "mit.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/sarkar19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Department of Electrical Engineering and Computer Sciences", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Nearest Neighbor and Kernel Survival Analysis: Nonasymptotic Error Bounds and Strong Consistency Rates", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3653", "id": "3653", "author": "George Chen", "abstract": "We establish the first nonasymptotic error bounds for Kaplan-Meier-based nearest neighbor and kernel survival probability estimators where feature vectors reside in metric spaces. Our bounds imply rates of strong consistency for these nonparametric estimators and, up to a log factor, match an existing lower bound for conditional CDF estimation. Our proof strategy also yields nonasymptotic guarantees for nearest neighbor and kernel variants of the Nelson-Aalen cumulative hazards estimator. We experimentally compare these methods on four datasets. We find that for the kernel survival estimator, a good choice of kernel is one learned using random survival forests.", "bibtex": "@InProceedings{pmlr-v97-chen19a,\n title = \t {Nearest Neighbor and Kernel Survival Analysis: Nonasymptotic Error Bounds and Strong Consistency Rates},\n author = {Chen, George},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1001--1010},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19a/chen19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19a.html},\n abstract = \t {We establish the first nonasymptotic error bounds for Kaplan-Meier-based nearest neighbor and kernel survival probability estimators where feature vectors reside in metric spaces. Our bounds imply rates of strong consistency for these nonparametric estimators and, up to a log factor, match an existing lower bound for conditional CDF estimation. 
Our proof strategy also yields nonasymptotic guarantees for nearest neighbor and kernel variants of the Nelson-Aalen cumulative hazards estimator. We experimentally compare these methods on four datasets. We find that for the kernel survival estimator, a good choice of kernel is one learned using random survival forests.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19a/chen19a.pdf", "supp": "", "pdf_size": 2063322, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8933665534360085465&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Heinz College of Information Systems and Public Policy, Carnegie Mellon University, Pittsburgh, PA, USA", "aff_domain": "cmu.edu", "email": "cmu.edu", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/chen19a.html", "aff_unique_index": "0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "Heinz College of Information Systems and Public Policy", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Neural Collaborative Subspace Clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3595", "id": "3595", "author_site": "Tong Zhang, Pan Ji, Mehrtash Harandi, Wenbing Huang, HONGDONG LI", "author": "Tong Zhang; Pan Ji; Mehrtash Harandi; Wenbing Huang; Hongdong Li", "abstract": "We introduce the Neural Collaborative Subspace Clustering, a neural model that discovers clusters of data points drawn from a union of low-dimensional subspaces. In contrast to previous attempts, our model runs without the aid of spectral clustering. This makes our algorithm one of the kinds that can gracefully scale to large datasets. At its heart, our neural model benefits from a classifier which determines whether a pair of points lies on the same subspace or not. Essential to our model is the construction of two affinity matrices, one from the classifier and the other from a notion of subspace self-expressiveness, to supervise training in a collaborative scheme. We thoroughly assess and contrast the performance of our model against various state-of-the-art clustering algorithms including deep subspace-based ones.", "bibtex": "@InProceedings{pmlr-v97-zhang19g,\n title = \t {Neural Collaborative Subspace Clustering},\n author = {Zhang, Tong and Ji, Pan and Harandi, Mehrtash and Huang, Wenbing and Li, Hongdong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7384--7393},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19g/zhang19g.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19g.html},\n abstract = \t {We introduce the Neural Collaborative Subspace Clustering, a neural model that discovers clusters of data points drawn from a union of low-dimensional subspaces. In contrast to previous attempts, our model runs without the aid of spectral clustering. This makes our algorithm one of the kinds that can gracefully scale to large datasets. At its heart, our neural model benefits from a classifier which determines whether a pair of points lies on the same subspace or not. 
Essential to our model is the construction of two affinity matrices, one from the classifier and the other from a notion of subspace self-expressiveness, to supervise training in a collaborative scheme. We thoroughly assess and contrast the performance of our model against various state-of-the-art clustering algorithms including deep subspace-based ones.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19g/zhang19g.pdf", "supp": "", "pdf_size": 1924729, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16795462391679761524&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Motovis Australia Pty Ltd+Australian National University; NEC Labs America; Monash University; Tencent AI Lab; Australian National University", "aff_domain": "gmail.com; ; ; ; ", "email": "gmail.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/zhang19g.html", "aff_unique_index": "0+1;2;3;4;1", "aff_unique_norm": "Motovis Australia Pty Ltd;Australian National University;NEC Labs America;Monash University;Tencent", "aff_unique_dep": ";;;;Tencent AI Lab", "aff_unique_url": ";https://www.anu.edu.au;https://www.nec-labs.com;https://www.monash.edu;https://ai.tencent.com", "aff_unique_abbr": ";ANU;NEC LA;Monash;Tencent AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0;2;0", "aff_country_unique": "Australia;United States;China" }, { "title": "Neural Inverse Knitting: From Images to Manufacturing Instructions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3617", "id": "3617", "author_site": "Alexandre Kaspar, Tae-Hyun Oh, Liane Makatura, Petr Kellnhofer, Wojciech Matusik", "author": "Alexandre Kaspar; Tae-Hyun Oh; Liane Makatura; Petr Kellnhofer; Wojciech Matusik", "abstract": "Motivated by the recent potential of mass customization brought by whole-garment knitting machines, we introduce the new problem of automatic machine instruction generation using a single image of the desired physical product, which we apply to machine knitting. We propose to tackle this problem by directly learning to synthesize regular machine instructions from real images. We create a cured dataset of real samples with their instruction counterpart and propose to use synthetic images to augment it in a novel way. We theoretically motivate our data mixing framework and show empirical results suggesting that making real images look more synthetic is beneficial in our problem setup.", "bibtex": "@InProceedings{pmlr-v97-kaspar19a,\n title = \t {Neural Inverse Knitting: From Images to Manufacturing Instructions},\n author = {Kaspar, Alexandre and Oh, Tae-Hyun and Makatura, Liane and Kellnhofer, Petr and Matusik, Wojciech},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3272--3281},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kaspar19a/kaspar19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kaspar19a.html},\n abstract = \t {Motivated by the recent potential of mass customization brought by whole-garment knitting machines, we introduce the new problem of automatic machine instruction generation using a single image of the desired physical product, which we apply to machine knitting. 
We propose to tackle this problem by directly learning to synthesize regular machine instructions from real images. We create a cured dataset of real samples with their instruction counterpart and propose to use synthetic images to augment it in a novel way. We theoretically motivate our data mixing framework and show empirical results suggesting that making real images look more synthetic is beneficial in our problem setup.}\n}", "pdf": "http://proceedings.mlr.press/v97/kaspar19a/kaspar19a.pdf", "supp": "", "pdf_size": 3653161, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15939506219703518176&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "Computer Science & Artificial Intelligence Laboratory (CSAIL), Massachusetts Institute of Technology (MIT), Cambridge, MA, USA; Computer Science & Artificial Intelligence Laboratory (CSAIL), Massachusetts Institute of Technology (MIT), Cambridge, MA, USA; Computer Science & Artificial Intelligence Laboratory (CSAIL), Massachusetts Institute of Technology (MIT), Cambridge, MA, USA; Computer Science & Artificial Intelligence Laboratory (CSAIL), Massachusetts Institute of Technology (MIT), Cambridge, MA, USA; Computer Science & Artificial Intelligence Laboratory (CSAIL), Massachusetts Institute of Technology (MIT), Cambridge, MA, USA", "aff_domain": "mit.edu; ; ; ; ", "email": "mit.edu; ; ; ; ", "github": "", "project": "http://deepknitting.csail.mit.edu", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/kaspar19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "Computer Science & Artificial Intelligence Laboratory", "aff_unique_url": "https://www.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neural Joint Source-Channel Coding", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3941", "id": "3941", "author_site": "Kristy Choi, Kedar Tatwawadi, Aditya Grover, Tsachy Weissman, Stefano Ermon", "author": "Kristy Choi; Kedar Tatwawadi; Aditya Grover; Tsachy Weissman; Stefano Ermon", "abstract": "For reliable transmission across a noisy communication channel, classical results from information theory show that it is asymptotically optimal to separate out the source and channel coding processes. However, this decomposition can fall short in the finite bit-length regime, as it requires non-trivial tuning of hand-crafted codes and assumes infinite computational power for decoding. In this work, we propose to jointly learn the encoding and decoding processes using a new discrete variational autoencoder model. By adding noise into the latent codes to simulate the channel during training, we learn to both compress and error-correct given a fixed bit-length and computational budget. We obtain codes that are not only competitive against several separation schemes, but also learn useful robust representations of the data for downstream tasks such as classification. 
Finally, inference amortization yields an extremely fast neural decoder, almost an order of magnitude faster compared to standard decoding methods based on iterative belief propagation.", "bibtex": "@InProceedings{pmlr-v97-choi19a,\n title = \t {Neural Joint Source-Channel Coding},\n author = {Choi, Kristy and Tatwawadi, Kedar and Grover, Aditya and Weissman, Tsachy and Ermon, Stefano},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1182--1192},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/choi19a/choi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/choi19a.html},\n abstract = \t {For reliable transmission across a noisy communication channel, classical results from information theory show that it is asymptotically optimal to separate out the source and channel coding processes. However, this decomposition can fall short in the finite bit-length regime, as it requires non-trivial tuning of hand-crafted codes and assumes infinite computational power for decoding. In this work, we propose to jointly learn the encoding and decoding processes using a new discrete variational autoencoder model. By adding noise into the latent codes to simulate the channel during training, we learn to both compress and error-correct given a fixed bit-length and computational budget. We obtain codes that are not only competitive against several separation schemes, but also learn useful robust representations of the data for downstream tasks such as classification. Finally, inference amortization yields an extremely fast neural decoder, almost an order of magnitude faster compared to standard decoding methods based on iterative belief propagation.}\n}", "pdf": "http://proceedings.mlr.press/v97/choi19a/choi19a.pdf", "supp": "", "pdf_size": 1108612, "gs_citation": 142, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13260217163651536800&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Stanford University; Department of Electrical Engineering, Stanford University; Department of Computer Science, Stanford University; Department of Electrical Engineering, Stanford University; Department of Computer Science, Stanford University", "aff_domain": "cs.stanford.edu; ; ; ; ", "email": "cs.stanford.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/choi19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Neural Logic Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3834", "id": "3834", "author_site": "zhengyao jiang, Shan Luo", "author": "Zhengyao Jiang; Shan Luo", "abstract": "Deep reinforcement learning (DRL) has achieved significant breakthroughs in various tasks. However, most DRL algorithms suffer a problem of generalising the learned policy, which makes the policy performance largely affected even by minor modifications of the training environment. 
Except that, the use of deep neural networks makes the learned policies hard to be interpretable. To address these two challenges, we propose a novel algorithm named Neural Logic Reinforcement Learning (NLRL) to represent the policies in reinforcement learning by first-order logic. NLRL is based on policy gradient methods and differentiable inductive logic programming that have demonstrated significant advantages in terms of interpretability and generalisability in supervised tasks. Extensive experiments conducted on cliff-walking and blocks manipulation tasks demonstrate that NLRL can induce interpretable policies achieving near-optimal performance while showing good generalisability to environments of different initial states and problem sizes.", "bibtex": "@InProceedings{pmlr-v97-jiang19a,\n title = \t {Neural Logic Reinforcement Learning},\n author = {Jiang, Zhengyao and Luo, Shan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3110--3119},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jiang19a/jiang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jiang19a.html},\n abstract = \t {Deep reinforcement learning (DRL) has achieved significant breakthroughs in various tasks. However, most DRL algorithms suffer a problem of generalising the learned policy, which makes the policy performance largely affected even by minor modifications of the training environment. Except that, the use of deep neural networks makes the learned policies hard to be interpretable. To address these two challenges, we propose a novel algorithm named Neural Logic Reinforcement Learning (NLRL) to represent the policies in reinforcement learning by first-order logic. NLRL is based on policy gradient methods and differentiable inductive logic programming that have demonstrated significant advantages in terms of interpretability and generalisability in supervised tasks. 
Extensive experiments conducted on cliff-walking and blocks manipulation tasks demonstrate that NLRL can induce interpretable policies achieving near-optimal performance while showing good generalisability to environments of different initial states and problem sizes.}\n}", "pdf": "http://proceedings.mlr.press/v97/jiang19a/jiang19a.pdf", "supp": "", "pdf_size": 321951, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18074632043038701502&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, University of Liverpool, Liverpool, United Kingdom; Department of Computer Science, University of Liverpool, Liverpool, United Kingdom", "aff_domain": "student.liverpool.ac.uk;liverpool.ac.uk", "email": "student.liverpool.ac.uk;liverpool.ac.uk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/jiang19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Liverpool", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.liverpool.ac.uk", "aff_unique_abbr": "Liv Uni", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Liverpool", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Neural Network Attributions: A Causal Perspective", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4003", "id": "4003", "author_site": "Aditya Chattopadhyay, Piyushi Manupriya, Anirban Sarkar, Vineeth N Balasubramanian", "author": "Aditya Chattopadhyay; Piyushi Manupriya; Anirban Sarkar; Vineeth N Balasubramanian", "abstract": "We propose a new attribution method for neural networks developed using \ufb01rst principles of causality (to the best of our knowledge, the \ufb01rst such). The neural network architecture is viewed as a Structural Causal Model, and a methodology to compute the causal effect of each feature on the output is presented. With reasonable assumptions on the causal structure of the input data, we propose algorithms to ef\ufb01ciently compute the causal effects, as well as scale the approach to data with large dimensionality. We also show how this method can be used for recurrent neural networks. We report experimental results on both simulated and real datasets showcasing the promise and usefulness of the proposed algorithm.", "bibtex": "@InProceedings{pmlr-v97-chattopadhyay19a,\n title = \t {Neural Network Attributions: A Causal Perspective},\n author = {Chattopadhyay, Aditya and Manupriya, Piyushi and Sarkar, Anirban and Balasubramanian, Vineeth N},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {981--990},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chattopadhyay19a/chattopadhyay19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chattopadhyay19a.html},\n abstract = \t {We propose a new attribution method for neural networks developed using \ufb01rst principles of causality (to the best of our knowledge, the \ufb01rst such). The neural network architecture is viewed as a Structural Causal Model, and a methodology to compute the causal effect of each feature on the output is presented. 
With reasonable assumptions on the causal structure of the input data, we propose algorithms to ef\ufb01ciently compute the causal effects, as well as scale the approach to data with large dimensionality. We also show how this method can be used for recurrent neural networks. We report experimental results on both simulated and real datasets showcasing the promise and usefulness of the proposed algorithm.}\n}", "pdf": "http://proceedings.mlr.press/v97/chattopadhyay19a/chattopadhyay19a.pdf", "supp": "", "pdf_size": 3029382, "gs_citation": 181, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6244906153620520343&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Center for Imaging Science, Johns Hopkins University, Baltimore, USA; Department of Computer Science and Engineering, Indian Institute of Technology Hyderabad, Telangana, India; Department of Computer Science and Engineering, Indian Institute of Technology Hyderabad, Telangana, India; Department of Computer Science and Engineering, Indian Institute of Technology Hyderabad, Telangana, India", "aff_domain": "jhu.edu; ; ;iith.ac.in", "email": "jhu.edu; ; ;iith.ac.in", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/chattopadhyay19a.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Johns Hopkins University;Indian Institute of Technology Hyderabad", "aff_unique_dep": "Center for Imaging Science;Department of Computer Science and Engineering", "aff_unique_url": "https://www.jhu.edu;https://www.iith.ac.in", "aff_unique_abbr": "JHU;IIT Hyderabad", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Baltimore;Hyderabad", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;India" }, { "title": "Neural Separation of Observed and Unobserved Distributions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3603", "id": "3603", "author_site": "Tavi Halperin, Ariel Ephrat, Yedid Hoshen", "author": "Tavi Halperin; Ariel Ephrat; Yedid Hoshen", "abstract": "Separating mixed distributions is a long standing challenge for machine learning and signal processing. Most current methods either rely on making strong assumptions on the source distributions or rely on having training samples of each source in the mixture. In this work, we introduce a new method\u2014Neural Egg Separation\u2014to tackle the scenario of extracting a signal from an unobserved distribution additively mixed with a signal from an observed distribution. Our method iteratively learns to separate the known distribution from progressively finer estimates of the unknown distribution. In some settings, Neural Egg Separation is initialization sensitive, we therefore introduce Latent Mixture Masking which ensures a good initialization. 
Extensive experiments on audio and image separation tasks show that our method outperforms current methods that use the same level of supervision, and often achieves similar performance to full supervision.", "bibtex": "@InProceedings{pmlr-v97-halperin19a,\n title = \t {Neural Separation of Observed and Unobserved Distributions},\n author = {Halperin, Tavi and Ephrat, Ariel and Hoshen, Yedid},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2566--2575},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/halperin19a/halperin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/halperin19a.html},\n abstract = \t {Separating mixed distributions is a long standing challenge for machine learning and signal processing. Most current methods either rely on making strong assumptions on the source distributions or rely on having training samples of each source in the mixture. In this work, we introduce a new method\u2014Neural Egg Separation\u2014to tackle the scenario of extracting a signal from an unobserved distribution additively mixed with a signal from an observed distribution. Our method iteratively learns to separate the known distribution from progressively finer estimates of the unknown distribution. In some settings, Neural Egg Separation is initialization sensitive, we therefore introduce Latent Mixture Masking which ensures a good initialization. Extensive experiments on audio and image separation tasks show that our method outperforms current methods that use the same level of supervision, and often achieves similar performance to full supervision.}\n}", "pdf": "http://proceedings.mlr.press/v97/halperin19a/halperin19a.pdf", "supp": "", "pdf_size": 562688, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3583406841206585968&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, The Hebrew University of Jerusalem, Jerusalem, Israel; Google Research; Facebook AI Research", "aff_domain": "mail.huji.ac.il; ; ", "email": "mail.huji.ac.il; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/halperin19a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Hebrew University of Jerusalem;Google;Meta", "aff_unique_dep": "Department of Computer Science;Google Research;Facebook AI Research", "aff_unique_url": "https://www.huji.ac.il;https://research.google;https://research.facebook.com", "aff_unique_abbr": "HUJI;Google Research;FAIR", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Jerusalem;Mountain View;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Israel;United States" }, { "title": "Neurally-Guided Structure Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4250", "id": "4250", "author_site": "Sidi Lu, Jiayuan Mao, Josh Tenenbaum, Jiajun Wu", "author": "Sidi Lu; Jiayuan Mao; Joshua Tenenbaum; Jiajun Wu", "abstract": "Most structure inference methods either rely on exhaustive search or are purely data-driven. Exhaustive search robustly infers the structure of arbitrarily complex data, but it is slow. Data-driven methods allow efficient inference, but do not generalize when test data have more complex structures than training data. 
In this paper, we propose a hybrid inference algorithm, the Neurally-Guided Structure Inference (NG-SI), keeping the advantages of both search-based and data-driven methods. The key idea of NG-SI is to use a neural network to guide the hierarchical, layer-wise search over the compositional space of structures. We evaluate our algorithm on two representative structure inference tasks: probabilistic matrix decomposition and symbolic program parsing. It outperforms data-driven and search-based alternatives on both tasks.", "bibtex": "@InProceedings{pmlr-v97-lu19b,\n title = \t {Neurally-Guided Structure Inference},\n author = {Lu, Sidi and Mao, Jiayuan and Tenenbaum, Joshua and Wu, Jiajun},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4144--4153},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lu19b/lu19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/lu19b.html},\n abstract = \t {Most structure inference methods either rely on exhaustive search or are purely data-driven. Exhaustive search robustly infers the structure of arbitrarily complex data, but it is slow. Data-driven methods allow efficient inference, but do not generalize when test data have more complex structures than training data. In this paper, we propose a hybrid inference algorithm, the Neurally-Guided Structure Inference (NG-SI), keeping the advantages of both search-based and data-driven methods. The key idea of NG-SI is to use a neural network to guide the hierarchical, layer-wise search over the compositional space of structures. We evaluate our algorithm on two representative structure inference tasks: probabilistic matrix decomposition and symbolic program parsing. 
It outperforms data-driven and search-based alternatives on both tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/lu19b/lu19b.pdf", "supp": "", "pdf_size": 1544631, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5257130458357507277&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Shanghai Jiao Tong University; MIT CSAIL + IIIS, Tsinghua University; Department of Brain and Cognitive Sciences, MIT + Center for Brains, Minds and Machines (CBMM), MIT; MIT", "aff_domain": "apex.sjtu.edu.cn;mit.edu; ;mit.edu", "email": "apex.sjtu.edu.cn;mit.edu; ;mit.edu", "github": "", "project": "http://ngsi.csail.mit.edu", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/lu19b.html", "aff_unique_index": "0;1+2;1+1;1", "aff_unique_norm": "Shanghai Jiao Tong University;Massachusetts Institute of Technology;Tsinghua University", "aff_unique_dep": ";Computer Science and Artificial Intelligence Laboratory;Institute for Interdisciplinary Information Sciences", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.csail.mit.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "SJTU;MIT CSAIL;THU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Cambridge", "aff_country_unique_index": "0;1+0;1+1;1", "aff_country_unique": "China;United States" }, { "title": "Neuron birth-death dynamics accelerates gradient descent and converges asymptotically", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4286", "id": "4286", "author_site": "Grant Rotskoff, Samy Jelassi, Joan Bruna, Eric Vanden-Eijnden", "author": "Grant Rotskoff; Samy Jelassi; Joan Bruna; Eric Vanden-Eijnden", "abstract": "Neural networks with a large number of parameters admit a mean-field description, which has recently served as a theoretical explanation for the favorable training properties of models with a large number of parameters. In this regime, gradient descent obeys a deterministic partial differential equation (PDE) that converges to a globally optimal solution for networks with a single hidden layer under appropriate assumptions. In this work, we propose a non-local mass transport dynamics that leads to a modified PDE with the same minimizer. We implement this non-local dynamics as a stochastic neuronal birth/death process and we prove that it accelerates the rate of convergence in the mean-field limit. We subsequently realize this PDE with two classes of numerical schemes that converge to the mean-field equation, each of which can easily be implemented for neural networks with finite numbers of parameters. 
We illustrate our algorithms with two models to provide intuition for the mechanism through which convergence is accelerated.", "bibtex": "@InProceedings{pmlr-v97-rotskoff19a,\n title = \t {Neuron birth-death dynamics accelerates gradient descent and converges asymptotically},\n author = {Rotskoff, Grant and Jelassi, Samy and Bruna, Joan and Vanden-Eijnden, Eric},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5508--5517},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rotskoff19a/rotskoff19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rotskoff19a.html},\n abstract = \t {Neural networks with a large number of parameters admit a mean-field description, which has recently served as a theoretical explanation for the favorable training properties of models with a large number of parameters. In this regime, gradient descent obeys a deterministic partial differential equation (PDE) that converges to a globally optimal solution for networks with a single hidden layer under appropriate assumptions. In this work, we propose a non-local mass transport dynamics that leads to a modified PDE with the same minimizer. We implement this non-local dynamics as a stochastic neuronal birth/death process and we prove that it accelerates the rate of convergence in the mean-field limit. We subsequently realize this PDE with two classes of numerical schemes that converge to the mean-field equation, each of which can easily be implemented for neural networks with finite numbers of parameters. We illustrate our algorithms with two models to provide intuition for the mechanism through which convergence is accelerated.}\n}", "pdf": "http://proceedings.mlr.press/v97/rotskoff19a/rotskoff19a.pdf", "supp": "", "pdf_size": 610882, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1894588217884714871&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Courant Institute, New York University, New York, USA+Center for Data Science, New York University, New York, USA; Center for Data Science, New York University, New York, USA+Princeton University, Princeton, New Jersey, USA; Courant Institute, New York University, New York, USA+Center for Data Science, New York University, New York, USA; Courant Institute, New York University, New York, USA", "aff_domain": "cims.nyu.edu; ; ; ", "email": "cims.nyu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/rotskoff19a.html", "aff_unique_index": "0+0;0+1;0+0;0", "aff_unique_norm": "New York University;Princeton University", "aff_unique_dep": "Courant Institute;", "aff_unique_url": "https://www.nyu.edu;https://www.princeton.edu", "aff_unique_abbr": "NYU;Princeton", "aff_campus_unique_index": "0+0;0+1;0+0;0", "aff_campus_unique": "New York;Princeton", "aff_country_unique_index": "0+0;0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "New results on information theoretic clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3668", "id": "3668", "author_site": "Ferdinando Cicalese, Eduardo Laber, Lucas Murtinho", "author": "Ferdinando Cicalese; Eduardo Laber; Lucas Murtinho", "abstract": "We study the problem of optimizing the clustering of a set of vectors when the quality of the 
clustering is measured by the Entropy or the Gini impurity measure. Our results contribute to the state of the art both in terms of best known approximation guarantees and inapproximability bounds: (i) we give the first polynomial time algorithm for Entropy impurity based clustering with approximation guarantee independent of the number of vectors and (ii) we show that the problem of clustering based on entropy impurity does not admit a PTAS. This also implies an inapproximability result in information theoretic clustering for probability distributions closing a problem left open in [Chaudhury and McGregor, COLT08] and [Ackermann et al., ECCC11]. We also report experiments with a new clustering method that was designed on top of the theoretical tools leading to the above results. These experiments suggest a practical applicability for our method, in particular, when the number of clusters is large.", "bibtex": "@InProceedings{pmlr-v97-cicalese19a,\n title = \t {New results on information theoretic clustering},\n author = {Cicalese, Ferdinando and Laber, Eduardo and Murtinho, Lucas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1242--1251},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cicalese19a/cicalese19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cicalese19a.html},\n abstract = \t {We study the problem of optimizing the clustering of a set of vectors when the quality of the clustering is measured by the Entropy or the Gini impurity measure. Our results contribute to the state of the art both in terms of best known approximation guarantees and inapproximability bounds: (i) we give the first polynomial time algorithm for Entropy impurity based clustering with approximation guarantee independent of the number of vectors and (ii) we show that the problem of clustering based on entropy impurity does not admit a PTAS. This also implies an inapproximability result in information theoretic clustering for probability distributions closing a problem left open in [Chaudhury and McGregor, COLT08] and [Ackermann et al., ECCC11]. We also report experiments with a new clustering method that was designed on top of the theoretical tools leading to the above results. 
These experiments suggest a practical applicability for our method, in particular, when the number of clusters is large.}\n}", "pdf": "http://proceedings.mlr.press/v97/cicalese19a/cicalese19a.pdf", "supp": "", "pdf_size": 386369, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8873394002042683542&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Department of Computer Science, University of Verona, Verona, Italy; Departamento de Inform\u00e1tica, PUC-RIO, Rio de Janeiro, Brazil; Departamento de Inform\u00e1tica, PUC-RIO, Rio de Janeiro, Brazil", "aff_domain": "univr.it;gmail.com; ", "email": "univr.it;gmail.com; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/cicalese19a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Verona;Pontif\u00edcia Universidade Cat\u00f3lica do Rio de Janeiro", "aff_unique_dep": "Department of Computer Science;Departamento de Inform\u00e1tica", "aff_unique_url": "https://www.univr.it;https://www.puc-rio.br", "aff_unique_abbr": ";PUC-RIO", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Verona;Rio de Janeiro", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Italy;Brazil" }, { "title": "Noise2Self: Blind Denoising by Self-Supervision", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4200", "id": "4200", "author_site": "Joshua Batson, Loic Royer", "author": "Joshua Batson; Loic Royer", "abstract": "We propose a general framework for denoising high-dimensional measurements which requires no prior on the signal, no estimate of the noise, and no clean training data. The only assumption is that the noise exhibits statistical independence across different dimensions of the measurement, while the true signal exhibits some correlation. For a broad class of functions (\u201c$\\mathcal{J}$-invariant\u201d), it is then possible to estimate the performance of a denoiser from noisy data alone. This allows us to calibrate $\\mathcal{J}$-invariant versions of any parameterised denoising algorithm, from the single hyperparameter of a median filter to the millions of weights of a deep neural network. We demonstrate this on natural image and microscopy data, where we exploit noise independence between pixels, and on single-cell gene expression data, where we exploit independence between detections of individual molecules. This framework generalizes recent work on training neural nets from noisy images and on cross-validation for matrix factorization.", "bibtex": "@InProceedings{pmlr-v97-batson19a,\n title = \t {{N}oise2{S}elf: Blind Denoising by Self-Supervision},\n author = {Batson, Joshua and Royer, Loic},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {524--533},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/batson19a/batson19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/batson19a.html},\n abstract = \t {We propose a general framework for denoising high-dimensional measurements which requires no prior on the signal, no estimate of the noise, and no clean training data. The only assumption is that the noise exhibits statistical independence across different dimensions of the measurement, while the true signal exhibits some correlation. 
For a broad class of functions (\u201c$\\mathcal{J}$-invariant\u201d), it is then possible to estimate the performance of a denoiser from noisy data alone. This allows us to calibrate $\\mathcal{J}$-invariant versions of any parameterised denoising algorithm, from the single hyperparameter of a median filter to the millions of weights of a deep neural network. We demonstrate this on natural image and microscopy data, where we exploit noise independence between pixels, and on single-cell gene expression data, where we exploit independence between detections of individual molecules. This framework generalizes recent work on training neural nets from noisy images and on cross-validation for matrix factorization.}\n}", "pdf": "http://proceedings.mlr.press/v97/batson19a/batson19a.pdf", "supp": "", "pdf_size": 5334627, "gs_citation": 877, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16484478987296907806&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Chan-Zuckerberg Biohub; Chan-Zuckerberg Biohub", "aff_domain": "czbiohub.org;czbiohub.org", "email": "czbiohub.org;czbiohub.org", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/batson19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Chan-Zuckerberg Biohub", "aff_unique_dep": "", "aff_unique_url": "https://www.chanzuckerberg.com/biohub", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Noisy Dual Principal Component Pursuit", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3955", "id": "3955", "author_site": "Tianyu Ding, Zhihui Zhu, Tianjiao Ding, Yunchen Yang, Daniel Robinson, Manolis Tsakiris, Rene Vidal", "author": "Tianyu Ding; Zhihui Zhu; Tianjiao Ding; Yunchen Yang; Rene Vidal; Manolis Tsakiris; Daniel Robinson", "abstract": "Dual Principal Component Pursuit (DPCP) is a recently proposed non-convex optimization based method for learning subspaces of high relative dimension from noiseless datasets contaminated by as many outliers as the square of the number of inliers. Experimentally, DPCP has proved to be robust to noise and outperform the popular RANSAC on 3D vision tasks such as road plane detection and relative poses estimation from three views. This paper extends the global optimality and convergence theory of DPCP to the case of data corrupted by noise, and further demonstrates its robustness using synthetic and real data.", "bibtex": "@InProceedings{pmlr-v97-ding19b,\n title = \t {Noisy Dual Principal Component Pursuit},\n author = {Ding, Tianyu and Zhu, Zhihui and Ding, Tianjiao and Yang, Yunchen and Vidal, Rene and Tsakiris, Manolis and Robinson, Daniel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1617--1625},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ding19b/ding19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/ding19b.html},\n abstract = \t {Dual Principal Component Pursuit (DPCP) is a recently proposed non-convex optimization based method for learning subspaces of high relative dimension from noiseless datasets contaminated by as many outliers as the square of the number of inliers. 
Experimentally, DPCP has proved to be robust to noise and outperform the popular RANSAC on 3D vision tasks such as road plane detection and relative poses estimation from three views. This paper extends the global optimality and convergence theory of DPCP to the case of data corrupted by noise, and further demonstrates its robustness using synthetic and real data.}\n}", "pdf": "http://proceedings.mlr.press/v97/ding19b/ding19b.pdf", "supp": "", "pdf_size": 9488877, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12723164251230278115&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "Department of Applied Mathematics & Statistics, Johns Hopkins University, USA+Mathematical Institute for Data Science, Johns Hopkins University, USA; Department of Applied Mathematics & Statistics, Johns Hopkins University, USA+Mathematical Institute for Data Science, Johns Hopkins University, USA; School of Information Science and Technology, ShanghaiTech University, China; School of Information Science and Technology, ShanghaiTech University, China; Mathematical Institute for Data Science, Johns Hopkins University, USA; School of Information Science and Technology, ShanghaiTech University, China; Department of Applied Mathematics & Statistics, Johns Hopkins University, USA", "aff_domain": "jhu.edu;jhu.edu; ; ; ; ; ", "email": "jhu.edu;jhu.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/ding19b.html", "aff_unique_index": "0+0;0+0;1;1;0;1;0", "aff_unique_norm": "Johns Hopkins University;ShanghaiTech University", "aff_unique_dep": "Department of Applied Mathematics & Statistics;School of Information Science and Technology", "aff_unique_url": "https://www.jhu.edu;https://www.shanghaitech.edu.cn", "aff_unique_abbr": "JHU;ShanghaiTech", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0+0;1;1;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Non-Asymptotic Analysis of Fractional Langevin Monte Carlo for Non-Convex Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3559", "id": "3559", "author_site": "Thanh Huy Nguyen, Umut Simsekli, Ga\u00ebl RICHARD", "author": "Than Huy Nguyen; Umut Simsekli; Gael Richard", "abstract": "Recent studies on diffusion-based sampling methods have shown that Langevin Monte Carlo (LMC) algorithms can be beneficial for non-convex optimization, and rigorous theoretical guarantees have been proven for both asymptotic and finite-time regimes. Algorithmically, LMC-based algorithms resemble the well-known gradient descent (GD) algorithm, where the GD recursion is perturbed by an additive Gaussian noise whose variance has a particular form. Fractional Langevin Monte Carlo (FLMC) is a recently proposed extension of LMC, where the Gaussian noise is replaced by a heavy-tailed $\\alpha$-stable noise. As opposed to its Gaussian counterpart, these heavy-tailed perturbations can incur large jumps and it has been empirically demonstrated that the choice of $\\alpha$-stable noise can provide several advantages in modern machine learning problems, both in optimization and sampling contexts. However, as opposed to LMC, only asymptotic convergence properties of FLMC have been yet established. In this study, we analyze the non-asymptotic behavior of FLMC for non-convex optimization and prove finite-time bounds for its expected suboptimality. 
Our results show that the weak-error of FLMC increases faster than LMC, which suggests using smaller step-sizes in FLMC. We finally extend our results to the case where the exact gradients are replaced by stochastic gradients and show that similar results hold in this setting as well.", "bibtex": "@InProceedings{pmlr-v97-nguyen19c,\n title = \t {Non-Asymptotic Analysis of Fractional {L}angevin {M}onte {C}arlo for Non-Convex Optimization},\n author = {Nguyen, Than Huy and Simsekli, Umut and Richard, Gael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4810--4819},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nguyen19c/nguyen19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/nguyen19c.html},\n abstract = \t {Recent studies on diffusion-based sampling methods have shown that Langevin Monte Carlo (LMC) algorithms can be beneficial for non-convex optimization, and rigorous theoretical guarantees have been proven for both asymptotic and finite-time regimes. Algorithmically, LMC-based algorithms resemble the well-known gradient descent (GD) algorithm, where the GD recursion is perturbed by an additive Gaussian noise whose variance has a particular form. Fractional Langevin Monte Carlo (FLMC) is a recently proposed extension of LMC, where the Gaussian noise is replaced by a heavy-tailed $\\alpha$-stable noise. As opposed to its Gaussian counterpart, these heavy-tailed perturbations can incur large jumps and it has been empirically demonstrated that the choice of $\\alpha$-stable noise can provide several advantages in modern machine learning problems, both in optimization and sampling contexts. However, as opposed to LMC, only asymptotic convergence properties of FLMC have been yet established. In this study, we analyze the non-asymptotic behavior of FLMC for non-convex optimization and prove finite-time bounds for its expected suboptimality. Our results show that the weak-error of FLMC increases faster than LMC, which suggests using smaller step-sizes in FLMC. 
We finally extend our results to the case where the exact gradients are replaced by stochastic gradients and show that similar results hold in this setting as well.}\n}", "pdf": "http://proceedings.mlr.press/v97/nguyen19c/nguyen19c.pdf", "supp": "", "pdf_size": 401211, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10372830718207535051&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "LTCI, T\u00b4el\u00b4ecom ParisTech, Universit\u00b4e Paris-Saclay, 75013, Paris, France; LTCI, T\u00b4el\u00b4ecom ParisTech, Universit\u00b4e Paris-Saclay, 75013, Paris, France; LTCI, T\u00b4el\u00b4ecom ParisTech, Universit\u00b4e Paris-Saclay, 75013, Paris, France", "aff_domain": "telecom-paristech.fr; ; ", "email": "telecom-paristech.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/nguyen19c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "T\u00e9l\u00e9com ParisTech", "aff_unique_dep": "LTCI", "aff_unique_url": "https://www.telecom-paris.fr", "aff_unique_abbr": "T\u00e9l\u00e9com ParisTech", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Paris", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Non-Monotonic Sequential Text Generation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4053", "id": "4053", "author_site": "Sean Welleck, Kiante Brantley, Hal Daum\u00e9 III, Kyunghyun Cho", "author": "Sean Welleck; Kiant\u00e9 Brantley; Hal Daum\u00e9 Iii; Kyunghyun Cho", "abstract": "Standard sequential generation methods assume a pre-specified generation order, such as text generation methods which generate words from left to right. In this work, we propose a framework for training models of text generation that operate in non-monotonic orders; the model directly learns good orders, without any additional annotation. Our framework operates by generating a word at an arbitrary position, and then recursively generating words to its left and then words to its right, yielding a binary tree. Learning is framed as imitation learning, including a coaching method which moves from imitating an oracle to reinforcing the policy\u2019s own preferences. Experimental results demonstrate that using the proposed method, it is possible to learn policies which generate text without pre-specifying a generation order, while achieving competitive performance with conventional left-to-right generation.", "bibtex": "@InProceedings{pmlr-v97-welleck19a,\n title = \t {Non-Monotonic Sequential Text Generation},\n author = {Welleck, Sean and Brantley, Kiant{\\'e} and Iii, Hal Daum{\\'e} and Cho, Kyunghyun},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6716--6726},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/welleck19a/welleck19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/welleck19a.html},\n abstract = \t {Standard sequential generation methods assume a pre-specified generation order, such as text generation methods which generate words from left to right. In this work, we propose a framework for training models of text generation that operate in non-monotonic orders; the model directly learns good orders, without any additional annotation. 
Our framework operates by generating a word at an arbitrary position, and then recursively generating words to its left and then words to its right, yielding a binary tree. Learning is framed as imitation learning, including a coaching method which moves from imitating an oracle to reinforcing the policy\u2019s own preferences. Experimental results demonstrate that using the proposed method, it is possible to learn policies which generate text without pre-specifying a generation order, while achieving competitive performance with conventional left-to-right generation.}\n}", "pdf": "http://proceedings.mlr.press/v97/welleck19a/welleck19a.pdf", "supp": "", "pdf_size": 958749, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16018486661840997659&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "New York University; University of Maryland, College Park; Microsoft Research; Facebook AI Research + CIFAR Azrieli Global Scholar", "aff_domain": "nyu.edu; ; ; ", "email": "nyu.edu; ; ; ", "github": "https://github.com/wellecks/nonmonotonic_text", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/welleck19a.html", "aff_unique_index": "0;1;2;3+4", "aff_unique_norm": "New York University;University of Maryland;Microsoft;Meta;CIFAR", "aff_unique_dep": ";;Microsoft Research;Facebook AI Research;Azrieli Global Scholar", "aff_unique_url": "https://www.nyu.edu;https://www/umd.edu;https://www.microsoft.com/en-us/research;https://research.facebook.com;https://www.cifar.ca", "aff_unique_abbr": "NYU;UMD;MSR;FAIR;CIFAR", "aff_campus_unique_index": "1;", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0+1", "aff_country_unique": "United States;Canada" }, { "title": "Non-Parametric Priors For Generative Adversarial Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3685", "id": "3685", "author_site": "Rajhans Singh, Pavan Turaga, Suren Jayasuriya, Ravi Garg, Martin Braun", "author": "Rajhans Singh; Pavan Turaga; Suren Jayasuriya; Ravi Garg; Martin Braun", "abstract": "The advent of generative adversarial networks (GAN) has enabled new capabilities in synthesis, interpolation, and data augmentation heretofore considered very challenging. However, one of the common assumptions in most GAN architectures is the assumption of simple parametric latent-space distributions. While easy to implement, a simple latent-space distribution can be problematic for uses such as interpolation. This is due to distributional mismatches when samples are interpolated in the latent space. We present a straightforward formalization of this problem; using basic results from probability theory and off-the-shelf-optimization tools, we develop ways to arrive at appropriate non-parametric priors. The obtained prior exhibits unusual qualitative properties in terms of its shape, and quantitative benefits in terms of lower divergence with its mid-point distribution. We demonstrate that our designed prior helps improve image generation along any Euclidean straight line during interpolation, both qualitatively and quantitatively, without any additional training or architectural modifications. 
The proposed formulation is quite flexible, paving the way to impose newer constraints on the latent-space statistics.", "bibtex": "@InProceedings{pmlr-v97-singh19a,\n title = \t {Non-Parametric Priors For Generative Adversarial Networks},\n author = {Singh, Rajhans and Turaga, Pavan and Jayasuriya, Suren and Garg, Ravi and Braun, Martin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5838--5847},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/singh19a/singh19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/singh19a.html},\n abstract = \t {The advent of generative adversarial networks (GAN) has enabled new capabilities in synthesis, interpolation, and data augmentation heretofore considered very challenging. However, one of the common assumptions in most GAN architectures is the assumption of simple parametric latent-space distributions. While easy to implement, a simple latent-space distribution can be problematic for uses such as interpolation. This is due to distributional mismatches when samples are interpolated in the latent space. We present a straightforward formalization of this problem; using basic results from probability theory and off-the-shelf-optimization tools, we develop ways to arrive at appropriate non-parametric priors. The obtained prior exhibits unusual qualitative properties in terms of its shape, and quantitative benefits in terms of lower divergence with its mid-point distribution. We demonstrate that our designed prior helps improve image generation along any Euclidean straight line during interpolation, both qualitatively and quantitatively, without any additional training or architectural modifications. 
The proposed formulation is quite flexible, paving the way to impose newer constraints on the latent-space statistics.}\n}", "pdf": "http://proceedings.mlr.press/v97/singh19a/singh19a.pdf", "supp": "", "pdf_size": 1719611, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13041432329171059911&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "School of Electrical, Computer, and Energy Engineering, Arizona State University, Tempe, AZ, USA+School of Arts, Media and Engineering, Arizona State University, Tempe, AZ, USA; School of Electrical, Computer, and Energy Engineering, Arizona State University, Tempe, AZ, USA+School of Arts, Media and Engineering, Arizona State University, Tempe, AZ, USA; School of Electrical, Computer, and Energy Engineering, Arizona State University, Tempe, AZ, USA+School of Arts, Media and Engineering, Arizona State University, Tempe, AZ, USA; Intel Corporation, Chandler, AZ, USA; Intel Corporation, Chandler, AZ, USA", "aff_domain": "asu.edu;asu.edu;asu.edu;intel.com;intel.com", "email": "asu.edu;asu.edu;asu.edu;intel.com;intel.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/singh19a.html", "aff_unique_index": "0+0;0+0;0+0;1;1", "aff_unique_norm": "Arizona State University;Intel", "aff_unique_dep": "School of Electrical, Computer, and Energy Engineering;Intel Corporation", "aff_unique_url": "https://www.asu.edu;https://www.intel.com", "aff_unique_abbr": "ASU;Intel", "aff_campus_unique_index": "0+0;0+0;0+0;1;1", "aff_campus_unique": "Tempe;Chandler", "aff_country_unique_index": "0+0;0+0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Non-monotone Submodular Maximization with Nearly Optimal Adaptivity and Query Complexity", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3890", "id": "3890", "author_site": "Matthew Fahrbach, Vahab Mirrokni, Morteza Zadimoghaddam", "author": "Matthew Fahrbach; Vahab Mirrokni; Morteza Zadimoghaddam", "abstract": "Submodular maximization is a general optimization problem with a wide range of applications in machine learning (e.g., active learning, clustering, and feature selection). In large-scale optimization, the parallel running time of an algorithm is governed by its adaptivity, which measures the number of sequential rounds needed if the algorithm can execute polynomially-many independent oracle queries in parallel. While low adaptivity is ideal, it is not sufficient for an algorithm to be efficient in practice\u2014there are many applications of distributed submodular optimization where the number of function evaluations becomes prohibitively expensive. Motivated by these applications, we study the adaptivity and query complexity of submodular maximization. In this paper, we give the first constant-factor approximation algorithm for maximizing a non-monotone submodular function subject to a cardinality constraint $k$ that runs in $O(\\log(n))$ adaptive rounds and makes $O(n \\log(k))$ oracle queries in expectation. In our empirical study, we use three real-world applications to compare our algorithm with several benchmarks for non-monotone submodular maximization. 
The results demonstrate that our algorithm finds competitive solutions using significantly fewer rounds and queries.", "bibtex": "@InProceedings{pmlr-v97-fahrbach19a,\n title = \t {Non-monotone Submodular Maximization with Nearly Optimal Adaptivity and Query Complexity},\n author = {Fahrbach, Matthew and Mirrokni, Vahab and Zadimoghaddam, Morteza},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1833--1842},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/fahrbach19a/fahrbach19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/fahrbach19a.html},\n abstract = \t {Submodular maximization is a general optimization problem with a wide range of applications in machine learning (e.g., active learning, clustering, and feature selection). In large-scale optimization, the parallel running time of an algorithm is governed by its adaptivity, which measures the number of sequential rounds needed if the algorithm can execute polynomially-many independent oracle queries in parallel. While low adaptivity is ideal, it is not sufficient for an algorithm to be efficient in practice\u2014there are many applications of distributed submodular optimization where the number of function evaluations becomes prohibitively expensive. Motivated by these applications, we study the adaptivity and query complexity of submodular maximization. In this paper, we give the first constant-factor approximation algorithm for maximizing a non-monotone submodular function subject to a cardinality constraint $k$ that runs in $O(\\log(n))$ adaptive rounds and makes $O(n \\log(k))$ oracle queries in expectation. In our empirical study, we use three real-world applications to compare our algorithm with several benchmarks for non-monotone submodular maximization. The results demonstrate that our algorithm finds competitive solutions using significantly fewer rounds and queries.}\n}", "pdf": "http://proceedings.mlr.press/v97/fahrbach19a/fahrbach19a.pdf", "supp": "", "pdf_size": 736503, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10299740737215647752&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Georgia Institute of Technology; Google; Google", "aff_domain": "gatech.edu; ; ", "email": "gatech.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/fahrbach19a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Georgia Institute of Technology;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.gatech.edu;https://www.google.com", "aff_unique_abbr": "Georgia Tech;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Nonconvex Variance Reduced Optimization with Arbitrary Sampling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3831", "id": "3831", "author_site": "Samuel Horvath, Peter Richtarik", "author": "Samuel Horv\u00e1th; Peter Richtarik", "abstract": "We provide the first importance sampling variants of variance reduced algorithms for empirical risk minimization with non-convex loss functions. In particular, we analyze non-convex versions of \\texttt{SVRG}, \\texttt{SAGA} and \\texttt{SARAH}. 
Our methods have the capacity to speed up the training process by an order of magnitude compared to the state of the art on real datasets. Moreover, we also improve upon current mini-batch analysis of these methods by proposing importance sampling for minibatches in this setting. Surprisingly, our approach can in some regimes lead to superlinear speedup with respect to the minibatch size, which is not usually present in stochastic optimization. All the above results follow from a general analysis of the methods which works with", "bibtex": "@InProceedings{pmlr-v97-horvath19a,\n title = \t {Nonconvex Variance Reduced Optimization with Arbitrary Sampling},\n author = {Horv{\\'a}th, Samuel and Richtarik, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2781--2789},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/horvath19a/horvath19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/horvath19a.html},\n abstract = \t {We provide the first importance sampling variants of variance reduced algorithms for empirical risk minimization with non-convex loss functions. In particular, we analyze non-convex versions of \\texttt{SVRG}, \\texttt{SAGA} and \\texttt{SARAH}. Our methods have the capacity to speed up the training process by an order of magnitude compared to the state of the art on real datasets. Moreover, we also improve upon current mini-batch analysis of these methods by proposing importance sampling for minibatches in this setting. Surprisingly, our approach can in some regimes lead to superlinear speedup with respect to the minibatch size, which is not usually present in stochastic optimization. 
All the above results follow from a general analysis of the methods which works with", "pdf": "http://proceedings.mlr.press/v97/horvath19a/horvath19a.pdf", "supp": "", "pdf_size": 947959, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15716281759462040381&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "King Abdullah University of Science and Technology, Saudi Arabia + Moscow Institute of Physics and Technology, Russia + University of Edinburgh, United Kingdom; King Abdullah University of Science and Technology, Saudi Arabia + Moscow Institute of Physics and Technology, Russia + University of Edinburgh, United Kingdom", "aff_domain": "kaust.edu.sa;kaust.edu.sa", "email": "kaust.edu.sa;kaust.edu.sa", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/horvath19a.html", "aff_unique_index": "0+1+2;0+1+2", "aff_unique_norm": "King Abdullah University of Science and Technology;Moscow Institute of Physics and Technology;University of Edinburgh", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kast.kau.edu.sa;https://www.mipt.ru/en;https://www.ed.ac.uk", "aff_unique_abbr": "KAUST;MIPT;Edinburgh", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+1+2;0+1+2", "aff_country_unique": "Saudi Arabia;Russian Federation;United Kingdom" }, { "title": "Nonlinear Distributional Gradient Temporal-Difference Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3628", "id": "3628", "author_site": "chao qu, Shie Mannor, Huan Xu", "author": "Chao Qu; Shie Mannor; Huan Xu", "abstract": "We devise a distributional variant of gradient temporal-difference (TD) learning. Distributional reinforcement learning has been demonstrated to outperform the regular one in the recent study \\citep{bellemare2017distributional}. In the policy evaluation setting, we design two new algorithms called distributional GTD2 and distributional TDC using the Cram{\u00e9}r distance on the distributional version of the Bellman error objective function, which inherits advantages of both the nonlinear gradient TD algorithms and the distributional RL approach. In the control setting, we propose the distributional Greedy-GQ using similar derivation. We prove the asymptotic almost-sure convergence of distributional GTD2 and TDC to a local optimal solution for general smooth function approximators, which includes neural networks that have been widely used in recent study to solve the real-life RL problems. In each step, the computational complexity of above three algorithms is linear w.r.t. the number of the parameters of the function approximator, thus can be implemented efficiently for neural networks.", "bibtex": "@InProceedings{pmlr-v97-qu19b,\n title = \t {Nonlinear Distributional Gradient Temporal-Difference Learning},\n author = {Qu, Chao and Mannor, Shie and Xu, Huan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5251--5260},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/qu19b/qu19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/qu19b.html},\n abstract = \t {We devise a distributional variant of gradient temporal-difference (TD) learning. 
Distributional reinforcement learning has been demonstrated to outperform the regular one in the recent study \\citep{bellemare2017distributional}. In the policy evaluation setting, we design two new algorithms called distributional GTD2 and distributional TDC using the Cram{\u00e9}r distance on the distributional version of the Bellman error objective function, which inherits advantages of both the nonlinear gradient TD algorithms and the distributional RL approach. In the control setting, we propose the distributional Greedy-GQ using similar derivation. We prove the asymptotic almost-sure convergence of distributional GTD2 and TDC to a local optimal solution for general smooth function approximators, which includes neural networks that have been widely used in recent study to solve the real-life RL problems. In each step, the computational complexity of above three algorithms is linear w.r.t. the number of the parameters of the function approximator, thus can be implemented efficiently for neural networks.}\n}", "pdf": "http://proceedings.mlr.press/v97/qu19b/qu19b.pdf", "supp": "", "pdf_size": 1422494, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15475928435894267767&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Ant Financial Services Group, Hang Zhou, China; Faculty of Electrical Engineering, Technion, Haifa, Israel; Alibaba Group, Seattle, USA + H. Milton Stewart School of Industrial and Systems Engineering, Georgia Tech, Atlanta, USA", "aff_domain": "antfin.com; ; ", "email": "antfin.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/qu19b.html", "aff_unique_index": "0;1;2+3", "aff_unique_norm": "Ant Financial Services Group;Technion;Alibaba Group;Georgia Tech", "aff_unique_dep": ";Faculty of Electrical Engineering;;H. Milton Stewart School of Industrial and Systems Engineering", "aff_unique_url": "https://www.antgroup.com;https://www.technion.ac.il;https://www.alibaba.com;https://www.gatech.edu", "aff_unique_abbr": "Ant Group;Technion;Alibaba;GT", "aff_campus_unique_index": "0;1;2+3", "aff_campus_unique": "Hang Zhou;Haifa;Seattle;Atlanta", "aff_country_unique_index": "0;1;2+2", "aff_country_unique": "China;Israel;United States" }, { "title": "Nonlinear Stein Variational Gradient Descent for Learning Diversified Mixture Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3886", "id": "3886", "author_site": "Dilin Wang, Qiang Liu", "author": "Dilin Wang; Qiang Liu", "abstract": "Diversification has been shown to be a powerful mechanism for learning robust models in non-convex settings. A notable example is learning mixture models, in which enforcing diversity between the different mixture components allows us to prevent the model collapsing phenomenon and capture more patterns from the observed data. In this work, we present a variational approach for diversity-promoting learning, which leverages the entropy functional as a natural mechanism for enforcing diversity. We develop a simple and efficient functional gradient-based algorithm for optimizing the variational objective function, which provides a significant generalization of Stein variational gradient descent (SVGD). We test our method on various challenging real world problems, including deep embedded clustering and deep anomaly detection. 
Empirical results show that our method provides an effective mechanism for diversity-promoting learning, achieving substantial improvement over existing methods.", "bibtex": "@InProceedings{pmlr-v97-wang19h,\n title = \t {Nonlinear Stein Variational Gradient Descent for Learning Diversified Mixture Models},\n author = {Wang, Dilin and Liu, Qiang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6576--6585},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19h/wang19h.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19h.html},\n abstract = \t {Diversification has been shown to be a powerful mechanism for learning robust models in non-convex settings. A notable example is learning mixture models, in which enforcing diversity between the different mixture components allows us to prevent the model collapsing phenomenon and capture more patterns from the observed data. In this work, we present a variational approach for diversity-promoting learning, which leverages the entropy functional as a natural mechanism for enforcing diversity. We develop a simple and efficient functional gradient-based algorithm for optimizing the variational objective function, which provides a significant generalization of Stein variational gradient descent (SVGD). We test our method on various challenging real world problems, including deep embedded clustering and deep anomaly detection. Empirical results show that our method provides an effective mechanism for diversity-promoting learning, achieving substantial improvement over existing methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19h/wang19h.pdf", "supp": "", "pdf_size": 2410001, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6803406910300162422&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, UT Austin; Department of Computer Science, UT Austin", "aff_domain": "cs.utexas.edu;cs.utexas.edu", "email": "cs.utexas.edu;cs.utexas.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/wang19h.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Nonparametric Bayesian Deep Networks with Local Competition", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3879", "id": "3879", "author_site": "Konstantinos Panousis, Sotirios Chatzis, Sergios Theodoridis", "author": "Konstantinos Panousis; Sotirios Chatzis; Sergios Theodoridis", "abstract": "The aim of this work is to enable inference of deep networks that retain high accuracy for the least possible model complexity, with the latter deduced from the data during inference. To this end, we revisit deep networks that comprise competing linear units, as opposed to nonlinear units that do not entail any form of (local) competition. In this context, our main technical innovation consists in an inferential setup that leverages solid arguments from Bayesian nonparametrics. 
We infer both the needed set of connections or locally competing sets of units, as well as the required floating-point precision for storing the network parameters. Specifically, we introduce auxiliary discrete latent variables representing which initial network components are actually needed for modeling the data at hand, and perform Bayesian inference over them by imposing appropriate stick-breaking priors. As we experimentally show using benchmark datasets, our approach yields networks with less computational footprint than the state-of-the-art, and with no compromises in predictive accuracy.", "bibtex": "@InProceedings{pmlr-v97-panousis19a,\n title = \t {Nonparametric {B}ayesian Deep Networks with Local Competition},\n author = {Panousis, Konstantinos and Chatzis, Sotirios and Theodoridis, Sergios},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4980--4988},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/panousis19a/panousis19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/panousis19a.html},\n abstract = \t {The aim of this work is to enable inference of deep networks that retain high accuracy for the least possible model complexity, with the latter deduced from the data during inference. To this end, we revisit deep networks that comprise competing linear units, as opposed to nonlinear units that do not entail any form of (local) competition. In this context, our main technical innovation consists in an inferential setup that leverages solid arguments from Bayesian nonparametrics. We infer both the needed set of connections or locally competing sets of units, as well as the required floating-point precision for storing the network parameters. Specifically, we introduce auxiliary discrete latent variables representing which initial network components are actually needed for modeling the data at hand, and perform Bayesian inference over them by imposing appropriate stick-breaking priors. As we experimentally show using benchmark datasets, our approach yields networks with less computational footprint than the state-of-the-art, and with no compromises in predictive accuracy.}\n}", "pdf": "http://proceedings.mlr.press/v97/panousis19a/panousis19a.pdf", "supp": "", "pdf_size": 499236, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6949349876007421452&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Dept. of Informatics & Telecommunications, National and Kapodistrian University of Athens, Greece+The Chinese University of Hong Kong, Shenzen, China; Dept. of Electrical Eng., Computer Eng., and Informatics, Cyprus University of Technology, Limassol, Cyprus+The Chinese University of Hong Kong, Shenzen, China; The Chinese University of Hong Kong, Shenzen, China", "aff_domain": "di.uoa.gr;cut.ac.cy; ", "email": "di.uoa.gr;cut.ac.cy; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/panousis19a.html", "aff_unique_index": "0+1;2+1;1", "aff_unique_norm": "National and Kapodistrian University of Athens;Chinese University of Hong Kong;Cyprus University of Technology", "aff_unique_dep": "Dept. of Informatics & Telecommunications;;Dept. 
of Electrical Eng., Computer Eng., and Informatics", "aff_unique_url": "https://www.uoa.gr;https://www.cuhk.edu.cn;https://www.cut.ac.cy", "aff_unique_abbr": "NKUA;CUHK;CUT", "aff_campus_unique_index": "1;2+1;1", "aff_campus_unique": ";Shenzhen;Limassol", "aff_country_unique_index": "0+1;2+1;1", "aff_country_unique": "Greece;China;Cyprus" }, { "title": "Obtaining Fairness using Optimal Transport Theory", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4010", "id": "4010", "author_site": "Paula Gordaliza, Eustasio del Barrio, Gamboa Fabrice, Loubes Jean-Michel", "author": "Paula Gordaliza; Eustasio Del Barrio; Gamboa Fabrice; Jean-Michel Loubes", "abstract": "In the fair classification setup, we recast the links between fairness and predictability in terms of probability metrics. We analyze repair methods based on mapping conditional distributions to the Wasserstein barycenter. We propose a Random Repair which yields a tradeoff between minimal information loss and a certain amount of fairness.", "bibtex": "@InProceedings{pmlr-v97-gordaliza19a,\n title = \t {Obtaining Fairness using Optimal Transport Theory},\n author = {Gordaliza, Paula and Barrio, Eustasio Del and Fabrice, Gamboa and Loubes, Jean-Michel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2357--2365},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gordaliza19a/gordaliza19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gordaliza19a.html},\n abstract = \t {In the fair classification setup, we recast the links between fairness and predictability in terms of probability metrics. We analyze repair methods based on mapping conditional distributions to the Wasserstein barycenter. 
We propose a Random Repair which yields a tradeoff between minimal information loss and a certain amount of fairness.}\n}", "pdf": "http://proceedings.mlr.press/v97/gordaliza19a/gordaliza19a.pdf", "supp": "", "pdf_size": 584901, "gs_citation": 225, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8606438642485055916&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "aff": "IMUV A, Universidad de Valladolid, Valladolid, Spain; Institut de Math\u00e9matiques de Toulouse, Universit\u00e9 Paul Sabatier, Toulouse, France; Institut de Math\u00e9matiques de Toulouse, Universit\u00e9 Paul Sabatier, Toulouse, France; Institut de Math\u00e9matiques de Toulouse, Universit\u00e9 Paul Sabatier, Toulouse, France", "aff_domain": "math.univ-toulouse.fr; ; ; ", "email": "math.univ-toulouse.fr; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/gordaliza19a.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Universidad de Valladolid;Universit\u00e9 Paul Sabatier", "aff_unique_dep": "IMUV A;Institut de Math\u00e9matiques de Toulouse", "aff_unique_url": "https://www.uv.es;https://www.univ-toulouse.fr", "aff_unique_abbr": ";UPS", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Valladolid;Toulouse", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Spain;France" }, { "title": "Off-Policy Deep Reinforcement Learning without Exploration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3655", "id": "3655", "author_site": "Scott Fujimoto, David Meger, Doina Precup", "author": "Scott Fujimoto; David Meger; Doina Precup", "abstract": "Many practical applications of reinforcement learning constrain agents to learn from a fixed batch of data which has already been gathered, without offering further possibility for data collection. In this paper, we demonstrate that due to errors introduced by extrapolation, standard off-policy deep reinforcement learning algorithms, such as DQN and DDPG, are incapable of learning with data uncorrelated to the distribution under the current policy, making them ineffective for this fixed batch setting. We introduce a novel class of off-policy algorithms, batch-constrained reinforcement learning, which restricts the action space in order to force the agent towards behaving close to on-policy with respect to a subset of the given data. We present the first continuous control deep reinforcement learning algorithm which can learn effectively from arbitrary, fixed batch data, and empirically demonstrate the quality of its behavior in several tasks.", "bibtex": "@InProceedings{pmlr-v97-fujimoto19a,\n title = \t {Off-Policy Deep Reinforcement Learning without Exploration},\n author = {Fujimoto, Scott and Meger, David and Precup, Doina},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2052--2062},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/fujimoto19a/fujimoto19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/fujimoto19a.html},\n abstract = \t {Many practical applications of reinforcement learning constrain agents to learn from a fixed batch of data which has already been gathered, without offering further possibility for data collection. 
In this paper, we demonstrate that due to errors introduced by extrapolation, standard off-policy deep reinforcement learning algorithms, such as DQN and DDPG, are incapable of learning with data uncorrelated to the distribution under the current policy, making them ineffective for this fixed batch setting. We introduce a novel class of off-policy algorithms, batch-constrained reinforcement learning, which restricts the action space in order to force the agent towards behaving close to on-policy with respect to a subset of the given data. We present the first continuous control deep reinforcement learning algorithm which can learn effectively from arbitrary, fixed batch data, and empirically demonstrate the quality of its behavior in several tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/fujimoto19a/fujimoto19a.pdf", "supp": "", "pdf_size": 922439, "gs_citation": 1958, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13735420516544008547&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, McGill University, Montreal, Canada+Mila Qu\u00e9bec AI Institute; Department of Computer Science, McGill University, Montreal, Canada+Mila Qu\u00e9bec AI Institute; Department of Computer Science, McGill University, Montreal, Canada+Mila Qu\u00e9bec AI Institute", "aff_domain": "mail.mcgill.ca; ; ", "email": "mail.mcgill.ca; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/fujimoto19a.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "McGill University;Mila Qu\u00e9bec AI Institute", "aff_unique_dep": "Department of Computer Science;AI Institute", "aff_unique_url": "https://www.mcgill.ca;https://mila.quebec", "aff_unique_abbr": "McGill;Mila", "aff_campus_unique_index": "0+1;0+1;0+1", "aff_campus_unique": "Montreal;Quebec", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "Canada" }, { "title": "On Certifying Non-Uniform Bounds against Adversarial Attacks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4033", "id": "4033", "author_site": "Chen Liu, Ryota Tomioka, Volkan Cevher", "author": "Chen Liu; Ryota Tomioka; Volkan Cevher", "abstract": "This work studies the robustness certification problem of neural network models, which aims to find certified adversary-free regions as large as possible around data points. In contrast to the existing approaches that seek regions bounded uniformly along all input features, we consider non-uniform bounds and use it to study the decision boundary of neural network models. We formulate our target as an optimization problem with nonlinear constraints. Then, a framework applicable for general feedforward neural networks is proposed to bound the output logits so that the relaxed problem can be solved by the augmented Lagrangian method. Our experiments show the non-uniform bounds have larger volumes than uniform ones. Compared with normal models, the robust models have even larger non-uniform bounds and better interpretability. 
Further, the geometric similarity of the non-uniform bounds gives a quantitative, data-agnostic metric of input features\u2019 robustness.", "bibtex": "@InProceedings{pmlr-v97-liu19h,\n title = \t {On Certifying Non-Uniform Bounds against Adversarial Attacks},\n author = {Liu, Chen and Tomioka, Ryota and Cevher, Volkan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4072--4081},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liu19h/liu19h.pdf},\n url = \t {https://proceedings.mlr.press/v97/liu19h.html},\n abstract = \t {This work studies the robustness certification problem of neural network models, which aims to find certified adversary-free regions as large as possible around data points. In contrast to the existing approaches that seek regions bounded uniformly along all input features, we consider non-uniform bounds and use it to study the decision boundary of neural network models. We formulate our target as an optimization problem with nonlinear constraints. Then, a framework applicable for general feedforward neural networks is proposed to bound the output logits so that the relaxed problem can be solved by the augmented Lagrangian method. Our experiments show the non-uniform bounds have larger volumes than uniform ones. Compared with normal models, the robust models have even larger non-uniform bounds and better interpretability. Further, the geometric similarity of the non-uniform bounds gives a quantitative, data-agnostic metric of input features\u2019 robustness.}\n}", "pdf": "http://proceedings.mlr.press/v97/liu19h/liu19h.pdf", "supp": "", "pdf_size": 3851746, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9253056850581885161&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "aff": "EPFL, Lausanne, Switzerland; Microsoft Research, Cambridge, UK; EPFL, Lausanne, Switzerland", "aff_domain": "epfl.ch; ; ", "email": "epfl.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/liu19h.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "EPFL;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.epfl.ch;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "EPFL;MSR", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Lausanne;Cambridge", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;United Kingdom" }, { "title": "On Connected Sublevel Sets in Deep Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4246", "id": "4246", "author": "Quynh Nguyen", "abstract": "This paper shows that every sublevel set of the loss function of a class of deep over-parameterized neural nets with piecewise linear activation functions is connected and unbounded. 
This implies that the loss has no bad local valleys and all of its global minima are connected within a unique and potentially very large global valley.", "bibtex": "@InProceedings{pmlr-v97-nguyen19a,\n title = \t {On Connected Sublevel Sets in Deep Learning},\n author = {Nguyen, Quynh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4790--4799},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nguyen19a/nguyen19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nguyen19a.html},\n abstract = \t {This paper shows that every sublevel set of the loss function of a class of deep over-parameterized neural nets with piecewise linear activation functions is connected and unbounded. This implies that the loss has no bad local valleys and all of its global minima are connected within a unique and potentially very large global valley.}\n}", "pdf": "http://proceedings.mlr.press/v97/nguyen19a/nguyen19a.pdf", "supp": "", "pdf_size": 378623, "gs_citation": 121, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11022075245998027693&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Mathematics and Computer Science, Saarland University, Germany", "aff_domain": "cs.uni-saarland.de", "email": "cs.uni-saarland.de", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/nguyen19a.html", "aff_unique_index": "0", "aff_unique_norm": "Saarland University", "aff_unique_dep": "Department of Mathematics and Computer Science", "aff_unique_url": "https://www.uni-saarland.de", "aff_unique_abbr": "Saarland U", "aff_country_unique_index": "0", "aff_country_unique": "Germany" }, { "title": "On Dropout and Nuclear Norm Regularization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4173", "id": "4173", "author_site": "Poorya Mianjy, Raman Arora", "author": "Poorya Mianjy; Raman Arora", "abstract": "We give a formal and complete characterization of the explicit regularizer induced by dropout in deep linear networks with squared loss. We show that (a) the explicit regularizer is composed of an $\\ell_2$-path regularizer and other terms that are also re-scaling invariant, (b) the convex envelope of the induced regularizer is the squared nuclear norm of the network map, and (c) for a sufficiently large dropout rate, we characterize the global optima of the dropout objective. We validate our theoretical findings with empirical results.", "bibtex": "@InProceedings{pmlr-v97-mianjy19a,\n title = \t {On Dropout and Nuclear Norm Regularization},\n author = {Mianjy, Poorya and Arora, Raman},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4575--4584},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mianjy19a/mianjy19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mianjy19a.html},\n abstract = \t {We give a formal and complete characterization of the explicit regularizer induced by dropout in deep linear networks with squared loss. 
We show that (a) the explicit regularizer is composed of an $\\ell_2$-path regularizer and other terms that are also re-scaling invariant, (b) the convex envelope of the induced regularizer is the squared nuclear norm of the network map, and (c) for a sufficiently large dropout rate, we characterize the global optima of the dropout objective. We validate our theoretical findings with empirical results.}\n}", "pdf": "http://proceedings.mlr.press/v97/mianjy19a/mianjy19a.pdf", "supp": "", "pdf_size": 1814117, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2540515501706995243&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Johns Hopkins University, Baltimore, MD, USA; Department of Computer Science, Johns Hopkins University, Baltimore, MD, USA", "aff_domain": "jhu.edu;cs.jhu.edu", "email": "jhu.edu;cs.jhu.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/mianjy19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Baltimore", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On Efficient Optimal Transport: An Analysis of Greedy and Accelerated Mirror Descent Algorithms", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3800", "id": "3800", "author_site": "Tianyi Lin, Nhat Ho, Michael Jordan", "author": "Tianyi Lin; Nhat Ho; Michael Jordan", "abstract": "We provide theoretical analyses for two algorithms that solve the regularized optimal transport (OT) problem between two discrete probability measures with at most $n$ atoms. We show that a greedy variant of the classical Sinkhorn algorithm, known as the", "bibtex": "@InProceedings{pmlr-v97-lin19a,\n title = \t {On Efficient Optimal Transport: An Analysis of Greedy and Accelerated Mirror Descent Algorithms},\n author = {Lin, Tianyi and Ho, Nhat and Jordan, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3982--3991},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lin19a/lin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lin19a.html},\n abstract = \t {We provide theoretical analyses for two algorithms that solve the regularized optimal transport (OT) problem between two discrete probability measures with at most $n$ atoms. 
We show that a greedy variant of the classical Sinkhorn algorithm, known as the", "pdf": "http://proceedings.mlr.press/v97/lin19a/lin19a.pdf", "supp": "", "pdf_size": 620949, "gs_citation": 160, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1215257994264756360&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of IEOR, University of California, Berkeley; Department of EECS, University of California, Berkeley; Department of Statistics and EECS, University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu", "email": "berkeley.edu;berkeley.edu;berkeley.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/lin19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Industrial Engineering and Operations Research", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "On Learning Invariant Representations for Domain Adaptation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4264", "id": "4264", "author_site": "Han Zhao, Remi Tachet des Combes, Kun Zhang, Geoff Gordon", "author": "Han Zhao; Remi Tachet Des Combes; Kun Zhang; Geoffrey Gordon", "abstract": "Due to the ability of deep neural nets to learn rich representations, recent advances in unsupervised domain adaptation have focused on learning domain-invariant features that achieve a small error on the source domain. The hope is that the learnt representation, together with the hypothesis learnt from the source domain, can generalize to the target domain. In this paper, we first construct a simple counterexample showing that, contrary to common belief, the above conditions are not sufficient to guarantee successful domain adaptation. In particular, the counterexample exhibits", "bibtex": "@InProceedings{pmlr-v97-zhao19a,\n title = \t {On Learning Invariant Representations for Domain Adaptation},\n author = {Zhao, Han and Combes, Remi Tachet Des and Zhang, Kun and Gordon, Geoffrey},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7523--7532},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhao19a/zhao19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhao19a.html},\n abstract = \t {Due to the ability of deep neural nets to learn rich representations, recent advances in unsupervised domain adaptation have focused on learning domain-invariant features that achieve a small error on the source domain. The hope is that the learnt representation, together with the hypothesis learnt from the source domain, can generalize to the target domain. In this paper, we first construct a simple counterexample showing that, contrary to common belief, the above conditions are not sufficient to guarantee successful domain adaptation. 
In particular, the counterexample exhibits", "pdf": "http://proceedings.mlr.press/v97/zhao19a/zhao19a.pdf", "supp": "", "pdf_size": 728316, "gs_citation": 769, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8836462837379340833&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Machine Learning Department, Carnegie Mellon University, Pittsburgh, PA, USA+Microsoft Research, Montreal, Canada; Microsoft Research, Montreal, Canada; Machine Learning Department, Carnegie Mellon University, Pittsburgh, PA, USA; Machine Learning Department, Carnegie Mellon University, Pittsburgh, PA, USA+Microsoft Research, Montreal, Canada", "aff_domain": "cs.cmu.edu; ; ; ", "email": "cs.cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/zhao19a.html", "aff_unique_index": "0+1;1;0;0+1", "aff_unique_norm": "Carnegie Mellon University;Microsoft", "aff_unique_dep": "Machine Learning Department;Microsoft Research", "aff_unique_url": "https://www.cmu.edu;https://www.microsoft.com/en-us/research/group/montreal", "aff_unique_abbr": "CMU;MSR", "aff_campus_unique_index": "0+1;1;0;0+1", "aff_campus_unique": "Pittsburgh;Montreal", "aff_country_unique_index": "0+1;1;0;0+1", "aff_country_unique": "United States;Canada" }, { "title": "On Medians of (Randomized) Pairwise Means", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3669", "id": "3669", "author_site": "Stephan Clemencon, Pierre Laforgue, Patrice Bertail", "author": "Pierre Laforgue; Stephan Clemencon; Patrice Bertail", "abstract": "Tournament procedures, recently introduced in the literature, offer an appealing alternative, from a theoretical perspective at least, to the principle of Empirical Risk Minimization in machine learning. Statistical learning by Median-of-Means (MoM) basically consists in segmenting the training data into blocks of equal size and comparing the statistical performance of every pair of candidate decision rules on each data block: that with highest performance on the majority of the blocks is declared as the winner. In the context of nonparametric regression, functions having won all their duels have been shown to outperform empirical risk minimizers w.r.t. the mean squared error under minimal assumptions, while exhibiting robustness properties. It is the purpose of this paper to extend this approach, in order to address other learning problems in particular, for which the performance criterion takes the form of an expectation over pairs of observations rather than over one single observation, as may be the case in pairwise ranking, clustering or metric learning. Precisely, it is proved here that the bounds achieved by MoM are essentially conserved when the blocks are built by means of independent sampling without replacement schemes instead of a simple segmentation. These results are next extended to situations where the risk is related to a pairwise loss function and its empirical counterpart is of the form of a $U$-statistic. 
Beyond theoretical results guaranteeing the performance of the learning/estimation methods proposed, some numerical experiments provide empirical evidence of their relevance in practice.", "bibtex": "@InProceedings{pmlr-v97-clemencon19a,\n title = \t {On Medians of ({R}andomized) Pairwise Means},\n author = {Laforgue, Pierre and Clemencon, Stephan and Bertail, Patrice},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1272--1281},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/clemencon19a/clemencon19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/clemencon19a.html},\n abstract = \t {Tournament procedures, recently introduced in the literature, offer an appealing alternative, from a theoretical perspective at least, to the principle of Empirical Risk Minimization in machine learning. Statistical learning by Median-of-Means (MoM) basically consists in segmenting the training data into blocks of equal size and comparing the statistical performance of every pair of candidate decision rules on each data block: that with highest performance on the majority of the blocks is declared as the winner. In the context of nonparametric regression, functions having won all their duels have been shown to outperform empirical risk minimizers w.r.t. the mean squared error under minimal assumptions, while exhibiting robustness properties. It is the purpose of this paper to extend this approach, in order to address other learning problems in particular, for which the performance criterion takes the form of an expectation over pairs of observations rather than over one single observation, as may be the case in pairwise ranking, clustering or metric learning. Precisely, it is proved here that the bounds achieved by MoM are essentially conserved when the blocks are built by means of independent sampling without replacement schemes instead of a simple segmentation. These results are next extended to situations where the risk is related to a pairwise loss function and its empirical counterpart is of the form of a $U$-statistic. 
Beyond theoretical results guaranteeing the performance of the learning/estimation methods proposed, some numerical experiments provide empirical evidence of their relevance in practice.}\n}", "pdf": "http://proceedings.mlr.press/v97/clemencon19a/clemencon19a.pdf", "supp": "", "pdf_size": 390864, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=128974119753981156&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "LTCI, T\u00e9l\u00e9com Paris, Institut Polytechnique de Paris; LTCI, T\u00e9l\u00e9com Paris, Institut Polytechnique de Paris; Modal\u2019X, UPL, Universit\u00e9 Paris-Nanterre", "aff_domain": "telecom-paristech.fr; ; ", "email": "telecom-paristech.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/clemencon19a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "T\u00e9l\u00e9com Paris;Universit\u00e9 Paris-Nanterre", "aff_unique_dep": "LTCI;Modal\u2019X", "aff_unique_url": "https://www.telecom-paris.fr;https://www.univ-paris-nanterre.fr", "aff_unique_abbr": "T\u00e9l\u00e9com Paris;UPN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "On Scalable and Efficient Computation of Large Scale Optimal Transport", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3918", "id": "3918", "author_site": "Yujia Xie, Minshuo Chen, Haoming Jiang, Tuo Zhao, Hongyuan Zha", "author": "Yujia Xie; Minshuo Chen; Haoming Jiang; Tuo Zhao; Hongyuan Zha", "abstract": "Optimal Transport (OT) naturally arises in many machine learning applications, yet the heavy computational burden limits its wide-spread uses. To address the scalability issue, we propose an implicit generative learning-based framework called SPOT (Scalable Push-forward of Optimal Transport). Specifically, we approximate the optimal transport plan by a pushforward of a reference distribution, and cast the optimal transport problem into a minimax problem. We then can solve OT problems efficiently using primal dual stochastic gradient-type algorithms. We also show that we can recover the density of the optimal transport plan using neural ordinary differential equations. Numerical experiments on both synthetic and real datasets illustrate that SPOT is robust and has favorable convergence behavior. SPOT also allows us to efficiently sample from the optimal transport plan, which benefits downstream applications such as domain adaptation.", "bibtex": "@InProceedings{pmlr-v97-xie19a,\n title = \t {On Scalable and Efficient Computation of Large Scale Optimal Transport},\n author = {Xie, Yujia and Chen, Minshuo and Jiang, Haoming and Zhao, Tuo and Zha, Hongyuan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6882--6892},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/xie19a/xie19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/xie19a.html},\n abstract = \t {Optimal Transport (OT) naturally arises in many machine learning applications, yet the heavy computational burden limits its wide-spread uses. To address the scalability issue, we propose an implicit generative learning-based framework called SPOT (Scalable Push-forward of Optimal Transport). 
Specifically, we approximate the optimal transport plan by a pushforward of a reference distribution, and cast the optimal transport problem into a minimax problem. We then can solve OT problems efficiently using primal dual stochastic gradient-type algorithms. We also show that we can recover the density of the optimal transport plan using neural ordinary differential equations. Numerical experiments on both synthetic and real datasets illustrate that SPOT is robust and has favorable convergence behavior. SPOT also allows us to efficiently sample from the optimal transport plan, which benefits downstream applications such as domain adaptation.}\n}", "pdf": "http://proceedings.mlr.press/v97/xie19a/xie19a.pdf", "supp": "", "pdf_size": 5084144, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4161717912699637524&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "Georgia Tech; Georgia Tech; Georgia Tech; Georgia Tech; Georgia Tech", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;gatech.edu;cc.gatech.edu", "email": "gatech.edu;gatech.edu;gatech.edu;gatech.edu;cc.gatech.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/xie19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "On Sparse Linear Regression in the Local Differential Privacy Model", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3642", "id": "3642", "author_site": "Di Wang, Jinhui Xu", "author": "Di Wang; Jinhui Xu", "abstract": "In this paper, we study the sparse linear regression problem under the Local Differential Privacy (LDP) model. We first show that polynomial dependency on the dimensionality $p$ of the space is unavoidable for the estimation error in both non-interactive and sequential interactive local models, if the privacy of the whole dataset needs to be preserved. Similar limitations also exist for other types of error measurements and in the relaxed local models. This indicates that differential privacy in high dimensional space is unlikely achievable for the problem. With the understanding of this limitation, we then present two algorithmic results. The first one is a sequential interactive LDP algorithm for the low dimensional sparse case, called Locally Differentially Private Iterative Hard Thresholding (LDP-IHT), which achieves a near optimal upper bound. This algorithm is actually rather general and can be used to solve quite a few other problems, such as (Local) DP-ERM with sparsity constraints and sparse regression with non-linear measurements. The second one is for the restricted (high dimensional) case where only the privacy of the responses (labels) needs to be preserved. For this case, we show that the optimal rate of the error estimation can be made logarithmically depending on $p$ (i.e., $\\log p$) in the local model, where an upper bound is obtained by a label-privacy version of LDP-IHT. 
Experiments on real world and synthetic datasets confirm our theoretical analysis.", "bibtex": "@InProceedings{pmlr-v97-wang19m,\n title = \t {On Sparse Linear Regression in the Local Differential Privacy Model},\n author = {Wang, Di and Xu, Jinhui},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6628--6637},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19m/wang19m.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19m.html},\n abstract = \t {In this paper, we study the sparse linear regression problem under the Local Differential Privacy (LDP) model. We first show that polynomial dependency on the dimensionality $p$ of the space is unavoidable for the estimation error in both non-interactive and sequential interactive local models, if the privacy of the whole dataset needs to be preserved. Similar limitations also exist for other types of error measurements and in the relaxed local models. This indicates that differential privacy in high dimensional space is unlikely achievable for the problem. With the understanding of this limitation, we then present two algorithmic results. The first one is a sequential interactive LDP algorithm for the low dimensional sparse case, called Locally Differentially Private Iterative Hard Thresholding (LDP-IHT), which achieves a near optimal upper bound. This algorithm is actually rather general and can be used to solve quite a few other problems, such as (Local) DP-ERM with sparsity constraints and sparse regression with non-linear measurements. The second one is for the restricted (high dimensional) case where only the privacy of the responses (labels) needs to be preserved. For this case, we show that the optimal rate of the error estimation can be made logarithmically depending on $p$ (i.e., $\\log p$) in the local model, where an upper bound is obtained by a label-privacy version of LDP-IHT. 
Experiments on real world and synthetic datasets confirm our theoretical analysis.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19m/wang19m.pdf", "supp": "", "pdf_size": 985243, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14907476360042915352&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science and Engineering, State University of New York at Buffalo, Buffalo, USA; Department of Computer Science and Engineering, State University of New York at Buffalo, Buffalo, USA", "aff_domain": "buffalo.edu;buffalo.edu", "email": "buffalo.edu;buffalo.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/wang19m.html", "aff_unique_index": "0;0", "aff_unique_norm": "State University of New York at Buffalo", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.buffalo.edu", "aff_unique_abbr": "SUNY Buffalo", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Buffalo", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "On Symmetric Losses for Learning from Corrupted Labels", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3697", "id": "3697", "author_site": "Nontawat Charoenphakdee, Jongyeong Lee, Masashi Sugiyama", "author": "Nontawat Charoenphakdee; Jongyeong Lee; Masashi Sugiyama", "abstract": "This paper aims to provide a better understanding of a symmetric loss. First, we emphasize that using a symmetric loss is advantageous in the balanced error rate (BER) minimization and area under the receiver operating characteristic curve (AUC) maximization from corrupted labels. Second, we prove general theoretical properties of symmetric losses, including a classification-calibration condition, excess risk bound, conditional risk minimizer, and AUC-consistency condition. Third, since all nonnegative symmetric losses are non-convex, we propose a convex barrier hinge loss that benefits significantly from the symmetric condition, although it is not symmetric everywhere. Finally, we conduct experiments to validate the relevance of the symmetric condition.", "bibtex": "@InProceedings{pmlr-v97-charoenphakdee19a,\n title = \t {On Symmetric Losses for Learning from Corrupted Labels},\n author = {Charoenphakdee, Nontawat and Lee, Jongyeong and Sugiyama, Masashi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {961--970},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/charoenphakdee19a/charoenphakdee19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/charoenphakdee19a.html},\n abstract = \t {This paper aims to provide a better understanding of a symmetric loss. First, we emphasize that using a symmetric loss is advantageous in the balanced error rate (BER) minimization and area under the receiver operating characteristic curve (AUC) maximization from corrupted labels. Second, we prove general theoretical properties of symmetric losses, including a classification-calibration condition, excess risk bound, conditional risk minimizer, and AUC-consistency condition. 
Third, since all nonnegative symmetric losses are non-convex, we propose a convex barrier hinge loss that benefits significantly from the symmetric condition, although it is not symmetric everywhere. Finally, we conduct experiments to validate the relevance of the symmetric condition.}\n}", "pdf": "http://proceedings.mlr.press/v97/charoenphakdee19a/charoenphakdee19a.pdf", "supp": "", "pdf_size": 723856, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10267954479357863559&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, The University of Tokyo, Tokyo, Japan+RIKEN Center of Arti\ufb01cial Intelligence Project, Tokyo, Japan; Department of Computer Science, The University of Tokyo, Tokyo, Japan+RIKEN Center of Arti\ufb01cial Intelligence Project, Tokyo, Japan; Department of Computer Science, The University of Tokyo, Tokyo, Japan+RIKEN Center of Arti\ufb01cial Intelligence Project, Tokyo, Japan", "aff_domain": "ms.k.u-tokyo.ac.jp;ms.k.u-tokyo.ac.jp;k.u-tokyo.ac.jp", "email": "ms.k.u-tokyo.ac.jp;ms.k.u-tokyo.ac.jp;k.u-tokyo.ac.jp", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/charoenphakdee19a.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "University of Tokyo;RIKEN Center of Arti\ufb01cial Intelligence", "aff_unique_dep": "Department of Computer Science;Arti\ufb01cial Intelligence", "aff_unique_url": "https://www.u-tokyo.ac.jp;https://www.riken.jp/en/research/labs/aip/", "aff_unique_abbr": "UTokyo;RIKEN AIP", "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Tokyo", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "Japan" }, { "title": "On The Power of Curriculum Learning in Training Deep Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3720", "id": "3720", "author_site": "Guy Hacohen, Daphna Weinshall", "author": "Guy Hacohen; Daphna Weinshall", "abstract": "Training neural networks is traditionally done by providing a sequence of random mini-batches sampled uniformly from the entire training data. In this work, we analyze the effect of curriculum learning, which involves the non-uniform sampling of mini-batches, on the training of deep networks, and specifically CNNs trained for image recognition. To employ curriculum learning, the training algorithm must resolve 2 problems: (i) sort the training examples by difficulty; (ii) compute a series of mini-batches that exhibit an increasing level of difficulty. We address challenge (i) using two methods: transfer learning from some competitive \u201cteacher\" network, and bootstrapping. In our empirical evaluation, both methods show similar benefits in terms of increased learning speed and improved final performance on test data. We address challenge (ii) by investigating different pacing functions to guide the sampling. The empirical investigation includes a variety of network architectures, using images from CIFAR-10, CIFAR-100 and subsets of ImageNet. We conclude with a novel theoretical analysis of curriculum learning, where we show how it effectively modifies the optimization landscape. 
We then define the concept of an ideal curriculum, and show that under mild conditions it does not change the corresponding global minimum of the optimization function.", "bibtex": "@InProceedings{pmlr-v97-hacohen19a,\n title = \t {On The Power of Curriculum Learning in Training Deep Networks},\n author = {Hacohen, Guy and Weinshall, Daphna},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2535--2544},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hacohen19a/hacohen19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hacohen19a.html},\n abstract = \t {Training neural networks is traditionally done by providing a sequence of random mini-batches sampled uniformly from the entire training data. In this work, we analyze the effect of curriculum learning, which involves the non-uniform sampling of mini-batches, on the training of deep networks, and specifically CNNs trained for image recognition. To employ curriculum learning, the training algorithm must resolve 2 problems: (i) sort the training examples by difficulty; (ii) compute a series of mini-batches that exhibit an increasing level of difficulty. We address challenge (i) using two methods: transfer learning from some competitive \u201cteacher\" network, and bootstrapping. In our empirical evaluation, both methods show similar benefits in terms of increased learning speed and improved final performance on test data. We address challenge (ii) by investigating different pacing functions to guide the sampling. The empirical investigation includes a variety of network architectures, using images from CIFAR-10, CIFAR-100 and subsets of ImageNet. We conclude with a novel theoretical analysis of curriculum learning, where we show how it effectively modifies the optimization landscape. 
We then define the concept of an ideal curriculum, and show that under mild conditions it does not change the corresponding global minimum of the optimization function.}\n}", "pdf": "http://proceedings.mlr.press/v97/hacohen19a/hacohen19a.pdf", "supp": "", "pdf_size": 617271, "gs_citation": 558, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13645945393876441822&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Computer Science and Engineering, The Hebrew University of Jerusalem, Jerusalem 91904, Israel+Edmond and Lily Safra Center for Brain Sciences, The Hebrew University of Jerusalem, Jerusalem 91904, Israel; School of Computer Science and Engineering, The Hebrew University of Jerusalem, Jerusalem 91904, Israel", "aff_domain": "mail.huji.ac.il; ", "email": "mail.huji.ac.il; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/hacohen19a.html", "aff_unique_index": "0+0;0", "aff_unique_norm": "Hebrew University of Jerusalem", "aff_unique_dep": "School of Computer Science and Engineering", "aff_unique_url": "http://www.huji.ac.il", "aff_unique_abbr": "HUJI", "aff_campus_unique_index": "0+0;0", "aff_campus_unique": "Jerusalem", "aff_country_unique_index": "0+0;0", "aff_country_unique": "Israel" }, { "title": "On Variational Bounds of Mutual Information", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4330", "id": "4330", "author_site": "Ben Poole, Sherjil Ozair, A\u00e4ron van den Oord, Alexander Alemi, George Tucker", "author": "Ben Poole; Sherjil Ozair; Aaron Van Den Oord; Alex Alemi; George Tucker", "abstract": "Estimating and optimizing Mutual Information (MI) is core to many problems in machine learning, but bounding MI in high dimensions is challenging. To establish tractable and scalable objectives, recent work has turned to variational bounds parameterized by neural networks. However, the relationships and tradeoffs between these bounds remains unclear. In this work, we unify these recent developments in a single framework. We find that the existing variational lower bounds degrade when the MI is large, exhibiting either high bias or high variance. To address this problem, we introduce a continuum of lower bounds that encompasses previous bounds and flexibly trades off bias and variance. On high-dimensional, controlled problems, we empirically characterize the bias and variance of the bounds and their gradients and demonstrate the effectiveness of these new bounds for estimation and representation learning.", "bibtex": "@InProceedings{pmlr-v97-poole19a,\n title = \t {On Variational Bounds of Mutual Information},\n author = {Poole, Ben and Ozair, Sherjil and Van Den Oord, Aaron and Alemi, Alex and Tucker, George},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5171--5180},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/poole19a/poole19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/poole19a.html},\n abstract = \t {Estimating and optimizing Mutual Information (MI) is core to many problems in machine learning, but bounding MI in high dimensions is challenging. To establish tractable and scalable objectives, recent work has turned to variational bounds parameterized by neural networks. 
However, the relationships and tradeoffs between these bounds remains unclear. In this work, we unify these recent developments in a single framework. We find that the existing variational lower bounds degrade when the MI is large, exhibiting either high bias or high variance. To address this problem, we introduce a continuum of lower bounds that encompasses previous bounds and flexibly trades off bias and variance. On high-dimensional, controlled problems, we empirically characterize the bias and variance of the bounds and their gradients and demonstrate the effectiveness of these new bounds for estimation and representation learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/poole19a/poole19a.pdf", "supp": "", "pdf_size": 2318169, "gs_citation": 1008, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11761276849119903728&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Google Brain; Google Brain + MILA; DeepMind; Google Brain; Google Brain", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/poole19a.html", "aff_unique_index": "0;0+1;2;0;0", "aff_unique_norm": "Google;Mila;DeepMind", "aff_unique_dep": "Google Brain;;", "aff_unique_url": "https://brain.google.com;https://mila.quebec;https://deepmind.com", "aff_unique_abbr": "Google Brain;MILA;DeepMind", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0+1;2;0;0", "aff_country_unique": "United States;Canada;United Kingdom" }, { "title": "On discriminative learning of prediction uncertainty", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4098", "id": "4098", "author_site": "Vojtech Franc, Daniel Prusa", "author": "Vojtech Franc; Daniel Prusa", "abstract": "In classification with a reject option, the classifier is allowed in uncertain cases to abstain from prediction. The classical cost based model of an optimal classifier with a reject option requires the cost of rejection to be defined explicitly. An alternative bounded-improvement model, avoiding the notion of the reject cost, seeks for a classifier with a guaranteed selective risk and maximal cover. We prove that both models share the same class of optimal strategies, and we provide an explicit relation between the reject cost and the target risk being the parameters of the two models. An optimal rejection strategy for both models is based on thresholding the conditional risk defined by posterior probabilities which are usually unavailable. 
We propose a discriminative algorithm learning an uncertainty function which preserves ordering of the input space induced by the conditional risk, and hence can be used to construct optimal rejection strategies.", "bibtex": "@InProceedings{pmlr-v97-franc19a,\n title = \t {On discriminative learning of prediction uncertainty},\n author = {Franc, Vojtech and Prusa, Daniel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1963--1971},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/franc19a/franc19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/franc19a.html},\n abstract = \t {In classification with a reject option, the classifier is allowed in uncertain cases to abstain from prediction. The classical cost based model of an optimal classifier with a reject option requires the cost of rejection to be defined explicitly. An alternative bounded-improvement model, avoiding the notion of the reject cost, seeks for a classifier with a guaranteed selective risk and maximal cover. We prove that both models share the same class of optimal strategies, and we provide an explicit relation between the reject cost and the target risk being the parameters of the two models. An optimal rejection strategy for both models is based on thresholding the conditional risk defined by posterior probabilities which are usually unavailable. We propose a discriminative algorithm learning an uncertainty function which preserves ordering of the input space induced by the conditional risk, and hence can be used to construct optimal rejection strategies.}\n}", "pdf": "http://proceedings.mlr.press/v97/franc19a/franc19a.pdf", "supp": "", "pdf_size": 1561840, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11397976825557770589&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Cybernetics, Faculty of Electrical Engineering, Czech Technical University in Prague; Department of Cybernetics, Faculty of Electrical Engineering, Czech Technical University in Prague", "aff_domain": "cmp.felk.cvut.cz; ", "email": "cmp.felk.cvut.cz; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/franc19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Czech Technical University in Prague", "aff_unique_dep": "Department of Cybernetics, Faculty of Electrical Engineering", "aff_unique_url": "https://www.cvut.cz", "aff_unique_abbr": "CTU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Prague", "aff_country_unique_index": "0;0", "aff_country_unique": "Czech Republic" }, { "title": "On the Complexity of Approximating Wasserstein Barycenters", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4071", "id": "4071", "author_site": "Alexey Kroshnin, Nazarii Tupitsa, Darina Dvinskikh, Pavel Dvurechenskii, Alexander Gasnikov, Cesar Uribe", "author": "Alexey Kroshnin; Nazarii Tupitsa; Darina Dvinskikh; Pavel Dvurechensky; Alexander Gasnikov; Cesar Uribe", "abstract": "We study the complexity of approximating the Wasserstein barycenter of $m$ discrete measures, or histograms of size $n$, by contrasting two alternative approaches that use entropic regularization. 
The first approach is based on the Iterative Bregman Projections (IBP) algorithm for which our novel analysis gives a complexity bound proportional to ${mn^2}/{\\varepsilon^2}$ to approximate the original non-regularized barycenter. On the other hand, using an approach based on accelerated gradient descent, we obtain a complexity proportional to\u00a0${mn^{2}}/{\\varepsilon}$. As a byproduct, we show that the regularization parameter in both approaches has to be proportional to $\\varepsilon$, which causes instability of both algorithms when the desired accuracy is high. To overcome this issue, we propose a novel proximal-IBP algorithm, which can be seen as a proximal gradient method, which uses IBP on each iteration to make a proximal step. We also consider the question of scalability of these algorithms using approaches from distributed optimization and show that the first algorithm can be implemented in a centralized distributed setting (master/slave), while the second one is amenable to a more general decentralized distributed setting with an arbitrary network topology.", "bibtex": "@InProceedings{pmlr-v97-kroshnin19a,\n title = \t {On the Complexity of Approximating {W}asserstein Barycenters},\n author = {Kroshnin, Alexey and Tupitsa, Nazarii and Dvinskikh, Darina and Dvurechensky, Pavel and Gasnikov, Alexander and Uribe, Cesar},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3530--3540},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kroshnin19a/kroshnin19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kroshnin19a.html},\n abstract = \t {We study the complexity of approximating the Wasserstein barycenter of $m$ discrete measures, or histograms of size $n$, by contrasting two alternative approaches that use entropic regularization. The first approach is based on the Iterative Bregman Projections (IBP) algorithm for which our novel analysis gives a complexity bound proportional to ${mn^2}/{\\varepsilon^2}$ to approximate the original non-regularized barycenter. On the other hand, using an approach based on accelerated gradient descent, we obtain a complexity proportional to\u00a0${mn^{2}}/{\\varepsilon}$. As a byproduct, we show that the regularization parameter in both approaches has to be proportional to $\\varepsilon$, which causes instability of both algorithms when the desired accuracy is high. To overcome this issue, we propose a novel proximal-IBP algorithm, which can be seen as a proximal gradient method, which uses IBP on each iteration to make a proximal step. 
We also consider the question of scalability of these algorithms using approaches from distributed optimization and show that the first algorithm can be implemented in a centralized distributed setting (master/slave), while the second one is amenable to a more general decentralized distributed setting with an arbitrary network topology.}\n}", "pdf": "http://proceedings.mlr.press/v97/kroshnin19a/kroshnin19a.pdf", "supp": "", "pdf_size": 546263, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4969591329065523547&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/kroshnin19a.html" }, { "title": "On the Computation and Communication Complexity of Parallel SGD with Dynamic Batch Sizes for Stochastic Non-Convex Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3781", "id": "3781", "author_site": "Hao Yu, rong jin", "author": "Hao Yu; Rong Jin", "abstract": "For SGD based distributed stochastic optimization, computation complexity, measured by the convergence rate in terms of the number of stochastic gradient calls, and communication complexity, measured by the number of inter-node communication rounds, are two most important performance metrics. The classical data-parallel implementation of SGD over N workers can achieve linear speedup of its convergence rate but incurs an inter-node communication round at each batch. We study the benefit of using dynamically increasing batch sizes in parallel SGD for stochastic non-convex optimization by characterizing the attained convergence rate and the required number of communication rounds. We show that for stochastic non-convex optimization under the P-L condition, the classical data-parallel SGD with exponentially increasing batch sizes can achieve the fastest known $O(1/(NT))$ convergence with linear speedup using only $\\log(T)$ communication rounds. For general stochastic non-convex optimization, we propose a Catalyst-like algorithm to achieve the fastest known $O(1/\\sqrt{NT})$ convergence with only $O(\\sqrt{NT}\\log(\\frac{T}{N}))$ communication rounds.", "bibtex": "@InProceedings{pmlr-v97-yu19c,\n title = \t {On the Computation and Communication Complexity of Parallel {SGD} with Dynamic Batch Sizes for Stochastic Non-Convex Optimization},\n author = {Yu, Hao and Jin, Rong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7174--7183},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yu19c/yu19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/yu19c.html},\n abstract = \t {For SGD based distributed stochastic optimization, computation complexity, measured by the convergence rate in terms of the number of stochastic gradient calls, and communication complexity, measured by the number of inter-node communication rounds, are two most important performance metrics. The classical data-parallel implementation of SGD over N workers can achieve linear speedup of its convergence rate but incurs an inter-node communication round at each batch. 
We study the benefit of using dynamically increasing batch sizes in parallel SGD for stochastic non-convex optimization by characterizing the attained convergence rate and the required number of communication rounds. We show that for stochastic non-convex optimization under the P-L condition, the classical data-parallel SGD with exponentially increasing batch sizes can achieve the fastest known $O(1/(NT))$ convergence with linear speedup using only $\\log(T)$ communication rounds. For general stochastic non-convex optimization, we propose a Catalyst-like algorithm to achieve the fastest known $O(1/\\sqrt{NT})$ convergence with only $O(\\sqrt{NT}\\log(\\frac{T}{N}))$ communication rounds.}\n}", "pdf": "http://proceedings.mlr.press/v97/yu19c/yu19c.pdf", "supp": "", "pdf_size": 2822004, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4292586064861492856&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";", "aff_domain": ";", "email": ";", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/yu19c.html" }, { "title": "On the Connection Between Adversarial Robustness and Saliency Map Interpretability", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4176", "id": "4176", "author_site": "Christian Etmann, Sebastian Lunz, Peter Maass, Carola-Bibiane Sch\u00f6nlieb", "author": "Christian Etmann; Sebastian Lunz; Peter Maass; Carola Schoenlieb", "abstract": "Recent studies on the adversarial vulnerability of neural networks have shown that models trained to be more robust to adversarial attacks exhibit more interpretable saliency maps than their non-robust counterparts. We aim to quantify this behaviour by considering the alignment between input image and saliency map. We hypothesize that as the distance to the decision boundary grows, so does the alignment. This connection is strictly true in the case of linear models. We confirm these theoretical findings with experiments based on models trained with a local Lipschitz regularization and identify where the nonlinear nature of neural networks weakens the relation.", "bibtex": "@InProceedings{pmlr-v97-etmann19a,\n title = \t {On the Connection Between Adversarial Robustness and Saliency Map Interpretability},\n author = {Etmann, Christian and Lunz, Sebastian and Maass, Peter and Schoenlieb, Carola},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1823--1832},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/etmann19a/etmann19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/etmann19a.html},\n abstract = \t {Recent studies on the adversarial vulnerability of neural networks have shown that models trained to be more robust to adversarial attacks exhibit more interpretable saliency maps than their non-robust counterparts. We aim to quantify this behaviour by considering the alignment between input image and saliency map. We hypothesize that as the distance to the decision boundary grows, so does the alignment. This connection is strictly true in the case of linear models. 
We confirm these theoretical findings with experiments based on models trained with a local Lipschitz regularization and identify where the nonlinear nature of neural networks weakens the relation.}\n}", "pdf": "http://proceedings.mlr.press/v97/etmann19a/etmann19a.pdf", "supp": "", "pdf_size": 7311037, "gs_citation": 182, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9006157315043198858&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Center for Industrial Mathematics, University of Bremen, Bremen, Germany+DAMTP, Cambridge; DAMTP, University of Cambridge, Cambridge, United Kingdom; Center for Industrial Mathematics, University of Bremen, Bremen, Germany; DAMTP, University of Cambridge, Cambridge, United Kingdom", "aff_domain": "math.uni-bremen.de;math.cam.ac.uk; ; ", "email": "math.uni-bremen.de;math.cam.ac.uk; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/etmann19a.html", "aff_unique_index": "0+1;1;0;1", "aff_unique_norm": "University of Bremen;University of Cambridge", "aff_unique_dep": "Center for Industrial Mathematics;Department of Applied Mathematics and Theoretical Physics", "aff_unique_url": "https://www.uni-bremen.de;https://www.damtp.cam.ac.uk", "aff_unique_abbr": ";DAMTP", "aff_campus_unique_index": "0+1;1;0;1", "aff_campus_unique": "Bremen;Cambridge", "aff_country_unique_index": "0+1;1;0;1", "aff_country_unique": "Germany;United Kingdom" }, { "title": "On the Convergence and Robustness of Adversarial Training", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3680", "id": "3680", "author_site": "Yisen Wang, Xingjun Ma, James Bailey, Jinfeng Yi, Bowen Zhou, Quanquan Gu", "author": "Yisen Wang; Xingjun Ma; James Bailey; Jinfeng Yi; Bowen Zhou; Quanquan Gu", "abstract": "Improving the robustness of deep neural networks (DNNs) to adversarial examples is an important yet challenging problem for secure deep learning. Across existing defense techniques, adversarial training with Projected Gradient Descent (PGD) is amongst the most effective. Adversarial training solves a min-max optimization problem, with the", "bibtex": "@InProceedings{pmlr-v97-wang19i,\n title = \t {On the Convergence and Robustness of Adversarial Training},\n author = {Wang, Yisen and Ma, Xingjun and Bailey, James and Yi, Jinfeng and Zhou, Bowen and Gu, Quanquan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6586--6595},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19i/wang19i.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19i.html},\n abstract = \t {Improving the robustness of deep neural networks (DNNs) to adversarial examples is an important yet challenging problem for secure deep learning. Across existing defense techniques, adversarial training with Projected Gradient Descent (PGD) is amongst the most effective. 
Adversarial training solves a min-max optimization problem, with the", "pdf": "http://proceedings.mlr.press/v97/wang19i/wang19i.pdf", "supp": "", "pdf_size": 9136972, "gs_citation": 456, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16521650606363835532&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "JD.com; The University of Melbourne; The University of Melbourne; JD.com; JD.com; The University of California, Los Angeles", "aff_domain": "jd.com;unimelb.edu.au;unimelb.edu.au;jd.com;jd.com;cs.ucla.edu", "email": "jd.com;unimelb.edu.au;unimelb.edu.au;jd.com;jd.com;cs.ucla.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/wang19i.html", "aff_unique_index": "0;1;1;0;0;2", "aff_unique_norm": "JD.com;University of Melbourne;University of California, Los Angeles", "aff_unique_dep": ";;", "aff_unique_url": "https://www.jd.com;https://www.unimelb.edu.au;https://www.ucla.edu", "aff_unique_abbr": "JD;UniMelb;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;1;0;0;2", "aff_country_unique": "China;Australia;United States" }, { "title": "On the Design of Estimators for Bandit Off-Policy Evaluation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4218", "id": "4218", "author_site": "Nikos Vlassis, Aurelien Bibaut, Maria Dimakopoulou, Tony Jebara", "author": "Nikos Vlassis; Aurelien Bibaut; Maria Dimakopoulou; Tony Jebara", "abstract": "Off-policy evaluation is the problem of estimating the value of a target policy using data collected under a different policy. Given a base estimator for bandit off-policy evaluation and a parametrized class of control variates, we address the problem of computing a control variate in that class that reduces the risk of the base estimator. We derive the population risk as a function of the class parameters and we establish conditions that guarantee risk improvement. We present our main results in the context of multi-armed bandits, and we propose a simple design for contextual bandits that gives rise to an estimator that is shown to perform well in multi-class cost-sensitive classification datasets.", "bibtex": "@InProceedings{pmlr-v97-vlassis19a,\n title = \t {On the Design of Estimators for Bandit Off-Policy Evaluation},\n author = {Vlassis, Nikos and Bibaut, Aurelien and Dimakopoulou, Maria and Jebara, Tony},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6468--6476},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/vlassis19a/vlassis19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/vlassis19a.html},\n abstract = \t {Off-policy evaluation is the problem of estimating the value of a target policy using data collected under a different policy. Given a base estimator for bandit off-policy evaluation and a parametrized class of control variates, we address the problem of computing a control variate in that class that reduces the risk of the base estimator. We derive the population risk as a function of the class parameters and we establish conditions that guarantee risk improvement. 
We present our main results in the context of multi-armed bandits, and we propose a simple design for contextual bandits that gives rise to an estimator that is shown to perform well in multi-class cost-sensitive classification datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/vlassis19a/vlassis19a.pdf", "supp": "", "pdf_size": 550254, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5985529266725528586&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Netflix, Los Gatos CA, USA; Department of Biostatistics, University of California Berkeley, Berkeley, USA; Netflix, Los Gatos CA, USA; Netflix, Los Gatos CA, USA", "aff_domain": "netflix.com; ; ; ", "email": "netflix.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/vlassis19a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Netflix;University of California, Berkeley", "aff_unique_dep": ";Department of Biostatistics", "aff_unique_url": "https://www.netflix.com;https://www.berkeley.edu", "aff_unique_abbr": "Netflix;UC Berkeley", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Los Gatos;Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "On the Feasibility of Learning, Rather than Assuming, Human Biases for Reward Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4315", "id": "4315", "author_site": "Rohin Shah, Noah Gundotra, Pieter Abbeel, Anca Dragan", "author": "Rohin Shah; Noah Gundotra; Pieter Abbeel; Anca Dragan", "abstract": "Our goal is for agents to optimize the right reward function, despite how difficult it is for us to specify what that is. Inverse Reinforcement Learning (IRL) enables us to infer reward functions from demonstrations, but it usually assumes that the expert is noisily optimal. Real people, on the other hand, often have systematic biases: risk-aversion, myopia, etc. One option is to try to characterize these biases and account for them explicitly during learning. But in the era of deep learning, a natural suggestion researchers make is to avoid mathematical models of human behavior that are fraught with specific assumptions, and instead use a purely data-driven approach. We decided to put this to the test \u2013 rather than relying on assumptions about which specific bias the demonstrator has when planning, we instead learn the demonstrator\u2019s planning algorithm that they use to generate demonstrations, as a differentiable planner. Our exploration yielded mixed findings: on the one hand, learning the planner can lead to better reward inference than relying on the wrong assumption; on the other hand, this benefit is dwarfed by the loss we incur by going from an exact to a differentiable planner. This suggests that at least for the foreseeable future, agents need a middle ground between the flexibility of data-driven methods and the useful bias of known human biases. 
Code is available at https://tinyurl.com/learningbiases.", "bibtex": "@InProceedings{pmlr-v97-shah19a,\n title = \t {On the Feasibility of Learning, Rather than Assuming, Human Biases for Reward Inference},\n author = {Shah, Rohin and Gundotra, Noah and Abbeel, Pieter and Dragan, Anca},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5670--5679},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shah19a/shah19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/shah19a.html},\n abstract = \t {Our goal is for agents to optimize the right reward function, despite how difficult it is for us to specify what that is. Inverse Reinforcement Learning (IRL) enables us to infer reward functions from demonstrations, but it usually assumes that the expert is noisily optimal. Real people, on the other hand, often have systematic biases: risk-aversion, myopia, etc. One option is to try to characterize these biases and account for them explicitly during learning. But in the era of deep learning, a natural suggestion researchers make is to avoid mathematical models of human behavior that are fraught with specific assumptions, and instead use a purely data-driven approach. We decided to put this to the test \u2013 rather than relying on assumptions about which specific bias the demonstrator has when planning, we instead learn the demonstrator\u2019s planning algorithm that they use to generate demonstrations, as a differentiable planner. Our exploration yielded mixed findings: on the one hand, learning the planner can lead to better reward inference than relying on the wrong assumption; on the other hand, this benefit is dwarfed by the loss we incur by going from an exact to a differentiable planner. This suggests that at least for the foreseeable future, agents need a middle ground between the flexibility of data-driven methods and the useful bias of known human biases. 
Code is available at https://tinyurl.com/learningbiases.}\n}", "pdf": "http://proceedings.mlr.press/v97/shah19a/shah19a.pdf", "supp": "", "pdf_size": 1648552, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16060534901891863389&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "aff": "Department of Electrical Engineering and Computer Science, UC Berkeley; Department of Electrical Engineering and Computer Science, UC Berkeley; Department of Electrical Engineering and Computer Science, UC Berkeley; Department of Electrical Engineering and Computer Science, UC Berkeley", "aff_domain": "berkeley.edu; ; ; ", "email": "berkeley.edu; ; ; ", "github": "", "project": "https://tinyurl.com/learningbiases", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/shah19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Electrical Engineering and Computer Science", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "On the Generalization Gap in Reparameterizable Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4206", "id": "4206", "author_site": "Huan Wang, Stephan Zheng, Caiming Xiong, Richard Socher", "author": "Huan Wang; Stephan Zheng; Caiming Xiong; Richard Socher", "abstract": "Understanding generalization in reinforcement learning (RL) is a significant challenge, as many common assumptions of traditional supervised learning theory do not apply. We focus on the special class of reparameterizable RL problems, where the trajectory distribution can be decomposed using the reparametrization trick. For this problem class, estimating the expected return is efficient and the trajectory can be computed deterministically given peripheral random variables, which enables us to study reparametrizable RL using supervised learning and transfer learning theory. Through these relationships, we derive guarantees on the gap between the expected and empirical return for both intrinsic and external errors, based on Rademacher complexity as well as the PAC-Bayes bound. Our bound suggests the generalization capability of reparameterizable RL is related to multiple factors including \u201csmoothness\u201d of the environment transition, reward and agent policy function class. We also empirically verify the relationship between the generalization gap and these factors through simulations.", "bibtex": "@InProceedings{pmlr-v97-wang19o,\n title = \t {On the Generalization Gap in Reparameterizable Reinforcement Learning},\n author = {Wang, Huan and Zheng, Stephan and Xiong, Caiming and Socher, Richard},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6648--6658},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19o/wang19o.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19o.html},\n abstract = \t {Understanding generalization in reinforcement learning (RL) is a significant challenge, as many common assumptions of traditional supervised learning theory do not apply. 
We focus on the special class of reparameterizable RL problems, where the trajectory distribution can be decomposed using the reparametrization trick. For this problem class, estimating the expected return is efficient and the trajectory can be computed deterministically given peripheral random variables, which enables us to study reparametrizable RL using supervised learning and transfer learning theory. Through these relationships, we derive guarantees on the gap between the expected and empirical return for both intrinsic and external errors, based on Rademacher complexity as well as the PAC-Bayes bound. Our bound suggests the generalization capability of reparameterizable RL is related to multiple factors including \u201csmoothness\u201d of the environment transition, reward and agent policy function class. We also empirically verify the relationship between the generalization gap and these factors through simulations.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19o/wang19o.pdf", "supp": "", "pdf_size": 349629, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10972822240819531840&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Salesforce Research, Palo Alto CA, USA; Salesforce Research, Palo Alto CA, USA; Salesforce Research, Palo Alto CA, USA; Salesforce Research, Palo Alto CA, USA", "aff_domain": "salesforce.com; ; ; ", "email": "salesforce.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/wang19o.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Salesforce Research", "aff_unique_dep": "Research", "aff_unique_url": "https://research.salesforce.com", "aff_unique_abbr": "Salesforce", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Palo Alto", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "On the Impact of the Activation function on Deep Neural Networks Training", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3911", "id": "3911", "author_site": "Soufiane Hayou, Arnaud Doucet, Judith Rousseau", "author": "Soufiane Hayou; Arnaud Doucet; Judith Rousseau", "abstract": "The weight initialization and the activation function of deep neural networks have a crucial impact on the performance of the training procedure. An inappropriate selection can lead to the loss of information of the input during forward propagation and the exponential vanishing/exploding of gradients during back-propagation. Understanding the theoretical properties of untrained random networks is key to identifying which deep networks may be trained successfully as recently demonstrated by Samuel et al. (2017) who showed that for deep feedforward neural networks only a specific choice of hyperparameters known as the \u2018Edge of Chaos\u2019 can lead to good performance. While the work by Samuel et al. (2017) discuss trainability issues, we focus here on training acceleration and overall performance. 
We give a comprehensive theoretical analysis of the Edge of Chaos and show that we can indeed tune the initialization parameters and the activation function in order to accelerate the training and improve the performance.", "bibtex": "@InProceedings{pmlr-v97-hayou19a,\n title = \t {On the Impact of the Activation function on Deep Neural Networks Training},\n author = {Hayou, Soufiane and Doucet, Arnaud and Rousseau, Judith},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2672--2680},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hayou19a/hayou19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hayou19a.html},\n abstract = \t {The weight initialization and the activation function of deep neural networks have a crucial impact on the performance of the training procedure. An inappropriate selection can lead to the loss of information of the input during forward propagation and the exponential vanishing/exploding of gradients during back-propagation. Understanding the theoretical properties of untrained random networks is key to identifying which deep networks may be trained successfully as recently demonstrated by Samuel et al. (2017) who showed that for deep feedforward neural networks only a specific choice of hyperparameters known as the \u2018Edge of Chaos\u2019 can lead to good performance. While the work by Samuel et al. (2017) discuss trainability issues, we focus here on training acceleration and overall performance. We give a comprehensive theoretical analysis of the Edge of Chaos and show that we can indeed tune the initialization parameters and the activation function in order to accelerate the training and improve the performance.}\n}", "pdf": "http://proceedings.mlr.press/v97/hayou19a/hayou19a.pdf", "supp": "", "pdf_size": 1326691, "gs_citation": 304, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7881880287979104428&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Statistics, University of Oxford; Department of Statistics, University of Oxford; Department of Statistics, University of Oxford", "aff_domain": "stats.ox.ac.uk; ; ", "email": "stats.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/hayou19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "On the Limitations of Representing Functions on Sets", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4106", "id": "4106", "author_site": "Edward Wagstaff, Fabian Fuchs, Martin Engelcke, Ingmar Posner, Michael A Osborne", "author": "Edward Wagstaff; Fabian Fuchs; Martin Engelcke; Ingmar Posner; Michael A. Osborne", "abstract": "Recent work on the representation of functions on sets has considered the use of summation in a latent space to enforce permutation invariance. In particular, it has been conjectured that the dimension of this latent space may remain fixed as the cardinality of the sets under consideration increases. 
However, we demonstrate that the analysis leading to this conjecture requires mappings which are highly discontinuous and argue that this is only of limited practical use. Motivated by this observation, we prove that an implementation of this model via continuous mappings (as provided by e.g. neural networks or Gaussian processes) actually imposes a constraint on the dimensionality of the latent space. Practical universal function representation for set inputs can only be achieved with a latent dimension at least the size of the maximum number of input elements.", "bibtex": "@InProceedings{pmlr-v97-wagstaff19a,\n title = \t {On the Limitations of Representing Functions on Sets},\n author = {Wagstaff, Edward and Fuchs, Fabian and Engelcke, Martin and Posner, Ingmar and Osborne, Michael A.},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6487--6494},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wagstaff19a/wagstaff19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/wagstaff19a.html},\n abstract = \t {Recent work on the representation of functions on sets has considered the use of summation in a latent space to enforce permutation invariance. In particular, it has been conjectured that the dimension of this latent space may remain fixed as the cardinality of the sets under consideration increases. However, we demonstrate that the analysis leading to this conjecture requires mappings which are highly discontinuous and argue that this is only of limited practical use. Motivated by this observation, we prove that an implementation of this model via continuous mappings (as provided by e.g. neural networks or Gaussian processes) actually imposes a constraint on the dimensionality of the latent space. 
Practical universal function representation for set inputs can only be achieved with a latent dimension at least the size of the maximum number of input elements.}\n}", "pdf": "http://proceedings.mlr.press/v97/wagstaff19a/wagstaff19a.pdf", "supp": "", "pdf_size": 1262614, "gs_citation": 229, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16337798071667485621&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Engineering Science, University of Oxford; Department of Engineering Science, University of Oxford; Department of Engineering Science, University of Oxford; Department of Engineering Science, University of Oxford; Department of Engineering Science, University of Oxford", "aff_domain": "robots.ox.ac.uk;robots.ox.ac.uk;robots.ox.ac.uk; ; ", "email": "robots.ox.ac.uk;robots.ox.ac.uk;robots.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/wagstaff19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "Department of Engineering Science", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Oxford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "On the Linear Speedup Analysis of Communication Efficient Momentum SGD for Distributed Non-Convex Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3779", "id": "3779", "author_site": "Hao Yu, rong jin, Sen Yang", "author": "Hao Yu; Rong Jin; Sen Yang", "abstract": "Recent developments on large-scale distributed machine learning applications, e.g., deep neural networks, benefit enormously from the advances in distributed non-convex optimization techniques, e.g., distributed Stochastic Gradient Descent (SGD). A series of recent works study the linear speedup property of distributed SGD variants with reduced communication. The linear speedup property enables us to scale out the computing capability by adding more computing nodes into our system. The reduced communication complexity is desirable since communication overhead is often the performance bottleneck in distributed systems. Recently, momentum methods are more and more widely adopted by practitioners to train machine learning models since they can often converge faster and generalize better. However, it remains unclear whether any distributed momentum SGD possesses the same linear speedup property as distributed SGD and has reduced communication complexity. 
This paper fills the gap by considering a distributed communication efficient momentum SGD method and proving its linear speedup property.", "bibtex": "@InProceedings{pmlr-v97-yu19d,\n title = \t {On the Linear Speedup Analysis of Communication Efficient Momentum {SGD} for Distributed Non-Convex Optimization},\n author = {Yu, Hao and Jin, Rong and Yang, Sen},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7184--7193},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yu19d/yu19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/yu19d.html},\n abstract = \t {Recent developments on large-scale distributed machine learning applications, e.g., deep neural networks, benefit enormously from the advances in distributed non-convex optimization techniques, e.g., distributed Stochastic Gradient Descent (SGD). A series of recent works study the linear speedup property of distributed SGD variants with reduced communication. The linear speedup property enables us to scale out the computing capability by adding more computing nodes into our system. The reduced communication complexity is desirable since communication overhead is often the performance bottleneck in distributed systems. Recently, momentum methods are more and more widely adopted by practitioners to train machine learning models since they can often converge faster and generalize better. However, it remains unclear whether any distributed momentum SGD possesses the same linear speedup property as distributed SGD and has reduced communication complexity. This paper fills the gap by considering a distributed communication efficient momentum SGD method and proving its linear speedup property.}\n}", "pdf": "http://proceedings.mlr.press/v97/yu19d/yu19d.pdf", "supp": "", "pdf_size": 859532, "gs_citation": 452, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9719460488456131754&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Machine Intelligence Technology Lab, Alibaba Group (U.S.) Inc., Bellevue, WA; Machine Intelligence Technology Lab, Alibaba Group (U.S.) Inc., Bellevue, WA; Machine Intelligence Technology Lab, Alibaba Group (U.S.) 
Inc., Bellevue, WA", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/yu19d.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Alibaba Group", "aff_unique_dep": "Machine Intelligence Technology Lab", "aff_unique_url": "https://www.alibaba.com", "aff_unique_abbr": "Alibaba", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Bellevue", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "On the Long-term Impact of Algorithmic Decision Policies: Effort Unfairness and Feature Segregation through Social Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3998", "id": "3998", "author_site": "Hoda Heidari, Vedant Nanda, Krishna Gummadi", "author": "Hoda Heidari; Vedant Nanda; Krishna Gummadi", "abstract": "Most existing notions of algorithmic fairness are one-shot: they ensure some form of allocative equality at the time of decision making, but do not account for the adverse impact of the algorithmic decisions today on the long-term welfare and prosperity of certain segments of the population. We take a broader perspective on algorithmic fairness. We propose an effort-based measure of fairness and present a data-driven framework for characterizing the long-term impact of algorithmic policies on reshaping the underlying population. Motivated by the psychological literature on social learning and the economic literature on equality of opportunity, we propose a micro-scale model of how individuals may respond to decision-making algorithms. We employ existing measures of segregation from sociology and economics to quantify the resulting macro- scale population-level change. Importantly, we observe that different models may shift the group- conditional distribution of qualifications in different directions. Our findings raise a number of important questions regarding the formalization of fairness for decision-making models.", "bibtex": "@InProceedings{pmlr-v97-heidari19a,\n title = \t {On the Long-term Impact of Algorithmic Decision Policies: Effort Unfairness and Feature Segregation through Social Learning},\n author = {Heidari, Hoda and Nanda, Vedant and Gummadi, Krishna},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2692--2701},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/heidari19a/heidari19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/heidari19a.html},\n abstract = \t {Most existing notions of algorithmic fairness are one-shot: they ensure some form of allocative equality at the time of decision making, but do not account for the adverse impact of the algorithmic decisions today on the long-term welfare and prosperity of certain segments of the population. We take a broader perspective on algorithmic fairness. We propose an effort-based measure of fairness and present a data-driven framework for characterizing the long-term impact of algorithmic policies on reshaping the underlying population. Motivated by the psychological literature on social learning and the economic literature on equality of opportunity, we propose a micro-scale model of how individuals may respond to decision-making algorithms. 
We employ existing measures of segregation from sociology and economics to quantify the resulting macro- scale population-level change. Importantly, we observe that different models may shift the group- conditional distribution of qualifications in different directions. Our findings raise a number of important questions regarding the formalization of fairness for decision-making models.}\n}", "pdf": "http://proceedings.mlr.press/v97/heidari19a/heidari19a.pdf", "supp": "", "pdf_size": 8485694, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17715435590222166097&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Computer Science Department, ETH Z\u00fcrich, Z\u00fcrich, Switzerland; MPI-SWS, Saarbr\u00fccken, Germany; MPI-SWS, Saarbr\u00fccken, Germany", "aff_domain": "inf.ethz.ch; ; ", "email": "inf.ethz.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/heidari19a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "ETH Zurich;Max Planck Institute for Software Systems", "aff_unique_dep": "Computer Science Department;", "aff_unique_url": "https://www.ethz.ch;https://www.mpi-sws.org", "aff_unique_abbr": "ETH;MPI-SWS", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Z\u00fcrich;Saarbr\u00fccken", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Switzerland;Germany" }, { "title": "On the Spectral Bias of Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4226", "id": "4226", "author_site": "Nasim Rahaman, Aristide Baratin, Devansh Arpit, Felix Draxler, Min Lin, Fred Hamprecht, Yoshua Bengio, Aaron Courville", "author": "Nasim Rahaman; Aristide Baratin; Devansh Arpit; Felix Draxler; Min Lin; Fred Hamprecht; Yoshua Bengio; Aaron Courville", "abstract": "Neural networks are known to be a class of highly expressive functions able to fit even random input-output mappings with 100% accuracy. In this work we present properties of neural networks that complement this aspect of expressivity. By using tools from Fourier analysis, we highlight a learning bias of deep networks towards low frequency functions \u2013 i.e. functions that vary globally without local fluctuations \u2013 which manifests itself as a frequency-dependent learning speed. Intuitively, this property is in line with the observation that over-parameterized networks prioritize learning simple patterns that generalize across data samples. 
We also investigate the role of the shape of the data manifold by presenting empirical and theoretical evidence that, somewhat counter-intuitively, learning higher frequencies gets easier with increasing manifold complexity.", "bibtex": "@InProceedings{pmlr-v97-rahaman19a,\n title = \t {On the Spectral Bias of Neural Networks},\n author = {Rahaman, Nasim and Baratin, Aristide and Arpit, Devansh and Draxler, Felix and Lin, Min and Hamprecht, Fred and Bengio, Yoshua and Courville, Aaron},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5301--5310},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rahaman19a/rahaman19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rahaman19a.html},\n abstract = \t {Neural networks are known to be a class of highly expressive functions able to fit even random input-output mappings with 100% accuracy. In this work we present properties of neural networks that complement this aspect of expressivity. By using tools from Fourier analysis, we highlight a learning bias of deep networks towards low frequency functions \u2013 i.e. functions that vary globally without local fluctuations \u2013 which manifests itself as a frequency-dependent learning speed. Intuitively, this property is in line with the observation that over-parameterized networks prioritize learning simple patterns that generalize across data samples. We also investigate the role of the shape of the data manifold by presenting empirical and theoretical evidence that, somewhat counter-intuitively, learning higher frequencies gets easier with increasing manifold complexity.}\n}", "pdf": "http://proceedings.mlr.press/v97/rahaman19a/rahaman19a.pdf", "supp": "", "pdf_size": 1648942, "gs_citation": 1826, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6023723620228240592&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Mila, Quebec, Canada; Mila, Quebec, Canada; Mila, Quebec, Canada; ImageAnalysis and Learning Lab, Ruprecht-Karls-Universit\u00e4t Heidelberg, Germany; Mila, Quebec, Canada; ImageAnalysis and Learning Lab, Ruprecht-Karls-Universit\u00e4t Heidelberg, Germany; Mila, Quebec, Canada; Mila, Quebec, Canada", "aff_domain": "live.com;umontreal.ca;gmail.com; ; ; ; ;", "email": "live.com;umontreal.ca;gmail.com; ; ; ; ;", "github": "https://github.com/nasimrahaman/SpectralBias", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/rahaman19a.html", "aff_unique_index": "0;0;0;1;0;1;0;0", "aff_unique_norm": "Mila;Ruprecht-Karls-Universit\u00e4t Heidelberg", "aff_unique_dep": ";ImageAnalysis and Learning Lab", "aff_unique_url": "https://mila.quebec;https://www.uni-heidelberg.de", "aff_unique_abbr": "Mila;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;1;0;0", "aff_country_unique": "Canada;Germany" }, { "title": "On the Universality of Invariant Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3813", "id": "3813", "author_site": "Haggai Maron, Ethan Fetaya, Nimrod Segol, Yaron Lipman", "author": "Haggai Maron; Ethan Fetaya; Nimrod Segol; Yaron Lipman", "abstract": "Constraining linear layers in neural networks to respect symmetry transformations from a group $G$ is a common design principle for invariant 
networks that has found many applications in machine learning. \t\t In this paper, we consider a fundamental question that has received very little attention to date: Can these networks approximate any (continuous) invariant function? \t\t We tackle the rather general case where $G\\leq S_n$ (an arbitrary subgroup of the symmetric group) that acts on $\\R^n$ by permuting coordinates. This setting includes several recent popular invariant networks. We present two main results: First, $G$-invariant networks are universal if high-order tensors are allowed. Second, there are groups $G$ for which higher-order tensors are unavoidable for obtaining universality. \t\t $G$-invariant networks consisting of only first-order tensors are of special interest due to their practical value. We conclude the paper by proving a necessary condition for the universality of $G$-invariant networks that incorporate only first-order tensors. Lastly, we propose a conjecture stating that this condition is also sufficient.", "bibtex": "@InProceedings{pmlr-v97-maron19a,\n title = \t {On the Universality of Invariant Networks},\n author = {Maron, Haggai and Fetaya, Ethan and Segol, Nimrod and Lipman, Yaron},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4363--4371},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/maron19a/maron19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/maron19a.html},\n abstract = \t {Constraining linear layers in neural networks to respect symmetry transformations from a group $G$ is a common design principle for invariant networks that has found many applications in machine learning. \t\t In this paper, we consider a fundamental question that has received very little attention to date: Can these networks approximate any (continuous) invariant function? \t\t We tackle the rather general case where $G\\leq S_n$ (an arbitrary subgroup of the symmetric group) that acts on $\\R^n$ by permuting coordinates. This setting includes several recent popular invariant networks. We present two main results: First, $G$-invariant networks are universal if high-order tensors are allowed. Second, there are groups $G$ for which higher-order tensors are unavoidable for obtaining universality. \t\t $G$-invariant networks consisting of only first-order tensors are of special interest due to their practical value. We conclude the paper by proving a necessary condition for the universality of $G$-invariant networks that incorporate only first-order tensors. 
Lastly, we propose a conjecture stating that this condition is also sufficient.}\n}", "pdf": "http://proceedings.mlr.press/v97/maron19a/maron19a.pdf", "supp": "", "pdf_size": 2752683, "gs_citation": 292, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13306450202288739632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science and Applied Mathematics, Weizmann Institute of Science, Rehovot, Israel; Department of Computer Science, University of Toronto, Toronto, Canada + Vector Institute; Department of Computer Science and Applied Mathematics, Weizmann Institute of Science, Rehovot, Israel; Department of Computer Science and Applied Mathematics, Weizmann Institute of Science, Rehovot, Israel", "aff_domain": "weizmann.ac.il; ; ; ", "email": "weizmann.ac.il; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/maron19a.html", "aff_unique_index": "0;1+2;0;0", "aff_unique_norm": "Weizmann Institute of Science;University of Toronto;Vector Institute", "aff_unique_dep": "Department of Computer Science and Applied Mathematics;Department of Computer Science;", "aff_unique_url": "https://www.weizmann.ac.il;https://www.utoronto.ca;https://vectorinstitute.ai/", "aff_unique_abbr": "Weizmann;U of T;Vector Institute", "aff_campus_unique_index": "0;1;0;0", "aff_campus_unique": "Rehovot;Toronto;", "aff_country_unique_index": "0;1+1;0;0", "aff_country_unique": "Israel;Canada" }, { "title": "On the statistical rate of nonlinear recovery in generative models with heavy-tailed data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3990", "id": "3990", "author_site": "Xiaohan Wei, Zhuoran Yang, Zhaoran Wang", "author": "Xiaohan Wei; Zhuoran Yang; Zhaoran Wang", "abstract": "We consider estimating a high-dimensional vector from non-linear measurements where the unknown vector is represented by a generative model $G:\\mathbb{R}^k\\rightarrow\\mathbb{R}^d$ with $k\\ll d$. Such a model poses structural priors on the unknown vector without having a dedicated basis, and in particular allows new and efficient approaches solving recovery problems with number of measurements far less than the ambient dimension of the vector. While progresses have been made recently regarding theoretical understandings on the linear Gaussian measurements, much less is known when the model is possibly misspecified and the measurements are non-Gaussian. In this paper, we make a step towards such a direction by considering the scenario where the measurements are non-Gaussian, subject to possibly unknown nonlinear transformations and the responses are heavy-tailed. We then propose new estimators via score functions based on the first and second order Stein\u2019s identity, and prove the sample size bound of $m=\\mathcal{O}(k\\varepsilon^{-2}\\log(L/\\varepsilon))$ achieving an $\\varepsilon$ error in the form of exponential concentration inequalities. Furthermore, for the special case of multi-layer ReLU generative model, we improve the sample bound by a logarithm factor to $m=\\mathcal{O}(k\\varepsilon^{-2}\\log(d))$, matching the state-of-art statistical rate in compressed sensing for estimating $k$-sparse vectors. 
On the technical side, we develop new chaining methods bounding heavy-tailed processes, which could be of independent interest.", "bibtex": "@InProceedings{pmlr-v97-wei19b,\n title = \t {On the statistical rate of nonlinear recovery in generative models with heavy-tailed data},\n author = {Wei, Xiaohan and Yang, Zhuoran and Wang, Zhaoran},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6697--6706},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wei19b/wei19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/wei19b.html},\n abstract = \t {We consider estimating a high-dimensional vector from non-linear measurements where the unknown vector is represented by a generative model $G:\\mathbb{R}^k\\rightarrow\\mathbb{R}^d$ with $k\\ll d$. Such a model poses structural priors on the unknown vector without having a dedicated basis, and in particular allows new and efficient approaches solving recovery problems with number of measurements far less than the ambient dimension of the vector. While progresses have been made recently regarding theoretical understandings on the linear Gaussian measurements, much less is known when the model is possibly misspecified and the measurements are non-Gaussian. In this paper, we make a step towards such a direction by considering the scenario where the measurements are non-Gaussian, subject to possibly unknown nonlinear transformations and the responses are heavy-tailed. We then propose new estimators via score functions based on the first and second order Stein\u2019s identity, and prove the sample size bound of $m=\\mathcal{O}(k\\varepsilon^{-2}\\log(L/\\varepsilon))$ achieving an $\\varepsilon$ error in the form of exponential concentration inequalities. Furthermore, for the special case of multi-layer ReLU generative model, we improve the sample bound by a logarithm factor to $m=\\mathcal{O}(k\\varepsilon^{-2}\\log(d))$, matching the state-of-art statistical rate in compressed sensing for estimating $k$-sparse vectors. 
On the technical side, we develop new chaining methods bounding heavy-tailed processes, which could be of independent interest.}\n}", "pdf": "http://proceedings.mlr.press/v97/wei19b/wei19b.pdf", "supp": "", "pdf_size": 2707732, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5000618366168942706&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Electrical Engineering, University of Southern California, Los Angeles, CA, USA; Department of Operations Research and Financial Engineering, Princeton University, Princeton, NJ, USA; Department of Industrial Engineering and Management Sciences, Northwestern University, Chicago, IL, USA", "aff_domain": "usc.edu;princeton.edu; ", "email": "usc.edu;princeton.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/wei19b.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Southern California;Princeton University;Northwestern University", "aff_unique_dep": "Department of Electrical Engineering;Department of Operations Research and Financial Engineering;Department of Industrial Engineering and Management Sciences", "aff_unique_url": "https://www.usc.edu;https://www.princeton.edu;https://www.northwestern.edu", "aff_unique_abbr": "USC;Princeton;NU", "aff_campus_unique_index": "0;1;2", "aff_campus_unique": "Los Angeles;Princeton;Chicago", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Online Adaptive Principal Component Analysis and Its extensions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3705", "id": "3705", "author_site": "Jianjun Yuan, Andrew Lamperski", "author": "Jianjun Yuan; Andrew Lamperski", "abstract": "We propose algorithms for online principal component analysis (PCA) and variance minimization for adaptive settings. Previous literature has focused on upper bounding the static adversarial regret, whose comparator is the optimal fixed action in hindsight. However, static regret is not an appropriate metric when the underlying environment is changing. Instead, we adopt the adaptive regret metric from the previous literature and propose online adaptive algorithms for PCA and variance minimization, that have sub-linear adaptive regret guarantees. We demonstrate both theoretically and experimentally that the proposed algorithms can adapt to the changing environments.", "bibtex": "@InProceedings{pmlr-v97-yuan19a,\n title = \t {Online Adaptive Principal Component Analysis and Its extensions},\n author = {Yuan, Jianjun and Lamperski, Andrew},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7213--7221},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yuan19a/yuan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yuan19a.html},\n abstract = \t {We propose algorithms for online principal component analysis (PCA) and variance minimization for adaptive settings. Previous literature has focused on upper bounding the static adversarial regret, whose comparator is the optimal fixed action in hindsight. However, static regret is not an appropriate metric when the underlying environment is changing. 
Instead, we adopt the adaptive regret metric from the previous literature and propose online adaptive algorithms for PCA and variance minimization, that have sub-linear adaptive regret guarantees. We demonstrate both theoretically and experimentally that the proposed algorithms can adapt to the changing environments.}\n}", "pdf": "http://proceedings.mlr.press/v97/yuan19a/yuan19a.pdf", "supp": "", "pdf_size": 1470495, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11284462216308687300&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of Minnesota, Minneapolis, USA; University of Minnesota, Minneapolis, USA", "aff_domain": "umn.edu;umn.edu", "email": "umn.edu;umn.edu", "github": "https://github.com/yuanx270/online-adaptive-PCA", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/yuan19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Minnesota", "aff_unique_dep": "", "aff_unique_url": "https://www.minnesota.edu", "aff_unique_abbr": "UMN", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Minneapolis", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Online Algorithms for Rent-Or-Buy with Expert Advice", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4288", "id": "4288", "author_site": "Sreenivas Gollapudi, Debmalya Panigrahi", "author": "Sreenivas Gollapudi; Debmalya Panigrahi", "abstract": "We study the use of predictions by multiple experts (such as machine learning algorithms) to improve the performance of online algorithms. In particular, we consider the classical rent-or-buy problem (also called ski rental), and obtain algorithms that provably improve their performance over the adversarial scenario by using these predictions. We also prove matching lower bounds to show that our algorithms are the best possible, and perform experiments to empirically validate their performance in practice", "bibtex": "@InProceedings{pmlr-v97-gollapudi19a,\n title = \t {Online Algorithms for Rent-Or-Buy with Expert Advice},\n author = {Gollapudi, Sreenivas and Panigrahi, Debmalya},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2319--2327},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gollapudi19a/gollapudi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gollapudi19a.html},\n abstract = \t {We study the use of predictions by multiple experts (such as machine learning algorithms) to improve the performance of online algorithms. In particular, we consider the classical rent-or-buy problem (also called ski rental), and obtain algorithms that provably improve their performance over the adversarial scenario by using these predictions. 
We also prove matching lower bounds to show that our algorithms are the best possible, and perform experiments to empirically validate their performance in practice}\n}", "pdf": "http://proceedings.mlr.press/v97/gollapudi19a/gollapudi19a.pdf", "supp": "", "pdf_size": 381770, "gs_citation": 166, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15885324383813157551&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Google Research; Department of Computer Science, Duke University", "aff_domain": "google.com;cs.duke.edu", "email": "google.com;cs.duke.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/gollapudi19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Google;Duke University", "aff_unique_dep": "Google Research;Department of Computer Science", "aff_unique_url": "https://research.google;https://www.duke.edu", "aff_unique_abbr": "Google Research;Duke", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Online Control with Adversarial Disturbances", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4313", "id": "4313", "author_site": "Naman Agarwal, Brian Bullins, Elad Hazan, Sham Kakade, Karan Singh", "author": "Naman Agarwal; Brian Bullins; Elad Hazan; Sham Kakade; Karan Singh", "abstract": "We study the control of linear dynamical systems with adversarial disturbances, as opposed to statistical noise. We present an efficient algorithm that achieves nearly-tight regret bounds in this setting. Our result generalizes upon previous work in two main aspects: the algorithm can accommodate adversarial noise in the dynamics, and can handle general convex costs.", "bibtex": "@InProceedings{pmlr-v97-agarwal19c,\n title = \t {Online Control with Adversarial Disturbances},\n author = {Agarwal, Naman and Bullins, Brian and Hazan, Elad and Kakade, Sham and Singh, Karan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {111--119},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/agarwal19c/agarwal19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/agarwal19c.html},\n abstract = \t {We study the control of linear dynamical systems with adversarial disturbances, as opposed to statistical noise. We present an efficient algorithm that achieves nearly-tight regret bounds in this setting. 
Our result generalizes upon previous work in two main aspects: the algorithm can accommodate adversarial noise in the dynamics, and can handle general convex costs.}\n}", "pdf": "http://proceedings.mlr.press/v97/agarwal19c/agarwal19c.pdf", "supp": "", "pdf_size": 282304, "gs_citation": 280, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8958380281666852698&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 17, "aff": "Google AI Princeton; Department of Computer Science, Princeton University; Google AI Princeton; Allen School of Computer Science and Engineering, University of Washington; Department of Statistics, University of Washington", "aff_domain": "google.com;cs.princeton.edu;google.com;cs.washington.edu;princeton.edu", "email": "google.com;cs.princeton.edu;google.com;cs.washington.edu;princeton.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/agarwal19c.html", "aff_unique_index": "0;1;0;2;2", "aff_unique_norm": "Google;Princeton University;University of Washington", "aff_unique_dep": "Google AI;Department of Computer Science;Allen School of Computer Science and Engineering", "aff_unique_url": "https://ai.google;https://www.princeton.edu;https://www.cs.washington.edu", "aff_unique_abbr": "Google AI;Princeton;UW", "aff_campus_unique_index": "0;0;2;2", "aff_campus_unique": "Princeton;;Seattle", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Online Convex Optimization in Adversarial Markov Decision Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3679", "id": "3679", "author_site": "Aviv Rosenberg, Yishay Mansour", "author": "Aviv Rosenberg; Yishay Mansour", "abstract": "We consider online learning in episodic loop-free Markov decision processes (MDPs), where the loss function can change arbitrarily between episodes, and the transition function is not known to the learner. We show $\\tilde{O}(L|X|\\sqrt{|A|T})$ regret bound, where $T$ is the number of episodes, $X$ is the state space, $A$ is the action space, and $L$ is the length of each episode. Our online algorithm is implemented using entropic regularization methodology, which allows to extend the original adversarial MDP model to handle convex performance criteria (different ways to aggregate the losses of a single episode) , as well as improve previous regret bounds.", "bibtex": "@InProceedings{pmlr-v97-rosenberg19a,\n title = \t {Online Convex Optimization in Adversarial {M}arkov Decision Processes},\n author = {Rosenberg, Aviv and Mansour, Yishay},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5478--5486},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rosenberg19a/rosenberg19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rosenberg19a.html},\n abstract = \t {We consider online learning in episodic loop-free Markov decision processes (MDPs), where the loss function can change arbitrarily between episodes, and the transition function is not known to the learner. We show $\\tilde{O}(L|X|\\sqrt{|A|T})$ regret bound, where $T$ is the number of episodes, $X$ is the state space, $A$ is the action space, and $L$ is the length of each episode. 
Our online algorithm is implemented using entropic regularization methodology, which allows to extend the original adversarial MDP model to handle convex performance criteria (different ways to aggregate the losses of a single episode) , as well as improve previous regret bounds.}\n}", "pdf": "http://proceedings.mlr.press/v97/rosenberg19a/rosenberg19a.pdf", "supp": "", "pdf_size": 256657, "gs_citation": 168, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7734774336779940418&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Tel Aviv University, Israel+Google Research, Tel Aviv, Israel; Tel Aviv University, Israel+Google Research, Tel Aviv, Israel", "aff_domain": "gmail.com;gmail.com", "email": "gmail.com;gmail.com", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/rosenberg19a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Tel Aviv University;Google", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.tau.ac.il;https://research.google", "aff_unique_abbr": "TAU;Google Res.", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Tel Aviv", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "Israel" }, { "title": "Online Learning to Rank with Features", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3782", "id": "3782", "author_site": "Shuai Li, Tor Lattimore, Csaba Szepesvari", "author": "Shuai Li; Tor Lattimore; Csaba Szepesvari", "abstract": "We introduce a new model for online ranking in which the click probability factors into an examination and attractiveness function and the attractiveness function is a linear function of a feature vector and an unknown parameter. Only relatively mild assumptions are made on the examination function. A novel algorithm for this setup is analysed, showing that the dependence on the number of items is replaced by a dependence on the dimension, allowing the new algorithm to handle a large number of items. When reduced to the orthogonal case, the regret of the algorithm improves on the state-of-the-art.", "bibtex": "@InProceedings{pmlr-v97-li19f,\n title = \t {Online Learning to Rank with Features},\n author = {Li, Shuai and Lattimore, Tor and Szepesvari, Csaba},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3856--3865},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19f/li19f.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19f.html},\n abstract = \t {We introduce a new model for online ranking in which the click probability factors into an examination and attractiveness function and the attractiveness function is a linear function of a feature vector and an unknown parameter. Only relatively mild assumptions are made on the examination function. A novel algorithm for this setup is analysed, showing that the dependence on the number of items is replaced by a dependence on the dimension, allowing the new algorithm to handle a large number of items. 
When reduced to the orthogonal case, the regret of the algorithm improves on the state-of-the-art.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19f/li19f.pdf", "supp": "", "pdf_size": 1477505, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7202238912141443157&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "The Chinese University of Hong Kong; DeepMind; DeepMind", "aff_domain": "cse.cuhk.edu.hk;google.com;google.com", "email": "cse.cuhk.edu.hk;google.com;google.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/li19f.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Chinese University of Hong Kong;DeepMind", "aff_unique_dep": ";", "aff_unique_url": "https://www.cuhk.edu.hk;https://deepmind.com", "aff_unique_abbr": "CUHK;DeepMind", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "China;United Kingdom" }, { "title": "Online Learning with Sleeping Experts and Feedback Graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4151", "id": "4151", "author_site": "Corinna Cortes, Giulia DeSalvo, Claudio Gentile, Mehryar Mohri, Scott Yang", "author": "Corinna Cortes; Giulia Desalvo; Claudio Gentile; Mehryar Mohri; Scott Yang", "abstract": "We consider the scenario of online learning with sleeping experts, where not all experts are available at each round, and analyze the general framework of learning with feedback graphs, where the loss observations associated with each expert are characterized by a graph. A critical assumption in this framework is that the loss observations and the set of sleeping experts at each round are independent. We first extend the classical sleeping experts algorithm of Kleinberg et al. 2008 to the feedback graphs scenario, and prove matching upper and lower bounds for the sleeping regret of the resulting algorithm under the independence assumption. Our main contribution is then to relax this assumption, present a more general notion of sleeping regret, and derive a general algorithm with strong theoretical guarantees. We apply this new framework to the important scenario of online learning with abstention, where a learner can elect to abstain from making a prediction at the price of a certain cost. We empirically validate our algorithm against multiple online abstention algorithms on several real-world datasets, showing substantial performance improvements.", "bibtex": "@InProceedings{pmlr-v97-cortes19a,\n title = \t {Online Learning with Sleeping Experts and Feedback Graphs},\n author = {Cortes, Corinna and Desalvo, Giulia and Gentile, Claudio and Mohri, Mehryar and Yang, Scott},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1370--1378},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cortes19a/cortes19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cortes19a.html},\n abstract = \t {We consider the scenario of online learning with sleeping experts, where not all experts are available at each round, and analyze the general framework of learning with feedback graphs, where the loss observations associated with each expert are characterized by a graph. 
A critical assumption in this framework is that the loss observations and the set of sleeping experts at each round are independent. We first extend the classical sleeping experts algorithm of Kleinberg et al. 2008 to the feedback graphs scenario, and prove matching upper and lower bounds for the sleeping regret of the resulting algorithm under the independence assumption. Our main contribution is then to relax this assumption, present a more general notion of sleeping regret, and derive a general algorithm with strong theoretical guarantees. We apply this new framework to the important scenario of online learning with abstention, where a learner can elect to abstain from making a prediction at the price of a certain cost. We empirically validate our algorithm against multiple online abstention algorithms on several real-world datasets, showing substantial performance improvements.}\n}", "pdf": "http://proceedings.mlr.press/v97/cortes19a/cortes19a.pdf", "supp": "", "pdf_size": 1139592, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15999663133718090203&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Google Research, New York, NY; Google Research, New York, NY; Google Research, New York, NY; Google Research, New York, NY + Courant Institute of Mathematical Sciences, New York, NY; D. E. Shaw & Co., New York, NY", "aff_domain": "google.com;google.com;google.com;cims.nyu.edu;shaw.com", "email": "google.com;google.com;google.com;cims.nyu.edu;shaw.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/cortes19a.html", "aff_unique_index": "0;0;0;0+1;2", "aff_unique_norm": "Google;Courant Institute of Mathematical Sciences;D. E. Shaw & Co.", "aff_unique_dep": "Google Research;Mathematical Sciences;", "aff_unique_url": "https://research.google;https://courant.nyu.edu;https://www.deshaw.com", "aff_unique_abbr": "Google Research;Courant;DES", "aff_campus_unique_index": "0;0;0;0+0", "aff_campus_unique": "New York;", "aff_country_unique_index": "0;0;0;0+0;0", "aff_country_unique": "United States" }, { "title": "Online Meta-Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4195", "id": "4195", "author_site": "Chelsea Finn, Aravind Rajeswaran, Sham Kakade, Sergey Levine", "author": "Chelsea Finn; Aravind Rajeswaran; Sham Kakade; Sergey Levine", "abstract": "A central capability of intelligent systems is the ability to continuously build upon previous experiences to speed up and enhance learning of new tasks. Two distinct research paradigms have studied this question. Meta-learning views this problem as learning a prior over model parameters that is amenable for fast adaptation on a new task, but typically assumes the tasks are available together as a batch. In contrast, online (regret based) learning considers a setting where tasks are revealed one after the other, but conventionally trains a single model without task-specific adaptation. This work introduces an online meta-learning setting, which merges ideas from both paradigms to better capture the spirit and practice of continual lifelong learning. We propose the follow the meta leader (FTML) algorithm which extends the MAML algorithm to this setting. Theoretically, this work provides an O(log T) regret guarantee with one additional higher order smoothness assumption (in comparison to the standard online setting). 
Our experimental evaluation on three different large-scale problems suggest that the proposed algorithm significantly outperforms alternatives based on traditional online learning approaches.", "bibtex": "@InProceedings{pmlr-v97-finn19a,\n title = \t {Online Meta-Learning},\n author = {Finn, Chelsea and Rajeswaran, Aravind and Kakade, Sham and Levine, Sergey},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1920--1930},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/finn19a/finn19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/finn19a.html},\n abstract = \t {A central capability of intelligent systems is the ability to continuously build upon previous experiences to speed up and enhance learning of new tasks. Two distinct research paradigms have studied this question. Meta-learning views this problem as learning a prior over model parameters that is amenable for fast adaptation on a new task, but typically assumes the tasks are available together as a batch. In contrast, online (regret based) learning considers a setting where tasks are revealed one after the other, but conventionally trains a single model without task-specific adaptation. This work introduces an online meta-learning setting, which merges ideas from both paradigms to better capture the spirit and practice of continual lifelong learning. We propose the follow the meta leader (FTML) algorithm which extends the MAML algorithm to this setting. Theoretically, this work provides an O(log T) regret guarantee with one additional higher order smoothness assumption (in comparison to the standard online setting). Our experimental evaluation on three different large-scale problems suggest that the proposed algorithm significantly outperforms alternatives based on traditional online learning approaches.}\n}", "pdf": "http://proceedings.mlr.press/v97/finn19a/finn19a.pdf", "supp": "", "pdf_size": 1480943, "gs_citation": 576, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1552829599115309059&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "UC Berkeley; University of Washington; University of Washington; UC Berkeley", "aff_domain": "stanford.edu;cs.washington.edu; ; ", "email": "stanford.edu;cs.washington.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/finn19a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of California, Berkeley;University of Washington", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.washington.edu", "aff_unique_abbr": "UC Berkeley;UW", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Online Variance Reduction with Mixtures", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3739", "id": "3739", "author_site": "Zal\u00e1n Borsos, Sebastian Curi, Yehuda Levy, Andreas Krause", "author": "Zal\u00e1n Borsos; Sebastian Curi; Kfir Yehuda Levy; Andreas Krause", "abstract": "Adaptive importance sampling for stochastic optimization is a promising approach that offers improved convergence through variance reduction. 
In this work, we propose a new framework for variance reduction that enables the use of mixtures over predefined sampling distributions, which can naturally encode prior knowledge about the data. While these sampling distributions are fixed, the mixture weights are adapted during the optimization process. We propose VRM, a novel and efficient adaptive scheme that asymptotically recovers the best mixture weights in hindsight and can also accommodate sampling distributions over sets of points. We empirically demonstrate the versatility of VRM in a range of applications.", "bibtex": "@InProceedings{pmlr-v97-borsos19a,\n title = \t {Online Variance Reduction with Mixtures},\n author = {Borsos, Zal{\\'a}n and Curi, Sebastian and Levy, Kfir Yehuda and Krause, Andreas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {705--714},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/borsos19a/borsos19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/borsos19a.html},\n abstract = \t {Adaptive importance sampling for stochastic optimization is a promising approach that offers improved convergence through variance reduction. In this work, we propose a new framework for variance reduction that enables the use of mixtures over predefined sampling distributions, which can naturally encode prior knowledge about the data. While these sampling distributions are fixed, the mixture weights are adapted during the optimization process. We propose VRM, a novel and efficient adaptive scheme that asymptotically recovers the best mixture weights in hindsight and can also accommodate sampling distributions over sets of points. We empirically demonstrate the versatility of VRM in a range of applications.}\n}", "pdf": "http://proceedings.mlr.press/v97/borsos19a/borsos19a.pdf", "supp": "", "pdf_size": 1880401, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14403425847063612414&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, ETH Zurich; Department of Computer Science, ETH Zurich; Department of Computer Science, ETH Zurich; Department of Computer Science, ETH Zurich", "aff_domain": "inf.ethz.ch; ; ; ", "email": "inf.ethz.ch; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/borsos19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Online learning with kernel losses", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4222", "id": "4222", "author_site": "Niladri Chatterji, Aldo Pacchiano, Peter Bartlett", "author": "Niladri Chatterji; Aldo Pacchiano; Peter Bartlett", "abstract": "We present a generalization of the adversarial linear bandits framework, where the underlying losses are kernel functions (with an associated reproducing kernel Hilbert space) rather than linear functions. We study a version of the exponential weights algorithm and bound its regret in this setting. 
Under conditions on the eigen-decay of the kernel we provide a sharp characterization of the regret for this algorithm. When we have polynomial eigen-decay ($\\mu_j \\le \\mathcal{O}(j^{-\\beta})$), we find that the regret is bounded by $\\mathcal{R}_n \\le \\mathcal{O}(n^{\\beta/2(\\beta-1)})$. While under the assumption of exponential eigen-decay ($\\mu_j \\le \\mathcal{O}(e^{-\\beta j })$) we get an even tighter bound on the regret $\\mathcal{R}_n \\le \\tilde{\\mathcal{O}}(n^{1/2})$. When the eigen-decay is polynomial we also show a", "bibtex": "@InProceedings{pmlr-v97-chatterji19a,\n title = \t {Online learning with kernel losses},\n author = {Chatterji, Niladri and Pacchiano, Aldo and Bartlett, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {971--980},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chatterji19a/chatterji19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chatterji19a.html},\n abstract = \t {We present a generalization of the adversarial linear bandits framework, where the underlying losses are kernel functions (with an associated reproducing kernel Hilbert space) rather than linear functions. We study a version of the exponential weights algorithm and bound its regret in this setting. Under conditions on the eigen-decay of the kernel we provide a sharp characterization of the regret for this algorithm. When we have polynomial eigen-decay ($\\mu_j \\le \\mathcal{O}(j^{-\\beta})$), we find that the regret is bounded by $\\mathcal{R}_n \\le \\mathcal{O}(n^{\\beta/2(\\beta-1)})$. While under the assumption of exponential eigen-decay ($\\mu_j \\le \\mathcal{O}(e^{-\\beta j })$) we get an even tighter bound on the regret $\\mathcal{R}_n \\le \\tilde{\\mathcal{O}}(n^{1/2})$. When the eigen-decay is polynomial we also show a", "pdf": "http://proceedings.mlr.press/v97/chatterji19a/chatterji19a.pdf", "supp": "", "pdf_size": 362285, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17559721610481806140&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "University of California Berkeley; University of California Berkeley; University of California Berkeley", "aff_domain": "berkeley.edu;berkeley.edu; ", "email": "berkeley.edu;berkeley.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/chatterji19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Open Vocabulary Learning on Source Code with a Graph-Structured Cache", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4178", "id": "4178", "author_site": "Milan Cvitkovic, Badal Singh, Anima Anandkumar", "author": "Milan Cvitkovic; Badal Singh; Animashree Anandkumar", "abstract": "Machine learning models that take computer program source code as input typically use Natural Language Processing (NLP) techniques. 
However, a major challenge is that code is written using an open, rapidly changing vocabulary due to, e.g., the coinage of new variable and method names. Reasoning over such a vocabulary is not something for which most NLP methods are designed. We introduce a Graph-Structured Cache to address this problem; this cache contains a node for each new word the model encounters with edges connecting each word to its occurrences in the code. We find that combining this graph-structured cache strategy with recent Graph-Neural-Network-based models for supervised learning on code improves the models\u2019 performance on a code completion task and a variable naming task \u2014 with over 100% relative improvement on the latter \u2014 at the cost of a moderate increase in computation time.", "bibtex": "@InProceedings{pmlr-v97-cvitkovic19b,\n title = \t {Open Vocabulary Learning on Source Code with a Graph-Structured Cache},\n author = {Cvitkovic, Milan and Singh, Badal and Anandkumar, Animashree},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1475--1485},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cvitkovic19b/cvitkovic19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/cvitkovic19b.html},\n abstract = \t {Machine learning models that take computer program source code as input typically use Natural Language Processing (NLP) techniques. However, a major challenge is that code is written using an open, rapidly changing vocabulary due to, e.g., the coinage of new variable and method names. Reasoning over such a vocabulary is not something for which most NLP methods are designed. We introduce a Graph-Structured Cache to address this problem; this cache contains a node for each new word the model encounters with edges connecting each word to its occurrences in the code. 
We find that combining this graph-structured cache strategy with recent Graph-Neural-Network-based models for supervised learning on code improves the models\u2019 performance on a code completion task and a variable naming task \u2014 with over 100% relative improvement on the latter \u2014 at the cost of a moderate increase in computation time.}\n}", "pdf": "http://proceedings.mlr.press/v97/cvitkovic19b/cvitkovic19b.pdf", "supp": "", "pdf_size": 1298781, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1145489630896909786&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "aff": "Department of Computing and Mathematical Sciences, California Institute of Technology, Pasadena, California, USA; Amazon Web Services, Seattle, Washington, USA; Department of Computing and Mathematical Sciences, California Institute of Technology, Pasadena, California, USA", "aff_domain": "caltech.edu; ;caltech.edu", "email": "caltech.edu; ;caltech.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/cvitkovic19b.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "California Institute of Technology;Amazon", "aff_unique_dep": "Department of Computing and Mathematical Sciences;Amazon Web Services", "aff_unique_url": "https://www.caltech.edu;https://aws.amazon.com", "aff_unique_abbr": "Caltech;AWS", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Pasadena;Seattle", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Open-ended learning in symmetric zero-sum games", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4161", "id": "4161", "author_site": "David Balduzzi, Marta Garnelo, Yoram Bachrach, Wojciech Czarnecki, Julien Perolat, Max Jaderberg, Thore Graepel", "author": "David Balduzzi; Marta Garnelo; Yoram Bachrach; Wojciech Czarnecki; Julien Perolat; Max Jaderberg; Thore Graepel", "abstract": "Zero-sum games such as chess and poker are, abstractly, functions that evaluate pairs of agents, for example labeling them \u2018winner\u2019 and \u2018loser\u2019. If the game is approximately transitive, then self-play generates sequences of agents of increasing strength. However, nontransitive games, such as rock-paper-scissors, can exhibit strategic cycles, and there is no longer a clear objective \u2013 we want agents to increase in strength, but against whom is unclear. In this paper, we introduce a geometric framework for formulating agent objectives in zero-sum games, in order to construct adaptive sequences of objectives that yield open-ended learning. The framework allows us to reason about population performance in nontransitive games, and enables the development of a new algorithm (rectified Nash response, PSRO_rN) that uses game-theoretic niching to construct diverse populations of effective agents, producing a stronger set of agents than existing algorithms. 
We apply PSRO_rN to two highly nontransitive resource allocation games and find that PSRO_rN consistently outperforms the existing alternatives.", "bibtex": "@InProceedings{pmlr-v97-balduzzi19a,\n title = \t {Open-ended learning in symmetric zero-sum games},\n author = {Balduzzi, David and Garnelo, Marta and Bachrach, Yoram and Czarnecki, Wojciech and Perolat, Julien and Jaderberg, Max and Graepel, Thore},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {434--443},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/balduzzi19a/balduzzi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/balduzzi19a.html},\n abstract = \t {Zero-sum games such as chess and poker are, abstractly, functions that evaluate pairs of agents, for example labeling them \u2018winner\u2019 and \u2018loser\u2019. If the game is approximately transitive, then self-play generates sequences of agents of increasing strength. However, nontransitive games, such as rock-paper-scissors, can exhibit strategic cycles, and there is no longer a clear objective \u2013 we want agents to increase in strength, but against whom is unclear. In this paper, we introduce a geometric framework for formulating agent objectives in zero-sum games, in order to construct adaptive sequences of objectives that yield open-ended learning. The framework allows us to reason about population performance in nontransitive games, and enables the development of a new algorithm (rectified Nash response, PSRO_rN) that uses game-theoretic niching to construct diverse populations of effective agents, producing a stronger set of agents than existing algorithms. We apply PSRO_rN to two highly nontransitive resource allocation games and find that PSRO_rN consistently outperforms the existing alternatives.}\n}", "pdf": "http://proceedings.mlr.press/v97/balduzzi19a/balduzzi19a.pdf", "supp": "", "pdf_size": 790561, "gs_citation": 220, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9041788923673437776&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind", "aff_domain": "google.com; ; ; ; ; ; ", "email": "google.com; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/balduzzi19a.html", "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Optimal Algorithms for Lipschitz Bandits with Heavy-tailed Rewards", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3798", "id": "3798", "author_site": "Shiyin Lu, Guanghui Wang, Yao Hu, Lijun Zhang", "author": "Shiyin Lu; Guanghui Wang; Yao Hu; Lijun Zhang", "abstract": "We study Lipschitz bandits, where a learner repeatedly plays one arm from an infinite arm set and then receives a stochastic reward whose expectation is a Lipschitz function of the chosen arm. 
Most of existing work assume the reward distributions are bounded or at least sub-Gaussian, and thus do not apply to heavy-tailed rewards arising in many real-world scenarios such as web advertising and financial markets. To address this limitation, in this paper we relax the assumption on rewards to allow arbitrary distributions that have finite $(1+\\epsilon)$-th moments for some $\\epsilon \\in (0, 1]$, and propose algorithms that enjoy a sublinear regret of $\\widetilde{O}(T^{(d_z\\epsilon + 1)/(d_z \\epsilon + \\epsilon + 1)})$ where $T$ is the time horizon and $d_z$ is the zooming dimension. The key idea is to exploit the Lipschitz property of the expected reward function by adaptively discretizing the arm set, and employ upper confidence bound policies with robust mean estimators designed for heavy-tailed distributions. Furthermore, we provide a lower bound for Lipschitz bandits with heavy-tailed rewards, and show that our algorithms are optimal in terms of $T$. Finally, we conduct numerical experiments to demonstrate the effectiveness of our algorithms.", "bibtex": "@InProceedings{pmlr-v97-lu19c,\n title = \t {Optimal Algorithms for {L}ipschitz Bandits with Heavy-tailed Rewards},\n author = {Lu, Shiyin and Wang, Guanghui and Hu, Yao and Zhang, Lijun},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4154--4163},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lu19c/lu19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/lu19c.html},\n abstract = \t {We study Lipschitz bandits, where a learner repeatedly plays one arm from an infinite arm set and then receives a stochastic reward whose expectation is a Lipschitz function of the chosen arm. Most of existing work assume the reward distributions are bounded or at least sub-Gaussian, and thus do not apply to heavy-tailed rewards arising in many real-world scenarios such as web advertising and financial markets. To address this limitation, in this paper we relax the assumption on rewards to allow arbitrary distributions that have finite $(1+\\epsilon)$-th moments for some $\\epsilon \\in (0, 1]$, and propose algorithms that enjoy a sublinear regret of $\\widetilde{O}(T^{(d_z\\epsilon + 1)/(d_z \\epsilon + \\epsilon + 1)})$ where $T$ is the time horizon and $d_z$ is the zooming dimension. The key idea is to exploit the Lipschitz property of the expected reward function by adaptively discretizing the arm set, and employ upper confidence bound policies with robust mean estimators designed for heavy-tailed distributions. Furthermore, we provide a lower bound for Lipschitz bandits with heavy-tailed rewards, and show that our algorithms are optimal in terms of $T$. 
Finally, we conduct numerical experiments to demonstrate the effectiveness of our algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/lu19c/lu19c.pdf", "supp": "", "pdf_size": 742840, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=343741987508895704&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing 210023, China; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing 210023, China; YouKu Cognitive and Intelligent Lab, Alibaba Group, Beijing 100102, China; National Key Laboratory for Novel Software Technology, Nanjing University, Nanjing 210023, China", "aff_domain": "lamda.nju.edu.cn; ; ;lamda.nju.edu.cn", "email": "lamda.nju.edu.cn; ; ;lamda.nju.edu.cn", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/lu19c.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Nanjing University;Alibaba Group", "aff_unique_dep": "National Key Laboratory for Novel Software Technology;YouKu Cognitive and Intelligent Lab", "aff_unique_url": "http://www.nju.edu.cn;https://www.alibaba.com", "aff_unique_abbr": "Nanjing U;Alibaba", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Nanjing;Beijing", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Optimal Auctions through Deep Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3967", "id": "3967", "author_site": "Paul Duetting, Zhe Feng, Harikrishna Narasimhan, David Parkes, Sai Srivatsa Ravindranath", "author": "Paul Duetting; Zhe Feng; Harikrishna Narasimhan; David Parkes; Sai Srivatsa Ravindranath", "abstract": "Designing an incentive compatible auction that maximizes expected revenue is an intricate task. The single-item case was resolved in a seminal piece of work by Myerson in 1981. Even after 30-40 years of intense research the problem remains unsolved for seemingly simple multi-bidder, multi-item settings. In this work, we initiate the exploration of the use of tools from deep learning for the automated design of optimal auctions. We model an auction as a multi-layer neural network, frame optimal auction design as a constrained learning problem, and show how it can be solved using standard pipelines. We prove generalization bounds and present extensive experiments, recovering essentially all known analytical solutions for multi-item settings, and obtaining novel mechanisms for settings in which the optimal mechanism is unknown.", "bibtex": "@InProceedings{pmlr-v97-duetting19a,\n title = \t {Optimal Auctions through Deep Learning},\n author = {Duetting, Paul and Feng, Zhe and Narasimhan, Harikrishna and Parkes, David and Ravindranath, Sai Srivatsa},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1706--1715},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/duetting19a/duetting19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/duetting19a.html},\n abstract = \t {Designing an incentive compatible auction that maximizes expected revenue is an intricate task. The single-item case was resolved in a seminal piece of work by Myerson in 1981. 
Even after 30-40 years of intense research the problem remains unsolved for seemingly simple multi-bidder, multi-item settings. In this work, we initiate the exploration of the use of tools from deep learning for the automated design of optimal auctions. We model an auction as a multi-layer neural network, frame optimal auction design as a constrained learning problem, and show how it can be solved using standard pipelines. We prove generalization bounds and present extensive experiments, recovering essentially all known analytical solutions for multi-item settings, and obtaining novel mechanisms for settings in which the optimal mechanism is unknown.}\n}", "pdf": "http://proceedings.mlr.press/v97/duetting19a/duetting19a.pdf", "supp": "", "pdf_size": 674474, "gs_citation": 273, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5937791241364902632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "aff": "London School of Economics; Harvard University; Harvard University; Harvard University; Harvard University", "aff_domain": "g.harvard.edu; ; ; ; ", "email": "g.harvard.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/duetting19a.html", "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "London School of Economics;Harvard University", "aff_unique_dep": ";", "aff_unique_url": "https://www.lse.ac.uk;https://www.harvard.edu", "aff_unique_abbr": "LSE;Harvard", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "title": "Optimal Continuous DR-Submodular Maximization and Applications to Provable Mean Field Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4006", "id": "4006", "author_site": "Yatao Bian, Joachim Buhmann, Andreas Krause", "author": "Yatao Bian; Joachim Buhmann; Andreas Krause", "abstract": "Mean field inference for discrete graphical models is generally a highly nonconvex problem, which also holds for the class of probabilistic log-submodular models. Existing optimization methods, e.g., coordinate ascent algorithms, typically only find local optima. In this work we propose provable mean filed methods for probabilistic log-submodular models and its posterior agreement (PA) with strong approximation guarantees. The main algorithmic technique is a new Double Greedy scheme, termed DR-DoubleGreedy, for continuous DR-submodular maximization with box-constraints. It is a one-pass algorithm with linear time complexity, reaching the optimal 1/2 approximation ratio, which may be of independent interest. 
We validate the superior performance of our algorithms against baselines on both synthetic and real-world datasets.", "bibtex": "@InProceedings{pmlr-v97-bian19a,\n title = \t {Optimal Continuous {DR}-Submodular Maximization and Applications to Provable Mean Field Inference},\n author = {Bian, Yatao and Buhmann, Joachim and Krause, Andreas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {644--653},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bian19a/bian19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bian19a.html},\n abstract = \t {Mean field inference for discrete graphical models is generally a highly nonconvex problem, which also holds for the class of probabilistic log-submodular models. Existing optimization methods, e.g., coordinate ascent algorithms, typically only find local optima. In this work we propose provable mean filed methods for probabilistic log-submodular models and its posterior agreement (PA) with strong approximation guarantees. The main algorithmic technique is a new Double Greedy scheme, termed DR-DoubleGreedy, for continuous DR-submodular maximization with box-constraints. It is a one-pass algorithm with linear time complexity, reaching the optimal 1/2 approximation ratio, which may be of independent interest. We validate the superior performance of our algorithms against baselines on both synthetic and real-world datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/bian19a/bian19a.pdf", "supp": "", "pdf_size": 1206127, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13682553241772682418&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, ETH Zurich, Zurich, Switzerland; Department of Computer Science, ETH Zurich, Zurich, Switzerland; Department of Computer Science, ETH Zurich, Zurich, Switzerland", "aff_domain": "inf.ethz.ch; ; ", "email": "inf.ethz.ch; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/bian19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Zurich", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Optimal Kronecker-Sum Approximation of Real Time Recurrent Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3880", "id": "3880", "author_site": "Frederik Benzing, Marcelo Matheus Gauy, Asier Mujika, Anders Martinsson, Angelika Steger", "author": "Frederik Benzing; Marcelo Matheus Gauy; Asier Mujika; Anders Martinsson; Angelika Steger", "abstract": "One of the central goals of Recurrent Neural Networks (RNNs) is to learn long-term dependencies in sequential data. Nevertheless, the most popular training method, Truncated Backpropagation through Time (TBPTT), categorically forbids learning dependencies beyond the truncation horizon. In contrast, the online training algorithm Real Time Recurrent Learning (RTRL) provides untruncated gradients, with the disadvantage of impractically large computational costs. 
Recently published approaches reduce these costs by providing noisy approximations of RTRL. We present a new approximation algorithm of RTRL, Optimal Kronecker-Sum Approximation (OK). We prove that OK is optimal for a class of approximations of RTRL, which includes all approaches published so far. Additionally, we show that OK has empirically negligible noise: Unlike previous algorithms it matches TBPTT in a real world task (character-level Penn TreeBank) and can exploit online parameter updates to outperform TBPTT in a synthetic string memorization task. Code available at GitHub.", "bibtex": "@InProceedings{pmlr-v97-benzing19a,\n title = \t {Optimal {K}ronecker-Sum Approximation of Real Time Recurrent Learning},\n author = {Benzing, Frederik and Gauy, Marcelo Matheus and Mujika, Asier and Martinsson, Anders and Steger, Angelika},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {604--613},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/benzing19a/benzing19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/benzing19a.html},\n abstract = \t {One of the central goals of Recurrent Neural Networks (RNNs) is to learn long-term dependencies in sequential data. Nevertheless, the most popular training method, Truncated Backpropagation through Time (TBPTT), categorically forbids learning dependencies beyond the truncation horizon. In contrast, the online training algorithm Real Time Recurrent Learning (RTRL) provides untruncated gradients, with the disadvantage of impractically large computational costs. Recently published approaches reduce these costs by providing noisy approximations of RTRL. We present a new approximation algorithm of RTRL, Optimal Kronecker-Sum Approximation (OK). We prove that OK is optimal for a class of approximations of RTRL, which includes all approaches published so far. Additionally, we show that OK has empirically negligible noise: Unlike previous algorithms it matches TBPTT in a real world task (character-level Penn TreeBank) and can exploit online parameter updates to outperform TBPTT in a synthetic string memorization task. 
Code available at GitHub.}\n}", "pdf": "http://proceedings.mlr.press/v97/benzing19a/benzing19a.pdf", "supp": "", "pdf_size": 2258244, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6902147836625554260&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, ETH Zurich, Zurich, Switzerland; Department of Computer Science, ETH Zurich, Zurich, Switzerland; Department of Computer Science, ETH Zurich, Zurich, Switzerland; Department of Computer Science, ETH Zurich, Zurich, Switzerland; Department of Computer Science, ETH Zurich, Zurich, Switzerland", "aff_domain": "inf.ethz.ch;inf.ethz.ch; ; ; ", "email": "inf.ethz.ch;inf.ethz.ch; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/benzing19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Zurich", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Optimal Mini-Batch and Step Sizes for SAGA", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4142", "id": "4142", "author_site": "Nidham Gazagnadou, Robert Gower, Joseph Salmon", "author": "Nidham Gazagnadou; Robert Gower; Joseph Salmon", "abstract": "Recently it has been shown that the step sizes of a family of variance reduced gradient methods called the JacSketch methods depend on the expected smoothness constant. In particular, if this expected smoothness constant could be calculated a priori, then one could safely set much larger step sizes which would result in a much faster convergence rate. We fill in this gap, and provide simple closed form expressions for the expected smoothness constant and careful numerical experiments verifying these bounds. Using these bounds, and since the SAGA algorithm is part of this JacSketch family, we suggest a new standard practice for setting the step and mini-batch sizes for SAGA that are competitive with a numerical grid search. Furthermore, we can now show that the total complexity of the SAGA algorithm decreases linearly in the mini-batch size up to a pre-defined value: the optimal mini-batch size. This is a rare result in the stochastic variance reduced literature, only previously shown for the Katyusha algorithm. Finally we conjecture that this is the case for many other stochastic variance reduced methods and that our bounds and analysis of the expected smoothness constant is key to extending these results.", "bibtex": "@InProceedings{pmlr-v97-gazagnadou19a,\n title = \t {Optimal Mini-Batch and Step Sizes for {SAGA}},\n author = {Gazagnadou, Nidham and Gower, Robert and Salmon, Joseph},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2142--2150},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gazagnadou19a/gazagnadou19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gazagnadou19a.html},\n abstract = \t {Recently it has been shown that the step sizes of a family of variance reduced gradient methods called the JacSketch methods depend on the expected smoothness constant. 
In particular, if this expected smoothness constant could be calculated a priori, then one could safely set much larger step sizes which would result in a much faster convergence rate. We fill in this gap, and provide simple closed form expressions for the expected smoothness constant and careful numerical experiments verifying these bounds. Using these bounds, and since the SAGA algorithm is part of this JacSketch family, we suggest a new standard practice for setting the step and mini-batch sizes for SAGA that are competitive with a numerical grid search. Furthermore, we can now show that the total complexity of the SAGA algorithm decreases linearly in the mini-batch size up to a pre-defined value: the optimal mini-batch size. This is a rare result in the stochastic variance reduced literature, only previously shown for the Katyusha algorithm. Finally we conjecture that this is the case for many other stochastic variance reduced methods and that our bounds and analysis of the expected smoothness constant is key to extending these results.}\n}", "pdf": "http://proceedings.mlr.press/v97/gazagnadou19a/gazagnadou19a.pdf", "supp": "", "pdf_size": 3015110, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14147185624190732996&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "LTCI, T\u00e9l\u00e9com ParisTech, Universit\u00e9 Paris-Saclay, Paris, France; LTCI, T\u00e9l\u00e9com ParisTech, Universit\u00e9 Paris-Saclay, Paris, France; IMAG, Univ Montpellier, CNRS, Montpellier, France", "aff_domain": "telecom-paristech.fr;telecom-paristech.fr; ", "email": "telecom-paristech.fr;telecom-paristech.fr; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/gazagnadou19a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "T\u00e9l\u00e9com ParisTech;University of Montpellier", "aff_unique_dep": "LTCI;IMAG", "aff_unique_url": "https://www.telecom-paris.fr;https://www.univ-montp1.fr", "aff_unique_abbr": "T\u00e9l\u00e9com ParisTech;Univ Montpellier", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Paris;Montpellier", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Optimal Minimal Margin Maximization with Boosting", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3731", "id": "3731", "author_site": "Alexander Mathiasen, Kasper Green Larsen, Allan Gr\u00f8nlund", "author": "Alexander Mathiasen; Kasper Green Larsen; Allan Gr\u00f8nlund", "abstract": "Boosting algorithms iteratively produce linear combinations of more and more base hypotheses and it has been observed experimentally that the generalization error keeps improving even after achieving zero training error. One popular explanation attributes this to improvements in margins. A common goal in a long line of research, is to obtain large margins using as few base hypotheses as possible, culminating with the AdaBoostV algorithm by R{\u00e4}tsch and Warmuth [JMLR\u201905]. The AdaBoostV algorithm was later conjectured to yield an optimal trade-off between number of hypotheses trained and the minimal margin over all training points (Nie, Warmuth, Vishwanathan and Zhang [JMLR\u201913]). Our main contribution is a new algorithm refuting this conjecture. 
Furthermore, we prove a lower bound which implies that our new algorithm is optimal.", "bibtex": "@InProceedings{pmlr-v97-mathiasen19a,\n title = \t {Optimal Minimal Margin Maximization with Boosting},\n author = {Mathiasen, Alexander and Larsen, Kasper Green and Gr{\\o}nlund, Allan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4392--4401},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mathiasen19a/mathiasen19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mathiasen19a.html},\n abstract = \t {Boosting algorithms iteratively produce linear combinations of more and more base hypotheses and it has been observed experimentally that the generalization error keeps improving even after achieving zero training error. One popular explanation attributes this to improvements in margins. A common goal in a long line of research, is to obtain large margins using as few base hypotheses as possible, culminating with the AdaBoostV algorithm by R{\u00e4}tsch and Warmuth [JMLR\u201905]. The AdaBoostV algorithm was later conjectured to yield an optimal trade-off between number of hypotheses trained and the minimal margin over all training points (Nie, Warmuth, Vishwanathan and Zhang [JMLR\u201913]). Our main contribution is a new algorithm refuting this conjecture. Furthermore, we prove a lower bound which implies that our new algorithm is optimal.}\n}", "pdf": "http://proceedings.mlr.press/v97/mathiasen19a/mathiasen19a.pdf", "supp": "", "pdf_size": 1446479, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12797614124545169672&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, University of Aarhus, Denmark; Department of Computer Science, University of Aarhus, Denmark; Department of Computer Science, University of Aarhus, Denmark", "aff_domain": "cs.au.dk; ; ", "email": "cs.au.dk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mathiasen19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Aarhus", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.au.dk", "aff_unique_abbr": "AU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Denmark" }, { "title": "Optimal Transport for structured data with application on graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4110", "id": "4110", "author_site": "Titouan Vayer, Nicolas Courty, Romain Tavenard, Chapel Laetitia, Remi Flamary", "author": "Vayer Titouan; Nicolas Courty; Romain Tavenard; Chapel Laetitia; R\u00e9mi Flamary", "abstract": "This work considers the problem of computing distances between structured objects such as undirected graphs, seen as probability distributions in a specific metric space. We consider a new transportation distance ( i.e. that minimizes a total cost of transporting probability masses) that unveils the geometric nature of the structured objects space. 
Unlike Wasserstein or Gromov-Wasserstein metrics that focus solely and respectively on features (by considering a metric in the feature space) or structure (by seeing structure as a metric space), our new distance exploits jointly both information, and is consequently called Fused Gromov-Wasserstein (FGW). After discussing its properties and computational aspects, we show results on a graph classification task, where our method outperforms both graph kernels and deep graph convolutional networks. Exploiting further on the metric properties of FGW, interesting geometric objects such as Fr{\u00e9}chet means or barycenters of graphs are illustrated and discussed in a clustering context.", "bibtex": "@InProceedings{pmlr-v97-titouan19a,\n title = \t {Optimal Transport for structured data with application on graphs},\n author = {Titouan, Vayer and Courty, Nicolas and Tavenard, Romain and Laetitia, Chapel and Flamary, R{\\'e}mi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6275--6284},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/titouan19a/titouan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/titouan19a.html},\n abstract = \t {This work considers the problem of computing distances between structured objects such as undirected graphs, seen as probability distributions in a specific metric space. We consider a new transportation distance ( i.e. that minimizes a total cost of transporting probability masses) that unveils the geometric nature of the structured objects space. Unlike Wasserstein or Gromov-Wasserstein metrics that focus solely and respectively on features (by considering a metric in the feature space) or structure (by seeing structure as a metric space), our new distance exploits jointly both information, and is consequently called Fused Gromov-Wasserstein (FGW). After discussing its properties and computational aspects, we show results on a graph classification task, where our method outperforms both graph kernels and deep graph convolutional networks. Exploiting further on the metric properties of FGW, interesting geometric objects such as Fr{\u00e9}chet means or barycenters of graphs are illustrated and discussed in a clustering context.}\n}", "pdf": "http://proceedings.mlr.press/v97/titouan19a/titouan19a.pdf", "supp": "", "pdf_size": 960087, "gs_citation": 207, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10320623113700017334&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Univ. Bretagne-Sud, CNRS, IRISA, F-56000 Vannes; Univ. Bretagne-Sud, CNRS, IRISA, F-56000 Vannes; Univ. C\u00f4te d\u2019Azur, CNRS, OCA Lagrange, F-06000 Nice; Univ. Rennes, CNRS, LETG, F-35000 Rennes; Univ. Bretagne-Sud, CNRS, IRISA, F-56000 Vannes", "aff_domain": "irisa.fr; ; ; ; ", "email": "irisa.fr; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/titouan19a.html", "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "University of Bretagne-Sud;University C\u00f4te d'Azur;University of Rennes", "aff_unique_dep": ";;LETG", "aff_unique_url": "https://www.univ-ubs.fr;https://www.univ-cotedazur.fr;https://www.univ-rennes1.fr", "aff_unique_abbr": "UBS;Univ. C\u00f4te d\u2019Azur;Univ. 
Rennes", "aff_campus_unique_index": "1", "aff_campus_unique": ";Rennes", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "France" }, { "title": "Optimality Implies Kernel Sum Classifiers are Statistically Efficient", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4299", "id": "4299", "author_site": "Raphael Meyer, Jean Honorio", "author": "Raphael Meyer; Jean Honorio", "abstract": "We propose a novel combination of optimization tools with learning theory bounds in order to analyze the sample complexity of optimal kernel sum classifiers. This contrasts the typical learning theoretic results which hold for all (potentially suboptimal) classifiers. Our work also justifies assumptions made in prior work on multiple kernel learning. As a byproduct of our analysis, we also provide a new form of Rademacher complexity for hypothesis classes containing only optimal classifiers.", "bibtex": "@InProceedings{pmlr-v97-meyer19a,\n title = \t {Optimality Implies Kernel Sum Classifiers are Statistically Efficient},\n author = {Meyer, Raphael and Honorio, Jean},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4566--4574},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/meyer19a/meyer19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/meyer19a.html},\n abstract = \t {We propose a novel combination of optimization tools with learning theory bounds in order to analyze the sample complexity of optimal kernel sum classifiers. This contrasts the typical learning theoretic results which hold for all (potentially suboptimal) classifiers. Our work also justifies assumptions made in prior work on multiple kernel learning. As a byproduct of our analysis, we also provide a new form of Rademacher complexity for hypothesis classes containing only optimal classifiers.}\n}", "pdf": "http://proceedings.mlr.press/v97/meyer19a/meyer19a.pdf", "supp": "", "pdf_size": 394733, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17041784516940890650&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff": "Department of Computer Science, Purdue University, Indiana, USA; Department of Computer Science, Purdue University, Indiana, USA", "aff_domain": "purdue.edu;purdue.edu", "email": "purdue.edu;purdue.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/meyer19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Indiana", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Optimistic Policy Optimization via Multiple Importance Sampling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4049", "id": "4049", "author_site": "Matteo Papini, Alberto Maria Metelli, Lorenzo Lupo, Marcello Restelli", "author": "Matteo Papini; Alberto Maria Metelli; Lorenzo Lupo; Marcello Restelli", "abstract": "Policy Search (PS) is an effective approach to Reinforcement Learning (RL) for solving control tasks with continuous state-action spaces. 
In this paper, we address the exploration-exploitation trade-off in PS by proposing an approach based on Optimism in the Face of Uncertainty. We cast the PS problem as a suitable Multi Armed Bandit (MAB) problem, defined over the policy parameter space, and we propose a class of algorithms that effectively exploit the problem structure, by leveraging Multiple Importance Sampling to perform an off-policy estimation of the expected return. We show that the regret of the proposed approach is bounded by $\\widetilde{\\mathcal{O}}(\\sqrt{T})$ for both discrete and continuous parameter spaces. Finally, we evaluate our algorithms on tasks of varying difficulty, comparing them with existing MAB and RL algorithms.", "bibtex": "@InProceedings{pmlr-v97-papini19a,\n title = \t {Optimistic Policy Optimization via Multiple Importance Sampling},\n author = {Papini, Matteo and Metelli, Alberto Maria and Lupo, Lorenzo and Restelli, Marcello},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4989--4999},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/papini19a/papini19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/papini19a.html},\n abstract = \t {Policy Search (PS) is an effective approach to Reinforcement Learning (RL) for solving control tasks with continuous state-action spaces. In this paper, we address the exploration-exploitation trade-off in PS by proposing an approach based on Optimism in the Face of Uncertainty. We cast the PS problem as a suitable Multi Armed Bandit (MAB) problem, defined over the policy parameter space, and we propose a class of algorithms that effectively exploit the problem structure, by leveraging Multiple Importance Sampling to perform an off-policy estimation of the expected return. We show that the regret of the proposed approach is bounded by $\\widetilde{\\mathcal{O}}(\\sqrt{T})$ for both discrete and continuous parameter spaces. 
Finally, we evaluate our algorithms on tasks of varying difficulty, comparing them with existing MAB and RL algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/papini19a/papini19a.pdf", "supp": "", "pdf_size": 529023, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13764948269040899080&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Politecnico di Milano; Politecnico di Milano; Politecnico di Milano; Politecnico di Milano", "aff_domain": "polimi.it; ; ; ", "email": "polimi.it; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/papini19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Politecnico di Milano", "aff_unique_dep": "", "aff_unique_url": "https://www.polimi.it", "aff_unique_abbr": "Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Italy" }, { "title": "Orthogonal Random Forest for Causal Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4078", "id": "4078", "author_site": "Miruna Oprescu, Vasilis Syrgkanis, Steven Wu", "author": "Miruna Oprescu; Vasilis Syrgkanis; Zhiwei Steven Wu", "abstract": "We propose the orthogonal random forest, an algorithm that combines Neyman-orthogonality to reduce sensitivity with respect to estimation error of nuisance parameters with generalized random forests (Athey et al., 2017)\u2014a flexible non-parametric method for statistical estimation of conditional moment models using random forests. We provide a consistency rate and establish asymptotic normality for our estimator. We show that under mild assumptions on the consistency rate of the nuisance estimator, we can achieve the same error rate as an oracle with a priori knowledge of these nuisance parameters. We show that when the nuisance functions have a locally sparse parametrization, then a local ell_1-penalized regression achieves the required rate. We apply our method to estimate heterogeneous treatment effects from observational data with discrete treatments or continuous treatments, and we show that, unlike prior work, our method provably allows to control for a high-dimensional set of variables under standard sparsity conditions. We also provide a comprehensive empirical evaluation of our algorithm on both synthetic and real data.", "bibtex": "@InProceedings{pmlr-v97-oprescu19a,\n title = \t {Orthogonal Random Forest for Causal Inference},\n author = {Oprescu, Miruna and Syrgkanis, Vasilis and Wu, Zhiwei Steven},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4932--4941},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/oprescu19a/oprescu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/oprescu19a.html},\n abstract = \t {We propose the orthogonal random forest, an algorithm that combines Neyman-orthogonality to reduce sensitivity with respect to estimation error of nuisance parameters with generalized random forests (Athey et al., 2017)\u2014a flexible non-parametric method for statistical estimation of conditional moment models using random forests. We provide a consistency rate and establish asymptotic normality for our estimator. 
We show that under mild assumptions on the consistency rate of the nuisance estimator, we can achieve the same error rate as an oracle with a priori knowledge of these nuisance parameters. We show that when the nuisance functions have a locally sparse parametrization, then a local ell_1-penalized regression achieves the required rate. We apply our method to estimate heterogeneous treatment effects from observational data with discrete treatments or continuous treatments, and we show that, unlike prior work, our method provably allows to control for a high-dimensional set of variables under standard sparsity conditions. We also provide a comprehensive empirical evaluation of our algorithm on both synthetic and real data.}\n}", "pdf": "http://proceedings.mlr.press/v97/oprescu19a/oprescu19a.pdf", "supp": "", "pdf_size": 1053407, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1871181716543524277&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Microsoft Research\u2013New England; Microsoft Research\u2013New England; University of Minnesota\u2013Twin Cities", "aff_domain": "microsoft.com;microsoft.com;umn.edu", "email": "microsoft.com;microsoft.com;umn.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/oprescu19a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Microsoft;University of Minnesota", "aff_unique_dep": "Microsoft Research;", "aff_unique_url": "https://www.microsoft.com/en-us/research/group/microsoft-research-new-england;https://www.minnesota.edu", "aff_unique_abbr": "MSR NE;UMN", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "New England;Twin Cities", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Overcoming Mean-Field Approximations in Recurrent Gaussian Process Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4134", "id": "4134", "author_site": "Alessandro Davide Ialongo, Mark van der Wilk, James Hensman, Carl E Rasmussen", "author": "Alessandro Davide Ialongo; Mark Van Der Wilk; James Hensman; Carl Edward Rasmussen", "abstract": "We identify a new variational inference scheme for dynamical systems whose transition function is modelled by a Gaussian process. Inference in this setting has either employed computationally intensive MCMC methods, or relied on factorisations of the variational posterior. As we demonstrate in our experiments, the factorisation between latent system states and transition function can lead to a miscalibrated posterior and to learning unnecessarily large noise terms. We eliminate this factorisation by explicitly modelling the dependence between state trajectories and the low-rank representation of our Gaussian process posterior. Samples of the latent states can then be tractably generated by conditioning on this representation. 
The method we obtain gives better predictive performance and more calibrated estimates of the transition function, yet maintains the same time and space complexities as mean-field methods.", "bibtex": "@InProceedings{pmlr-v97-ialongo19a,\n title = \t {Overcoming Mean-Field Approximations in Recurrent {G}aussian Process Models},\n author = {Ialongo, Alessandro Davide and Van Der Wilk, Mark and Hensman, James and Rasmussen, Carl Edward},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2931--2940},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ialongo19a/ialongo19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ialongo19a.html},\n abstract = \t {We identify a new variational inference scheme for dynamical systems whose transition function is modelled by a Gaussian process. Inference in this setting has either employed computationally intensive MCMC methods, or relied on factorisations of the variational posterior. As we demonstrate in our experiments, the factorisation between latent system states and transition function can lead to a miscalibrated posterior and to learning unnecessarily large noise terms. We eliminate this factorisation by explicitly modelling the dependence between state trajectories and the low-rank representation of our Gaussian process posterior. Samples of the latent states can then be tractably generated by conditioning on this representation. The method we obtain gives better predictive performance and more calibrated estimates of the transition function, yet maintains the same time and space complexities as mean-field methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/ialongo19a/ialongo19a.pdf", "supp": "", "pdf_size": 3612815, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13109450737746036374&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Computational and Biological Learning Group, University of Cambridge + Max Planck Institute for Intelligent Systems, T\u00fcbingen; PROWLER.io; PROWLER.io; Computational and Biological Learning Group, University of Cambridge", "aff_domain": "cam.ac.uk; ; ; ", "email": "cam.ac.uk; ; ; ", "github": "github.com/ialong/GPt", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/ialongo19a.html", "aff_unique_index": "0+1;2;2;0", "aff_unique_norm": "University of Cambridge;Max Planck Institute for Intelligent Systems;PROWLER.io", "aff_unique_dep": "Computational and Biological Learning Group;;", "aff_unique_url": "https://www.cam.ac.uk;https://www.mpi-is.mpg.de;https://prowler.io", "aff_unique_abbr": "Cambridge;MPI-IS;PROWLER.io", "aff_campus_unique_index": "0+1;0", "aff_campus_unique": "Cambridge;T\u00fcbingen;", "aff_country_unique_index": "0+1;0;0;0", "aff_country_unique": "United Kingdom;Germany" }, { "title": "Overcoming Multi-model Forgetting", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3710", "id": "3710", "author_site": "Yassine Benyahia, Kaicheng Yu, Kamil Bennani-Smires, Martin Jaggi, Anthony C. Davison, Mathieu Salzmann, Claudiu Musat", "author": "Yassine Benyahia; Kaicheng Yu; Kamil Bennani Smires; Martin Jaggi; Anthony C. 
Davison; Mathieu Salzmann; Claudiu Musat", "abstract": "We identify a phenomenon, which we refer to as multi-model forgetting, that occurs when sequentially training multiple deep networks with partially-shared parameters; the performance of previously-trained models degrades as one optimizes a subsequent one, due to the overwriting of shared parameters. To overcome this, we introduce a statistically-justified weight plasticity loss that regularizes the learning of a model\u2019s shared parameters according to their importance for the previous models, and demonstrate its effectiveness when training two models sequentially and for neural architecture search. Adding weight plasticity in neural architecture search preserves the best models to the end of the search and yields improved results in both natural language processing and computer vision tasks.", "bibtex": "@InProceedings{pmlr-v97-benyahia19a,\n title = \t {Overcoming Multi-model Forgetting},\n author = {Benyahia, Yassine and Yu, Kaicheng and Smires, Kamil Bennani and Jaggi, Martin and Davison, Anthony C. and Salzmann, Mathieu and Musat, Claudiu},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {594--603},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/benyahia19a/benyahia19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/benyahia19a.html},\n abstract = \t {We identify a phenomenon, which we refer to as multi-model forgetting, that occurs when sequentially training multiple deep networks with partially-shared parameters; the performance of previously-trained models degrades as one optimizes a subsequent one, due to the overwriting of shared parameters. To overcome this, we introduce a statistically-justified weight plasticity loss that regularizes the learning of a model\u2019s shared parameters according to their importance for the previous models, and demonstrate its effectiveness when training two models sequentially and for neural architecture search. 
Adding weight plasticity in neural architecture search preserves the best models to the end of the search and yields improved results in both natural language processing and computer vision tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/benyahia19a/benyahia19a.pdf", "supp": "", "pdf_size": 1415864, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16148647356046938402&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Institute of Mathematics, EPFL; Computer Vision Lab, EPFL; Artificial Intelligence Lab, Swisscom; Machine Learning and Optimization lab, EPFL; Institute of Mathematics, EPFL; Computer Vision Lab, EPFL; Artificial Intelligence Lab, Swisscom", "aff_domain": "gmail.com;epfl.ch; ; ; ; ; ", "email": "gmail.com;epfl.ch; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/benyahia19a.html", "aff_unique_index": "0;0;1;0;0;0;1", "aff_unique_norm": "EPFL;Swisscom", "aff_unique_dep": "Institute of Mathematics;Artificial Intelligence Lab", "aff_unique_url": "https://www.epfl.ch;https://www.swisscom.ch", "aff_unique_abbr": "EPFL;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "Switzerland" }, { "title": "Overparameterized Nonlinear Learning: Gradient Descent Takes the Shortest Path?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3700", "id": "3700", "author_site": "Samet Oymak, Mahdi Soltanolkotabi", "author": "Samet Oymak; Mahdi Soltanolkotabi", "abstract": "Many modern learning tasks involve fitting nonlinear models which are trained in an overparameterized regime where the parameters of the model exceed the size of the training dataset. Due to this overparameterization, the training loss may have infinitely many global minima and it is critical to understand the properties of the solutions found by first-order optimization schemes such as (stochastic) gradient descent starting from different initializations. In this paper we demonstrate that when the loss has certain properties over a minimally small neighborhood of the initial point, first order methods such as (stochastic) gradient descent have a few intriguing properties: (1) the iterates converge at a geometric rate to a global optima even when the loss is nonconvex, (2) among all global optima of the loss the iterates converge to one with a near minimal distance to the initial point, (3) the iterates take a near direct route from the initial point to this global optimum. As part of our proof technique, we introduce a new potential function which captures the tradeoff between the loss function and the distance to the initial point as the iterations progress. 
The utility of our general theory is demonstrated for a variety of problem domains spanning low-rank matrix recovery to shallow neural network training.", "bibtex": "@InProceedings{pmlr-v97-oymak19a,\n title = \t {Overparameterized Nonlinear Learning: Gradient Descent Takes the Shortest Path?},\n author = {Oymak, Samet and Soltanolkotabi, Mahdi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4951--4960},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/oymak19a/oymak19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/oymak19a.html},\n abstract = \t {Many modern learning tasks involve fitting nonlinear models which are trained in an overparameterized regime where the parameters of the model exceed the size of the training dataset. Due to this overparameterization, the training loss may have infinitely many global minima and it is critical to understand the properties of the solutions found by first-order optimization schemes such as (stochastic) gradient descent starting from different initializations. In this paper we demonstrate that when the loss has certain properties over a minimally small neighborhood of the initial point, first order methods such as (stochastic) gradient descent have a few intriguing properties: (1) the iterates converge at a geometric rate to a global optima even when the loss is nonconvex, (2) among all global optima of the loss the iterates converge to one with a near minimal distance to the initial point, (3) the iterates take a near direct route from the initial point to this global optimum. As part of our proof technique, we introduce a new potential function which captures the tradeoff between the loss function and the distance to the initial point as the iterations progress. 
The utility of our general theory is demonstrated for a variety of problem domains spanning low-rank matrix recovery to shallow neural network training.}\n}", "pdf": "http://proceedings.mlr.press/v97/oymak19a/oymak19a.pdf", "supp": "", "pdf_size": 1201778, "gs_citation": 223, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8638916663075330014&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, University of California, Riverside; Department of Electrical and Computer Engineering, University of Southern California", "aff_domain": "gmail.com;usc.edu", "email": "gmail.com;usc.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/oymak19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of California, Riverside;University of Southern California", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.ucr.edu;https://www.usc.edu", "aff_unique_abbr": "UCR;USC", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Riverside;Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "PA-GD: On the Convergence of Perturbed Alternating Gradient Descent to Second-Order Stationary Points for Structured Nonconvex Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4138", "id": "4138", "author_site": "Songtao Lu, Mingyi Hong, Zhengdao Wang", "author": "Songtao Lu; Mingyi Hong; Zhengdao Wang", "abstract": "Alternating gradient descent (A-GD) is a simple but popular algorithm in machine learning, which updates two blocks of variables in an alternating manner using gradient descent steps. In this paper, we consider a smooth unconstrained nonconvex optimization problem, and propose a perturbed A-GD (PA-GD) which is able to converge (with high probability) to the second-order stationary points (SOSPs) with a global sublinear rate. Existing analysis on A-GD type algorithm either only guarantees convergence to first-order solutions, or converges to second-order solutions asymptotically (without rates). To the best of our knowledge, this is the first alternating type algorithm that takes $\\mathcal{O}(\\text{polylog}(d)/\\epsilon^2)$ iterations to achieve an ($\\epsilon,\\sqrt{\\epsilon}$)-SOSP with high probability, where polylog$(d)$ denotes the polynomial of the logarithm with respect to problem dimension $d$.", "bibtex": "@InProceedings{pmlr-v97-lu19a,\n title = \t {{PA}-{GD}: On the Convergence of Perturbed Alternating Gradient Descent to Second-Order Stationary Points for Structured Nonconvex Optimization},\n author = {Lu, Songtao and Hong, Mingyi and Wang, Zhengdao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4134--4143},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lu19a/lu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lu19a.html},\n abstract = \t {Alternating gradient descent (A-GD) is a simple but popular algorithm in machine learning, which updates two blocks of variables in an alternating manner using gradient descent steps. 
In this paper, we consider a smooth unconstrained nonconvex optimization problem, and propose a perturbed A-GD (PA-GD) which is able to converge (with high probability) to the second-order stationary points (SOSPs) with a global sublinear rate. Existing analysis on A-GD type algorithm either only guarantees convergence to first-order solutions, or converges to second-order solutions asymptotically (without rates). To the best of our knowledge, this is the first alternating type algorithm that takes $\\mathcal{O}(\\text{polylog}(d)/\\epsilon^2)$ iterations to achieve an ($\\epsilon,\\sqrt{\\epsilon}$)-SOSP with high probability, where polylog$(d)$ denotes the polynomial of the logarithm with respect to problem dimension $d$.}\n}", "pdf": "http://proceedings.mlr.press/v97/lu19a/lu19a.pdf", "supp": "", "pdf_size": 316362, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=853505026062168321&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical and Computer Engineering, University of Minnesota Twin Cities, Minneapolis, MN, USA; Department of Electrical and Computer Engineering, University of Minnesota Twin Cities, Minneapolis, MN, USA; Department of Electrical and Computer Engineering, Iowa State University, Ames, IA, USA", "aff_domain": "umn.edu; ; ", "email": "umn.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/lu19a.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Minnesota Twin Cities;Iowa State University", "aff_unique_dep": "Department of Electrical and Computer Engineering;Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.umn.edu;https://www.iastate.edu", "aff_unique_abbr": "UMN;ISU", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Minneapolis;Ames", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "PAC Identification of Many Good Arms in Stochastic Multi-Armed Bandits", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3895", "id": "3895", "author_site": "Arghya Roy Chaudhuri, Shivaram Kalyanakrishnan", "author": "Arghya Roy Chaudhuri; Shivaram Kalyanakrishnan", "abstract": "We consider the problem of identifying any k out of the best m arms in an n-armed stochastic multi-armed bandit; framed in the PAC setting, this particular problem generalises both the problem of \u201cbest subset selection\u201d (Kalyanakrishnan & Stone, 2010) and that of selecting \u201cone out of the best m\u201d arms (Roy Chaudhuri & Kalyanakrishnan, 2017). We present a lower bound on the worst-case sample complexity for general k, and a fully sequential PAC algorithm, LUCB-k-m, which is more sample-efficient on easy instances. Also, extending our analysis to infinite-armed bandits, we present a PAC algorithm that is independent of n, which identifies an arm from the best $\\rho$ fraction of arms using at most an additive poly-log number of samples than compared to the lower bound, thereby improving over Roy Chaudhuri & Kalyanakrishnan (2017) and Aziz et al. (2018). The problem of identifying k > 1 distinct arms from the best $\\rho$ fraction is not always well-defined; for a special class of this problem, we present lower and upper bounds. Finally, through a reduction, we establish a relation between upper bounds for the \u201cone out of the best $\\rho$\u201d problem for infinite instances and the \u201cone out of the best m\u201d problem for finite instances. 
We conjecture that it is more efficient to solve \u201csmall\u201d finite instances using the latter formulation, rather than going through the former.", "bibtex": "@InProceedings{pmlr-v97-chaudhuri19a,\n title = \t {{PAC} Identification of Many Good Arms in Stochastic Multi-Armed Bandits},\n author = {Chaudhuri, Arghya Roy and Kalyanakrishnan, Shivaram},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {991--1000},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chaudhuri19a/chaudhuri19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chaudhuri19a.html},\n abstract = \t {We consider the problem of identifying any k out of the best m arms in an n-armed stochastic multi-armed bandit; framed in the PAC setting, this particular problem generalises both the problem of \u201cbest subset selection\u201d (Kalyanakrishnan & Stone, 2010) and that of selecting \u201cone out of the best m\u201d arms (Roy Chaudhuri & Kalyanakrishnan, 2017). We present a lower bound on the worst-case sample complexity for general k, and a fully sequential PAC algorithm, LUCB-k-m, which is more sample-efficient on easy instances. Also, extending our analysis to infinite-armed bandits, we present a PAC algorithm that is independent of n, which identifies an arm from the best $\\rho$ fraction of arms using at most an additive poly-log number of samples than compared to the lower bound, thereby improving over Roy Chaudhuri & Kalyanakrishnan (2017) and Aziz et al. (2018). The problem of identifying k > 1 distinct arms from the best $\\rho$ fraction is not always well-defined; for a special class of this problem, we present lower and upper bounds. Finally, through a reduction, we establish a relation between upper bounds for the \u201cone out of the best $\\rho$\u201d problem for infinite instances and the \u201cone out of the best m\u201d problem for finite instances. 
We conjecture that it is more efficient to solve \u201csmall\u201d finite instances using the latter formulation, rather than going through the former.}\n}", "pdf": "http://proceedings.mlr.press/v97/chaudhuri19a/chaudhuri19a.pdf", "supp": "", "pdf_size": 509190, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11168824335978863125&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science and Engineering, Indian Institute of Technology Bombay, Mumbai 400076, India; Department of Computer Science and Engineering, Indian Institute of Technology Bombay, Mumbai 400076, India", "aff_domain": "cse.iitb.ac.in;cse.iitb.ac.in", "email": "cse.iitb.ac.in;cse.iitb.ac.in", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/chaudhuri19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Indian Institute of Technology Bombay", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.iitb.ac.in", "aff_unique_abbr": "IIT Bombay", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mumbai", "aff_country_unique_index": "0;0", "aff_country_unique": "India" }, { "title": "PAC Learnability of Node Functions in Networked Dynamical Systems", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4122", "id": "4122", "author_site": "Abhijin Adiga, Chris J Kuhlman, Madhav Marathe, S. S. Ravi, Anil Vullikanti", "author": "Abhijin Adiga; Chris J Kuhlman; Madhav Marathe; S Ravi; Anil Vullikanti", "abstract": "We consider the PAC learnability of the local functions at the vertices of a discrete networked dynamical system, assuming that the underlying network is known. Our focus is on the learnability of threshold functions. We show that several variants of threshold functions are PAC learnable and provide tight bounds on the sample complexity. In general, when the input consists of positive and negative examples, we show that the concept class of threshold functions is not efficiently PAC learnable, unless NP = RP. Using a dynamic programming approach, we show efficient PAC learnability when the number of negative examples is small. We also present an efficient learner which is consistent with all the positive examples and at least (1-1/e) fraction of the negative examples. This algorithm is based on maximizing a submodular function under matroid constraints. By performing experiments on both synthetic and real-world networks, we study how the network structure and sample complexity influence the quality of the inferred system.", "bibtex": "@InProceedings{pmlr-v97-adiga19a,\n title = \t {{PAC} Learnability of Node Functions in Networked Dynamical Systems},\n author = {Adiga, Abhijin and Kuhlman, Chris J and Marathe, Madhav and Ravi, S and Vullikanti, Anil},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {82--91},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/adiga19a/adiga19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/adiga19a.html},\n abstract = \t {We consider the PAC learnability of the local functions at the vertices of a discrete networked dynamical system, assuming that the underlying network is known. Our focus is on the learnability of threshold functions. 
We show that several variants of threshold functions are PAC learnable and provide tight bounds on the sample complexity. In general, when the input consists of positive and negative examples, we show that the concept class of threshold functions is not efficiently PAC learnable, unless NP = RP. Using a dynamic programming approach, we show efficient PAC learnability when the number of negative examples is small. We also present an efficient learner which is consistent with all the positive examples and at least (1-1/e) fraction of the negative examples. This algorithm is based on maximizing a submodular function under matroid constraints. By performing experiments on both synthetic and real-world networks, we study how the network structure and sample complexity influence the quality of the inferred system.}\n}", "pdf": "http://proceedings.mlr.press/v97/adiga19a/adiga19a.pdf", "supp": "", "pdf_size": 500097, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11755671402921550130&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Biocomplexity Institute and Initiative, University of Virginia, Charlottesville, VA, USA + Department of Computer Science, University of Virginia, Charlottesville, VA, USA; Biocomplexity Institute and Initiative, University of Virginia, Charlottesville, VA, USA + Department of Computer Science, University of Virginia, Charlottesville, VA, USA; Biocomplexity Institute and Initiative, University of Virginia, Charlottesville, VA, USA + Department of Computer Science, University of Virginia, Charlottesville, VA, USA; Biocomplexity Institute and Initiative, University of Virginia, Charlottesville, VA, USA + Department of Computer Science, University at Albany \u2013 SUNY, Albany, NY, USA; Biocomplexity Institute and Initiative, University of Virginia, Charlottesville, VA, USA + Department of Computer Science, University of Virginia, Charlottesville, VA, USA", "aff_domain": "virginia.edu; ; ; ; ", "email": "virginia.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/adiga19a.html", "aff_unique_index": "0+0;0+0;0+0;0+1;0+0", "aff_unique_norm": "University of Virginia;University at Albany \u2013 SUNY", "aff_unique_dep": "Biocomplexity Institute and Initiative;Department of Computer Science", "aff_unique_url": "https://www.virginia.edu;https://www.albany.edu", "aff_unique_abbr": "UVA;UAlbany", "aff_campus_unique_index": "0+0;0+0;0+0;0+1;0+0", "aff_campus_unique": "Charlottesville;Albany", "aff_country_unique_index": "0+0;0+0;0+0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "POLITEX: Regret Bounds for Policy Iteration using Expert Prediction", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3753", "id": "3753", "author_site": "Yasin Abbasi-Yadkori, Peter Bartlett, Kush Bhatia, Nevena Lazic, Csaba Szepesvari, Gell\u00e9rt Weisz", "author": "Yasin Abbasi-Yadkori; Peter Bartlett; Kush Bhatia; Nevena Lazic; Csaba Szepesvari; Gellert Weisz", "abstract": "We present POLITEX (POLicy ITeration with EXpert advice), a variant of policy iteration where each policy is a Boltzmann distribution over the sum of action-value function estimates of the previous policies, and analyze its regret in continuing RL problems. 
We assume that the value function error after running a policy for $\\tau$ time steps scales as $\\epsilon(\\tau) = \\epsilon_0 + O(\\sqrt{d/\\tau})$, where $\\epsilon_0$ is the worst-case approximation error and $d$ is the number of features in a compressed representation of the state-action space. We establish that this condition is satisfied by the LSPE algorithm under certain assumptions on the MDP and policies. Under the error assumption, we show that the regret of POLITEX in uniformly mixing MDPs scales as $O(d^{1/2}T^{3/4} + \\epsilon_0T)$, where $O(\\cdot)$ hides logarithmic terms and problem-dependent constants. Thus, we provide the first regret bound for a fully practical model-free method which only scales in the number of features, and not in the size of the underlying MDP. Experiments on a queuing problem confirm that POLITEX is competitive with some of its alternatives, while preliminary results on Ms Pacman (one of the standard Atari benchmark problems) confirm the viability of POLITEX beyond linear function approximation.", "bibtex": "@InProceedings{pmlr-v97-lazic19a,\n title = \t {{POLITEX}: Regret Bounds for Policy Iteration using Expert Prediction},\n author = {Abbasi-Yadkori, Yasin and Bartlett, Peter and Bhatia, Kush and Lazic, Nevena and Szepesvari, Csaba and Weisz, Gellert},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3692--3702},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lazic19a/lazic19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lazic19a.html},\n abstract = \t {We present POLITEX (POLicy ITeration with EXpert advice), a variant of policy iteration where each policy is a Boltzmann distribution over the sum of action-value function estimates of the previous policies, and analyze its regret in continuing RL problems. We assume that the value function error after running a policy for $\\tau$ time steps scales as $\\epsilon(\\tau) = \\epsilon_0 + O(\\sqrt{d/\\tau})$, where $\\epsilon_0$ is the worst-case approximation error and $d$ is the number of features in a compressed representation of the state-action space. We establish that this condition is satisfied by the LSPE algorithm under certain assumptions on the MDP and policies. Under the error assumption, we show that the regret of POLITEX in uniformly mixing MDPs scales as $O(d^{1/2}T^{3/4} + \\epsilon_0T)$, where $O(\\cdot)$ hides logarithmic terms and problem-dependent constants. Thus, we provide the first regret bound for a fully practical model-free method which only scales in the number of features, and not in the size of the underlying MDP. 
Experiments on a queuing problem confirm that POLITEX is competitive with some of its alternatives, while preliminary results on Ms Pacman (one of the standard Atari benchmark problems) confirm the viability of POLITEX beyond linear function approximation.}\n}", "pdf": "http://proceedings.mlr.press/v97/lazic19a/lazic19a.pdf", "supp": "", "pdf_size": 965205, "gs_citation": 169, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9418615901426019243&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/lazic19a.html" }, { "title": "POPQORN: Quantifying Robustness of Recurrent Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3701", "id": "3701", "author_site": "CHING-YUN KO, Zhaoyang Lyu, Tsui-Wei Weng, Luca Daniel, Ngai Wong, Dahua Lin", "author": "Ching-Yun Ko; Zhaoyang Lyu; Lily Weng; Luca Daniel; Ngai Wong; Dahua Lin", "abstract": "The vulnerability to adversarial attacks has been a critical issue for deep neural networks. Addressing this issue requires a reliable way to evaluate the robustness of a network. Recently, several methods have been developed to compute robustness quantification for neural networks, namely, certified lower bounds of the minimum adversarial perturbation. Such methods, however, were devised for feed-forward networks, e.g. multi-layer perceptron or convolutional networks. It remains an open problem to quantify robustness for recurrent networks, especially LSTM and GRU. For such networks, there exist additional challenges in computing the robustness quantification, such as handling the inputs at multiple steps and the interaction between gates and states. In this work, we propose POPQORN (Propagated-output Quantified Robustness for RNNs), a general algorithm to quantify robustness of RNNs, including vanilla RNNs, LSTMs, and GRUs. We demonstrate its effectiveness on different network architectures and show that the robustness quantification on individual steps can lead to new insights.", "bibtex": "@InProceedings{pmlr-v97-ko19a,\n title = \t {{POPQORN}: Quantifying Robustness of Recurrent Neural Networks},\n author = {Ko, Ching-Yun and Lyu, Zhaoyang and Weng, Lily and Daniel, Luca and Wong, Ngai and Lin, Dahua},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3468--3477},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ko19a/ko19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ko19a.html},\n abstract = \t {The vulnerability to adversarial attacks has been a critical issue for deep neural networks. Addressing this issue requires a reliable way to evaluate the robustness of a network. Recently, several methods have been developed to compute robustness quantification for neural networks, namely, certified lower bounds of the minimum adversarial perturbation. Such methods, however, were devised for feed-forward networks, e.g. multi-layer perceptron or convolutional networks. It remains an open problem to quantify robustness for recurrent networks, especially LSTM and GRU. 
For such networks, there exist additional challenges in computing the robustness quantification, such as handling the inputs at multiple steps and the interaction between gates and states. In this work, we propose POPQORN (Propagated-output Quantified Robustness for RNNs), a general algorithm to quantify robustness of RNNs, including vanilla RNNs, LSTMs, and GRUs. We demonstrate its effectiveness on different network architectures and show that the robustness quantification on individual steps can lead to new insights.}\n}", "pdf": "http://proceedings.mlr.press/v97/ko19a/ko19a.pdf", "supp": "", "pdf_size": 2481781, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2942353004594500868&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "The University of Hong Kong, Hong Kong; The Chinese University of Hong Kong, Hong Kong; Massachusetts Institute of Technology, Cambridge, MA, USA; Massachusetts Institute of Technology, Cambridge, MA, USA; The University of Hong Kong, Hong Kong; The Chinese University of Hong Kong, Hong Kong", "aff_domain": "eee.hku.hk;link.cuhk.edu.hk; ; ; ; ", "email": "eee.hku.hk;link.cuhk.edu.hk; ; ; ; ", "github": "https://github.com/ZhaoyangLyu/POPQORN", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/ko19a.html", "aff_unique_index": "0;1;2;2;0;1", "aff_unique_norm": "University of Hong Kong;Chinese University of Hong Kong;Massachusetts Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.hku.hk;https://www.cuhk.edu.hk;https://www.mit.edu", "aff_unique_abbr": "HKU;CUHK;MIT", "aff_campus_unique_index": "0;0;1;1;0;0", "aff_campus_unique": "Hong Kong SAR;Cambridge", "aff_country_unique_index": "0;0;1;1;0;0", "aff_country_unique": "China;United States" }, { "title": "PROVEN: Verifying Robustness of Neural Networks with a Probabilistic Approach", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4021", "id": "4021", "author_site": "Tsui-Wei Weng, Pin-Yu Chen, Lam Nguyen, Mark Squillante, Akhilan Boopathy, Ivan Oseledets, Luca Daniel", "author": "Lily Weng; Pin-Yu Chen; Lam Nguyen; Mark Squillante; Akhilan Boopathy; Ivan Oseledets; Luca Daniel", "abstract": "We propose a novel framework PROVEN to \\textbf{PRO}babilistically \\textbf{VE}rify \\textbf{N}eural network\u2019s robustness with statistical guarantees. PROVEN provides probability certificates of neural network robustness when the input perturbation follow distributional characterization. Notably, PROVEN is derived from current state-of-the-art worst-case neural network robustness verification frameworks, and therefore it can provide probability certificates with little computational overhead on top of existing methods such as Fast-Lin, CROWN and CNN-Cert. 
Experiments on small and large MNIST and CIFAR neural network models demonstrate our probabilistic approach can tighten up robustness certificate to around $1.8 \\times$ and $3.5 \\times$ with at least a $99.99%$ confidence compared with the worst-case robustness certificate by CROWN and CNN-Cert.", "bibtex": "@InProceedings{pmlr-v97-weng19a,\n title = \t {{PROVEN}: Verifying Robustness of Neural Networks with a Probabilistic Approach},\n author = {Weng, Lily and Chen, Pin-Yu and Nguyen, Lam and Squillante, Mark and Boopathy, Akhilan and Oseledets, Ivan and Daniel, Luca},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6727--6736},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/weng19a/weng19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/weng19a.html},\n abstract = \t {We propose a novel framework PROVEN to \\textbf{PRO}babilistically \\textbf{VE}rify \\textbf{N}eural network\u2019s robustness with statistical guarantees. PROVEN provides probability certificates of neural network robustness when the input perturbation follow distributional characterization. Notably, PROVEN is derived from current state-of-the-art worst-case neural network robustness verification frameworks, and therefore it can provide probability certificates with little computational overhead on top of existing methods such as Fast-Lin, CROWN and CNN-Cert. Experiments on small and large MNIST and CIFAR neural network models demonstrate our probabilistic approach can tighten up robustness certificate to around $1.8 \\times$ and $3.5 \\times$ with at least a $99.99%$ confidence compared with the worst-case robustness certificate by CROWN and CNN-Cert.}\n}", "pdf": "http://proceedings.mlr.press/v97/weng19a/weng19a.pdf", "supp": "", "pdf_size": 162928, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7123009242178955403&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "MIT EECS; IBM Research Yorktown Heights; IBM Research Yorktown Heights; IBM Research Yorktown Heights; MIT EECS; Skoltech; MIT EECS", "aff_domain": "mit.edu; ; ; ; ; ; ", "email": "mit.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/weng19a.html", "aff_unique_index": "0;1;1;1;0;2;0", "aff_unique_norm": "Massachusetts Institute of Technology;IBM;Skolkovo Institute of Science and Technology", "aff_unique_dep": "Electrical Engineering & Computer Science;IBM Research;", "aff_unique_url": "https://web.mit.edu;https://www.ibm.com/research;https://www.skoltech.ru", "aff_unique_abbr": "MIT;IBM;Skoltech", "aff_campus_unique_index": "0;1;1;1;0;0", "aff_campus_unique": "Cambridge;Yorktown Heights;", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "United States;Russian Federation" }, { "title": "Parameter efficient training of deep convolutional neural networks by dynamic sparse reparameterization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3692", "id": "3692", "author_site": "Hesham Mostafa, Xin Wang", "author": "Hesham Mostafa; Xin Wang", "abstract": "Modern deep neural networks are typically highly overparameterized. Pruning techniques are able to remove a significant fraction of network parameters with little loss in accuracy. 
Recently, techniques based on dynamic reallocation of non-zero parameters have emerged, allowing direct training of sparse networks without having to pre-train a large dense model. Here we present a novel dynamic sparse reparameterization method that addresses the limitations of previous techniques such as high computational cost and the need for manual configuration of the number of free parameters allocated to each layer. We evaluate the performance of dynamic reallocation methods in training deep convolutional networks and show that our method outperforms previous static and dynamic reparameterization methods, yielding the best accuracy for a fixed parameter budget, on par with accuracies obtained by iteratively pruning a pre-trained dense model. We further investigated the mechanisms underlying the superior generalization performance of the resultant sparse networks. We found that neither the structure, nor the initialization of the non-zero parameters were sufficient to explain the superior performance. Rather, effective learning crucially depended on the continuous exploration of the sparse network structure space during training. Our work suggests that exploring structural degrees of freedom during training is more effective than adding extra parameters to the network.", "bibtex": "@InProceedings{pmlr-v97-mostafa19a,\n title = \t {Parameter efficient training of deep convolutional neural networks by dynamic sparse reparameterization},\n author = {Mostafa, Hesham and Wang, Xin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4646--4655},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mostafa19a/mostafa19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mostafa19a.html},\n abstract = \t {Modern deep neural networks are typically highly overparameterized. Pruning techniques are able to remove a significant fraction of network parameters with little loss in accuracy. Recently, techniques based on dynamic reallocation of non-zero parameters have emerged, allowing direct training of sparse networks without having to pre-train a large dense model. Here we present a novel dynamic sparse reparameterization method that addresses the limitations of previous techniques such as high computational cost and the need for manual configuration of the number of free parameters allocated to each layer. We evaluate the performance of dynamic reallocation methods in training deep convolutional networks and show that our method outperforms previous static and dynamic reparameterization methods, yielding the best accuracy for a fixed parameter budget, on par with accuracies obtained by iteratively pruning a pre-trained dense model. We further investigated the mechanisms underlying the superior generalization performance of the resultant sparse networks. We found that neither the structure, nor the initialization of the non-zero parameters were sufficient to explain the superior performance. Rather, effective learning crucially depended on the continuous exploration of the sparse network structure space during training. 
Our work suggests that exploring structural degrees of freedom during training is more effective than adding extra parameters to the network.}\n}", "pdf": "http://proceedings.mlr.press/v97/mostafa19a/mostafa19a.pdf", "supp": "", "pdf_size": 562422, "gs_citation": 416, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3342252922777294975&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Arti\ufb01cial Intelligence Products Group, Intel Corporation, San Diego, CA, USA; Arti\ufb01cial Intelligence Products Group, Intel Corporation, San Diego, CA, USA + Cerebras Systems, Los Altos, CA, USA", "aff_domain": ";cerebras.net", "email": ";cerebras.net", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/mostafa19a.html", "aff_unique_index": "0;0+1", "aff_unique_norm": "Intel;Cerebras Systems", "aff_unique_dep": "Arti\ufb01cial Intelligence Products Group;", "aff_unique_url": "https://www.intel.com;https://www.cerebras.com", "aff_unique_abbr": "Intel;", "aff_campus_unique_index": "0;0+1", "aff_campus_unique": "San Diego;Los Altos", "aff_country_unique_index": "0;0+0", "aff_country_unique": "United States" }, { "title": "Parameter-Efficient Transfer Learning for NLP", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4119", "id": "4119", "author_site": "Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin de Laroussilhe, Andrea Gesmundo, Mona Attariyan, Sylvain Gelly", "author": "Neil Houlsby; Andrei Giurgiu; Stanislaw Jastrzebski; Bruna Morrone; Quentin De Laroussilhe; Andrea Gesmundo; Mona Attariyan; Sylvain Gelly", "abstract": "Fine-tuning large pretrained models is an effective transfer mechanism in NLP. However, in the presence of many downstream tasks, fine-tuning is parameter inefficient: an entire new model is required for every task. As an alternative, we propose transfer with adapter modules. Adapter modules yield a compact and extensible model; they add only a few trainable parameters per task, and new tasks can be added without revisiting previous ones. The parameters of the original network remain fixed, yielding a high degree of parameter sharing. To demonstrate adapter\u2019s effectiveness, we transfer the recently proposed BERT Transformer model to $26$ diverse text classification tasks, including the GLUE benchmark. Adapters attain near state-of-the-art performance, whilst adding only a few parameters per task. On GLUE, we attain within $0.8%$ of the performance of full fine-tuning, adding only $3.6%$ parameters per task. By contrast, fine-tuning trains $100%$ of the parameters per task.", "bibtex": "@InProceedings{pmlr-v97-houlsby19a,\n title = \t {Parameter-Efficient Transfer Learning for {NLP}},\n author = {Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2790--2799},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/houlsby19a/houlsby19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/houlsby19a.html},\n abstract = \t {Fine-tuning large pretrained models is an effective transfer mechanism in NLP. 
However, in the presence of many downstream tasks, fine-tuning is parameter inefficient: an entire new model is required for every task. As an alternative, we propose transfer with adapter modules. Adapter modules yield a compact and extensible model; they add only a few trainable parameters per task, and new tasks can be added without revisiting previous ones. The parameters of the original network remain fixed, yielding a high degree of parameter sharing. To demonstrate adapter\u2019s effectiveness, we transfer the recently proposed BERT Transformer model to $26$ diverse text classification tasks, including the GLUE benchmark. Adapters attain near state-of-the-art performance, whilst adding only a few parameters per task. On GLUE, we attain within $0.8%$ of the performance of full fine-tuning, adding only $3.6%$ parameters per task. By contrast, fine-tuning trains $100%$ of the parameters per task.}\n}", "pdf": "http://proceedings.mlr.press/v97/houlsby19a/houlsby19a.pdf", "supp": "", "pdf_size": 724447, "gs_citation": 5271, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18111543891993452201&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Google Research; Google Research; Jagiellonian University; Google Research; Google Research; Google Research; Google Research; Google Research", "aff_domain": "google.com; ; ; ; ; ; ; ", "email": "google.com; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/houlsby19a.html", "aff_unique_index": "0;0;1;0;0;0;0;0", "aff_unique_norm": "Google;Jagiellonian University", "aff_unique_dep": "Google Research;", "aff_unique_url": "https://research.google;https://www.uj.edu.pl", "aff_unique_abbr": "Google Research;UJ", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;1;0;0;0;0;0", "aff_country_unique": "United States;Poland" }, { "title": "Pareto Optimal Streaming Unsupervised Classification", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3912", "id": "3912", "author_site": "Soumya Basu, Steven Gutstein, Brent Lance, Sanjay Shakkottai", "author": "Soumya Basu; Steven Gutstein; Brent Lance; Sanjay Shakkottai", "abstract": "We study an online and streaming unsupervised classification system. Our setting consists of a collection of classifiers (with unknown confusion matrices) each of which can classify one sample per unit time, and which are accessed by a stream of unlabeled samples. Each sample is dispatched to one or more classifiers, and depending on the labels collected from these classifiers, may be sent to other classifiers to collect additional labels. The labels are continually aggregated. Once the aggregated label has high enough accuracy (a pre-specified threshold for accuracy) or the sample is sent to all the classifiers, the now labeled sample is ejected from the system. For any given pre-specified threshold for accuracy, the objective is to sustain the maximum possible rate of arrival of new samples, such that the number of samples in memory does not grow unbounded. In this paper, we characterize the Pareto-optimal region of accuracy and arrival rate, and develop an algorithm that can operate at any point within this region. Our algorithm uses queueing-based routing and scheduling approaches combined with novel online tensor decomposition method to learn the hidden parameters, to Pareto-optimality guarantees. 
We finally verify our theoretical results through simulations on two ensembles formed using AlexNet, VGG, and ResNet deep image classifiers.", "bibtex": "@InProceedings{pmlr-v97-basu19a,\n title = \t {Pareto Optimal Streaming Unsupervised Classification},\n author = {Basu, Soumya and Gutstein, Steven and Lance, Brent and Shakkottai, Sanjay},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {505--514},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/basu19a/basu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/basu19a.html},\n abstract = \t {We study an online and streaming unsupervised classification system. Our setting consists of a collection of classifiers (with unknown confusion matrices) each of which can classify one sample per unit time, and which are accessed by a stream of unlabeled samples. Each sample is dispatched to one or more classifiers, and depending on the labels collected from these classifiers, may be sent to other classifiers to collect additional labels. The labels are continually aggregated. Once the aggregated label has high enough accuracy (a pre-specified threshold for accuracy) or the sample is sent to all the classifiers, the now labeled sample is ejected from the system. For any given pre-specified threshold for accuracy, the objective is to sustain the maximum possible rate of arrival of new samples, such that the number of samples in memory does not grow unbounded. In this paper, we characterize the Pareto-optimal region of accuracy and arrival rate, and develop an algorithm that can operate at any point within this region. Our algorithm uses queueing-based routing and scheduling approaches combined with novel online tensor decomposition method to learn the hidden parameters, to Pareto-optimality guarantees. 
We finally verify our theoretical results through simulations on two ensembles formed using AlexNet, VGG, and ResNet deep image classifiers.}\n}", "pdf": "http://proceedings.mlr.press/v97/basu19a/basu19a.pdf", "supp": "", "pdf_size": 901225, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10042881174227560795&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "The University of Texas at Austin, USA; Army Research Lab, USA; Army Research Lab, USA; The University of Texas at Austin, USA", "aff_domain": "utexas.edu; ; ; ", "email": "utexas.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/basu19a.html", "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of Texas at Austin;Army Research Lab", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.arl.army.mil", "aff_unique_abbr": "UT Austin;ARL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Parsimonious Black-Box Adversarial Attacks via Efficient Combinatorial Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4042", "id": "4042", "author_site": "Seungyong Moon, Gaon An, Hyun Oh Song", "author": "Seungyong Moon; Gaon An; Hyun Oh Song", "abstract": "Solving for adversarial examples with projected gradient descent has been demonstrated to be highly effective in fooling the neural network based classifiers. However, in the black-box setting, the attacker is limited only to the query access to the network and solving for a successful adversarial example becomes much more difficult. To this end, recent methods aim at estimating the true gradient signal based on the input queries but at the cost of excessive queries. We propose an efficient discrete surrogate to the optimization problem which does not require estimating the gradient and consequently becomes free of the first order update hyperparameters to tune. Our experiments on Cifar-10 and ImageNet show the state of the art black-box attack performance with significant reduction in the required queries compared to a number of recently proposed methods. The source code is available at https://github.com/snu-mllab/parsimonious-blackbox-attack.", "bibtex": "@InProceedings{pmlr-v97-moon19a,\n title = \t {Parsimonious Black-Box Adversarial Attacks via Efficient Combinatorial Optimization},\n author = {Moon, Seungyong and An, Gaon and Song, Hyun Oh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4636--4645},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/moon19a/moon19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/moon19a.html},\n abstract = \t {Solving for adversarial examples with projected gradient descent has been demonstrated to be highly effective in fooling the neural network based classifiers. However, in the black-box setting, the attacker is limited only to the query access to the network and solving for a successful adversarial example becomes much more difficult. To this end, recent methods aim at estimating the true gradient signal based on the input queries but at the cost of excessive queries. 
We propose an efficient discrete surrogate to the optimization problem which does not require estimating the gradient and consequently becomes free of the first order update hyperparameters to tune. Our experiments on Cifar-10 and ImageNet show the state of the art black-box attack performance with significant reduction in the required queries compared to a number of recently proposed methods. The source code is available at https://github.com/snu-mllab/parsimonious-blackbox-attack.}\n}", "pdf": "http://proceedings.mlr.press/v97/moon19a/moon19a.pdf", "supp": "", "pdf_size": 1999520, "gs_citation": 173, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16009538798728740698&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science and Engineering, Seoul National University, Seoul, Korea+Neural Processing Research Center; Department of Computer Science and Engineering, Seoul National University, Seoul, Korea+Neural Processing Research Center; Department of Computer Science and Engineering, Seoul National University, Seoul, Korea+Neural Processing Research Center", "aff_domain": "snu.ac.kr; ;snu.ac.kr", "email": "snu.ac.kr; ;snu.ac.kr", "github": "https://github.com/snu-mllab/parsimonious-blackbox-attack", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/moon19a.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "Seoul National University;Neural Processing Research Center", "aff_unique_dep": "Department of Computer Science and Engineering;", "aff_unique_url": "https://www.snu.ac.kr;", "aff_unique_abbr": "SNU;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea;" }, { "title": "Partially Exchangeable Networks and Architectures for Learning Summary Statistics in Approximate Bayesian Computation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4054", "id": "4054", "author_site": "Samuel Wiqvist, Pierre-Alexandre Mattei, Umberto Picchini, Jes Frellsen", "author": "Samuel Wiqvist; Pierre-Alexandre Mattei; Umberto Picchini; Jes Frellsen", "abstract": "We present a novel family of deep neural architectures, named partially exchangeable networks (PENs) that leverage probabilistic symmetries. By design, PENs are invariant to block-switch transformations, which characterize the partial exchangeability properties of conditionally Markovian processes. Moreover, we show that any block-switch invariant function has a PEN-like representation. The DeepSets architecture is a special case of PEN and we can therefore also target fully exchangeable data. We employ PENs to learn summary statistics in approximate Bayesian computation (ABC). When comparing PENs to previous deep learning methods for learning summary statistics, our results are highly competitive, both considering time series and static models. 
Indeed, PENs provide more reliable posterior samples even when using less training data.", "bibtex": "@InProceedings{pmlr-v97-wiqvist19a,\n title = \t {Partially Exchangeable Networks and Architectures for Learning Summary Statistics in Approximate {B}ayesian Computation},\n author = {Wiqvist, Samuel and Mattei, Pierre-Alexandre and Picchini, Umberto and Frellsen, Jes},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6798--6807},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wiqvist19a/wiqvist19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/wiqvist19a.html},\n abstract = \t {We present a novel family of deep neural architectures, named partially exchangeable networks (PENs) that leverage probabilistic symmetries. By design, PENs are invariant to block-switch transformations, which characterize the partial exchangeability properties of conditionally Markovian processes. Moreover, we show that any block-switch invariant function has a PEN-like representation. The DeepSets architecture is a special case of PEN and we can therefore also target fully exchangeable data. We employ PENs to learn summary statistics in approximate Bayesian computation (ABC). When comparing PENs to previous deep learning methods for learning summary statistics, our results are highly competitive, both considering time series and static models. Indeed, PENs provide more reliable posterior samples even when using less training data.}\n}", "pdf": "http://proceedings.mlr.press/v97/wiqvist19a/wiqvist19a.pdf", "supp": "", "pdf_size": 903510, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16942332521272083058&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/wiqvist19a.html" }, { "title": "Partially Linear Additive Gaussian Graphical Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4083", "id": "4083", "author_site": "Sinong Geng, Minhao Yan, Mladen Kolar, Sanmi Koyejo", "author": "Sinong Geng; Minhao Yan; Mladen Kolar; Sanmi Koyejo", "abstract": "We propose a partially linear additive Gaussian graphical model (PLA-GGM) for the estimation of associations between random variables distorted by observed confounders. Model parameters are estimated using an $L_1$-regularized maximal pseudo-profile likelihood estimator (MaPPLE) for which we prove a $\\sqrt{n}$-sparsistency. Importantly, our approach avoids parametric constraints on the effects of confounders on the estimated graphical model structure. 
Empirically, the PLA-GGM is applied to both synthetic and real-world datasets, demonstrating superior performance compared to competing methods.", "bibtex": "@InProceedings{pmlr-v97-geng19a,\n title = \t {Partially Linear Additive {G}aussian Graphical Models},\n author = {Geng, Sinong and Yan, Minhao and Kolar, Mladen and Koyejo, Sanmi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2180--2190},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/geng19a/geng19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/geng19a.html},\n abstract = \t {We propose a partially linear additive Gaussian graphical model (PLA-GGM) for the estimation of associations between random variables distorted by observed confounders. Model parameters are estimated using an $L_1$-regularized maximal pseudo-profile likelihood estimator (MaPPLE) for which we prove a $\\sqrt{n}$-sparsistency. Importantly, our approach avoids parametric constraints on the effects of confounders on the estimated graphical model structure. Empirically, the PLA-GGM is applied to both synthetic and real-world datasets, demonstrating superior performance compared to competing methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/geng19a/geng19a.pdf", "supp": "", "pdf_size": 690070, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14521311528279518115&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Princeton University; Charles H. Dyson School of Applied Economics and Management; Booth School of Business, University of Chicago; Department of Computer Science, University of Illinois at Urbana-Champaign", "aff_domain": "princeton.edu; ; ; ", "email": "princeton.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/geng19a.html", "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Princeton University;Charles H. Dyson School of Applied Economics and Management;University of Chicago;University of Illinois Urbana-Champaign", "aff_unique_dep": "Department of Computer Science;Applied Economics and Management;Booth School of Business;Department of Computer Science", "aff_unique_url": "https://www.princeton.edu;https://dyson.cornell.edu;https://www.chicagobooth.edu;https://illinois.edu", "aff_unique_abbr": "Princeton;Dyson School;Chicago Booth;UIUC", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Chicago;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Particle Flow Bayes\u2019 Rule", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4188", "id": "4188", "author_site": "Xinshi Chen, Hanjun Dai, Le Song", "author": "Xinshi Chen; Hanjun Dai; Le Song", "abstract": "We present a particle flow realization of Bayes\u2019 rule, where an ODE-based neural operator is used to transport particles from a prior to its posterior after a new observation. We prove that such an ODE operator exists. Its neural parameterization can be trained in a meta-learning framework, allowing this operator to reason about the effect of an individual observation on the posterior, and thus generalize across different priors, observations and to sequential Bayesian inference. 
We demonstrated the generalization ability of our particle flow Bayes operator in several canonical and high dimensional examples.", "bibtex": "@InProceedings{pmlr-v97-chen19c,\n title = \t {Particle Flow {B}ayes\u2019 Rule},\n author = {Chen, Xinshi and Dai, Hanjun and Song, Le},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1022--1031},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19c/chen19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19c.html},\n abstract = \t {We present a particle flow realization of Bayes\u2019 rule, where an ODE-based neural operator is used to transport particles from a prior to its posterior after a new observation. We prove that such an ODE operator exists. Its neural parameterization can be trained in a meta-learning framework, allowing this operator to reason about the effect of an individual observation on the posterior, and thus generalize across different priors, observations and to sequential Bayesian inference. We demonstrated the generalization ability of our particle flow Bayes operator in several canonical and high dimensional examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19c/chen19c.pdf", "supp": "", "pdf_size": 5173755, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2500247682738008260&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "School of Mathematics, Georgia Institute of Technology; School of Computational Science and Engineering, Georgia Institute of Technology; Ant Financial, Hangzhou, China", "aff_domain": "gatech.edu; ; ", "email": "gatech.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/chen19c.html", "aff_unique_index": "0;0;1", "aff_unique_norm": "Georgia Institute of Technology;Ant Financial", "aff_unique_dep": "School of Mathematics;", "aff_unique_url": "https://www.gatech.edu;https://www.antgroup.com", "aff_unique_abbr": "Georgia Tech;Ant Financial", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Atlanta;Hangzhou", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;China" }, { "title": "Passed & Spurious: Descent Algorithms and Local Minima in Spiked Matrix-Tensor Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3706", "id": "3706", "author_site": "Stefano Sarao Mannelli, Florent Krzakala, Pierfrancesco Urbani, Lenka Zdeborova", "author": "Stefano Sarao Mannelli; Florent Krzakala; Pierfrancesco Urbani; Lenka Zdeborova", "abstract": "In this work we analyse quantitatively the interplay between the loss landscape and performance of descent algorithms in a prototypical inference problem, the spiked matrix-tensor model. We study a loss function that is the negative log-likelihood of the model. We analyse the number of local minima at a fixed distance from the signal/spike with the Kac-Rice formula, and locate trivialization of the landscape at large signal-to-noise ratios. We evaluate analytically the performance of a gradient flow algorithm using integro-differential PDEs as developed in physics of disordered systems for the Langevin dynamics. 
We analyze the performance of an approximate message passing algorithm estimating the maximum likelihood configuration via its state evolution. We conclude by comparing the above results: while we observe a drastic slow down of the gradient flow dynamics even in the region where the landscape is trivial, both the analyzed algorithms are shown to perform well even in the part of the region of parameters where spurious local minima are present.", "bibtex": "@InProceedings{pmlr-v97-mannelli19a,\n title = \t {Passed & Spurious: Descent Algorithms and Local Minima in Spiked Matrix-Tensor Models},\n author = {Mannelli, Stefano Sarao and Krzakala, Florent and Urbani, Pierfrancesco and Zdeborova, Lenka},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4333--4342},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mannelli19a/mannelli19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mannelli19a.html},\n abstract = \t {In this work we analyse quantitatively the interplay between the loss landscape and performance of descent algorithms in a prototypical inference problem, the spiked matrix-tensor model. We study a loss function that is the negative log-likelihood of the model. We analyse the number of local minima at a fixed distance from the signal/spike with the Kac-Rice formula, and locate trivialization of the landscape at large signal-to-noise ratios. We evaluate analytically the performance of a gradient flow algorithm using integro-differential PDEs as developed in physics of disordered systems for the Langevin dynamics. We analyze the performance of an approximate message passing algorithm estimating the maximum likelihood configuration via its state evolution. 
We conclude by comparing the above results: while we observe a drastic slow down of the gradient flow dynamics even in the region where the landscape is trivial, both the analyzed algorithms are shown to perform well even in the part of the region of parameters where spurious local minima are present.}\n}", "pdf": "http://proceedings.mlr.press/v97/mannelli19a/mannelli19a.pdf", "supp": "", "pdf_size": 466180, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9234072497573113919&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Institut de physique th \u00b4eorique, Universit \u00b4e Paris Saclay, CNRS, CEA, 91191 Gif-sur-Yvette, France; Laboratoire de Physique de l\u2019\u00b4Ecole Normale Sup \u00b4erieure, CNRS & Universit \u00b4e Pierre & Marie Curie & PSL Universit \u00b4e, 75005 Paris, France; Institut de physique th \u00b4eorique, Universit \u00b4e Paris Saclay, CNRS, CEA, 91191 Gif-sur-Yvette, France; Institut de physique th \u00b4eorique, Universit \u00b4e Paris Saclay, CNRS, CEA, 91191 Gif-sur-Yvette, France", "aff_domain": "ipht.fr; ; ;ipht.fr", "email": "ipht.fr; ; ;ipht.fr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/mannelli19a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Universit\u00e9 Paris Saclay;\u00c9cole Normale Sup\u00e9rieure", "aff_unique_dep": "Institut de physique th\u00e9orique;Laboratoire de Physique", "aff_unique_url": "https://www.universite-paris-saclay.fr;https://www.ens.fr", "aff_unique_abbr": ";ENS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "title": "Per-Decision Option Discounting", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3865", "id": "3865", "author_site": "Anna Harutyunyan, Peter Vrancx, Philippe Hamel, Ann Nowe, Doina Precup", "author": "Anna Harutyunyan; Peter Vrancx; Philippe Hamel; Ann Nowe; Doina Precup", "abstract": "In order to solve complex problems an agent must be able to reason over a sufficiently long horizon. Temporal abstraction, commonly modeled through options, offers the ability to reason at many timescales, but the horizon length is still determined by the discount factor of the underlying Markov Decision Process. We propose a modification to the options framework that naturally scales the agent\u2019s horizon with option length. We show that the proposed option-step discount controls a bias-variance trade-off, with larger discounts (counter-intuitively) leading to less estimation variance.", "bibtex": "@InProceedings{pmlr-v97-harutyunyan19a,\n title = \t {Per-Decision Option Discounting},\n author = {Harutyunyan, Anna and Vrancx, Peter and Hamel, Philippe and Nowe, Ann and Precup, Doina},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2644--2652},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/harutyunyan19a/harutyunyan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/harutyunyan19a.html},\n abstract = \t {In order to solve complex problems an agent must be able to reason over a sufficiently long horizon. 
Temporal abstraction, commonly modeled through options, offers the ability to reason at many timescales, but the horizon length is still determined by the discount factor of the underlying Markov Decision Process. We propose a modification to the options framework that naturally scales the agent\u2019s horizon with option length. We show that the proposed option-step discount controls a bias-variance trade-off, with larger discounts (counter-intuitively) leading to less estimation variance.}\n}", "pdf": "http://proceedings.mlr.press/v97/harutyunyan19a/harutyunyan19a.pdf", "supp": "", "pdf_size": 899001, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16460995769751981751&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "DeepMind, London, UK+Vrije Universiteit Brussel, Brussels, Belgium; PROWLER.io, Cambridge, UK+Vrije Universiteit Brussel, Brussels, Belgium; DeepMind, London, UK; Vrije Universiteit Brussel, Brussels, Belgium; DeepMind, London, UK", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/harutyunyan19a.html", "aff_unique_index": "0+1;2+1;0;1;0", "aff_unique_norm": "DeepMind;Vrije Universiteit Brussel;PROWLER.io", "aff_unique_dep": ";;", "aff_unique_url": "https://deepmind.com;https://www.vub.be;https://prowler.io", "aff_unique_abbr": "DeepMind;VUB;", "aff_campus_unique_index": "0+1;2+1;0;1;0", "aff_campus_unique": "London;Brussels;Cambridge", "aff_country_unique_index": "0+1;0+1;0;1;0", "aff_country_unique": "United Kingdom;Belgium" }, { "title": "Phase transition in PCA with missing data: Reduced signal-to-noise ratio, not sample size!", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4014", "id": "4014", "author_site": "Niels Ipsen, Lars Kai Hansen", "author": "Niels Ipsen; Lars Kai Hansen", "abstract": "How does missing data affect our ability to learn signal structures? It has been shown that learning signal structure in terms of principal components is dependent on the ratio of sample size and dimensionality and that a critical number of observations is needed before learning starts (Biehl and Mietzner, 1993). Here we generalize this analysis to include missing data. Probabilistic principal component analysis is regularly used for estimating signal structures in datasets with missing data. Our analytic result suggest that the effect of missing data is to effectively reduce signal-to-noise ratio rather than - as generally believed - to reduce sample size. The theory predicts a phase transition in the learning curves and this is indeed found both in simulation data and in real datasets.", "bibtex": "@InProceedings{pmlr-v97-ipsen19a,\n title = \t {Phase transition in {PCA} with missing data: Reduced signal-to-noise ratio, not sample size!},\n author = {Ipsen, Niels and Hansen, Lars Kai},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2951--2960},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ipsen19a/ipsen19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ipsen19a.html},\n abstract = \t {How does missing data affect our ability to learn signal structures? 
It has been shown that learning signal structure in terms of principal components is dependent on the ratio of sample size and dimensionality and that a critical number of observations is needed before learning starts (Biehl and Mietzner, 1993). Here we generalize this analysis to include missing data. Probabilistic principal component analysis is regularly used for estimating signal structures in datasets with missing data. Our analytic result suggest that the effect of missing data is to effectively reduce signal-to-noise ratio rather than - as generally believed - to reduce sample size. The theory predicts a phase transition in the learning curves and this is indeed found both in simulation data and in real datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/ipsen19a/ipsen19a.pdf", "supp": "", "pdf_size": 917697, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=809292088427670370&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Applied Mathematics and Computer Science, Technical University of Denmark, Denmark; Department of Applied Mathematics and Computer Science, Technical University of Denmark, Denmark", "aff_domain": "dtu.dk;dtu.dk", "email": "dtu.dk;dtu.dk", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/ipsen19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Technical University of Denmark", "aff_unique_dep": "Department of Applied Mathematics and Computer Science", "aff_unique_url": "https://www.tud.dk", "aff_unique_abbr": "DTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Denmark" }, { "title": "Phaseless PCA: Low-Rank Matrix Recovery from Column-wise Phaseless Measurements", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3894", "id": "3894", "author_site": "Seyedehsara Nayer, Praneeth Narayanamurthy, Namrata Vaswani", "author": "Seyedehsara Nayer; Praneeth Narayanamurthy; Namrata Vaswani", "abstract": "This work proposes the first set of simple, practically useful, and provable algorithms for two inter-related problems. (i) The first is low-rank matrix recovery from magnitude-only (phaseless) linear projections of each of its columns. This finds important applications in phaseless dynamic imaging, e.g., Fourier Ptychographic imaging of live biological specimens. Our guarantee shows that, in the regime of small ranks, the sample complexity required is only a little larger than the order-optimal one, and much smaller than what standard (unstructured) phase retrieval methods need. %Moreover our algorithm is fast and memory-efficient if only the minimum required number of measurements is used (ii) The second problem we study is a dynamic extension of the above: it allows the low-dimensional subspace from which each image/signal (each column of the low-rank matrix) is generated to change with time. 
We introduce a simple algorithm that is provably correct as long as the subspace changes are piecewise constant.", "bibtex": "@InProceedings{pmlr-v97-nayer19a,\n title = \t {Phaseless {PCA}: Low-Rank Matrix Recovery from Column-wise Phaseless Measurements},\n author = {Nayer, Seyedehsara and Narayanamurthy, Praneeth and Vaswani, Namrata},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4762--4770},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nayer19a/nayer19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nayer19a.html},\n abstract = \t {This work proposes the first set of simple, practically useful, and provable algorithms for two inter-related problems. (i) The first is low-rank matrix recovery from magnitude-only (phaseless) linear projections of each of its columns. This finds important applications in phaseless dynamic imaging, e.g., Fourier Ptychographic imaging of live biological specimens. Our guarantee shows that, in the regime of small ranks, the sample complexity required is only a little larger than the order-optimal one, and much smaller than what standard (unstructured) phase retrieval methods need. %Moreover our algorithm is fast and memory-efficient if only the minimum required number of measurements is used (ii) The second problem we study is a dynamic extension of the above: it allows the low-dimensional subspace from which each image/signal (each column of the low-rank matrix) is generated to change with time. We introduce a simple algorithm that is provably correct as long as the subspace changes are piecewise constant.}\n}", "pdf": "http://proceedings.mlr.press/v97/nayer19a/nayer19a.pdf", "supp": "", "pdf_size": 429761, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=184599286952414598&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Electrical and Computer Engineering, Iowa State University, USA; Department of Electrical and Computer Engineering, Iowa State University, USA; Department of Electrical and Computer Engineering, Iowa State University, USA", "aff_domain": "iastate.edu; ; ", "email": "iastate.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/nayer19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Iowa State University", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.iastate.edu", "aff_unique_abbr": "ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Plug-and-Play Methods Provably Converge with Properly Trained Denoisers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4180", "id": "4180", "author_site": "Ernest Ryu, Jialin Liu, Sicheng Wang, Xiaohan Chen, Zhangyang Wang, Wotao Yin", "author": "Ernest Ryu; Jialin Liu; Sicheng Wang; Xiaohan Chen; Zhangyang Wang; Wotao Yin", "abstract": "Plug-and-play (PnP) is a non-convex framework that integrates modern denoising priors, such as BM3D or deep learning-based denoisers, into ADMM or other proximal algorithms. An advantage of PnP is that one can use pre-trained denoisers when there is not sufficient data for end-to-end training. 
Although PnP has been recently studied extensively with great empirical success, theoretical analysis addressing even the most basic question of convergence has been insufficient. In this paper, we theoretically establish convergence of PnP-FBS and PnP-ADMM, without using diminishing stepsizes, under a certain Lipschitz condition on the denoisers. We then propose real spectral normalization, a technique for training deep learning-based denoisers to satisfy the proposed Lipschitz condition. Finally, we present experimental results validating the theory.", "bibtex": "@InProceedings{pmlr-v97-ryu19a,\n title = \t {Plug-and-Play Methods Provably Converge with Properly Trained Denoisers},\n author = {Ryu, Ernest and Liu, Jialin and Wang, Sicheng and Chen, Xiaohan and Wang, Zhangyang and Yin, Wotao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5546--5557},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ryu19a/ryu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ryu19a.html},\n abstract = \t {Plug-and-play (PnP) is a non-convex framework that integrates modern denoising priors, such as BM3D or deep learning-based denoisers, into ADMM or other proximal algorithms. An advantage of PnP is that one can use pre-trained denoisers when there is not sufficient data for end-to-end training. Although PnP has been recently studied extensively with great empirical success, theoretical analysis addressing even the most basic question of convergence has been insufficient. In this paper, we theoretically establish convergence of PnP-FBS and PnP-ADMM, without using diminishing stepsizes, under a certain Lipschitz condition on the denoisers. We then propose real spectral normalization, a technique for training deep learning-based denoisers to satisfy the proposed Lipschitz condition. 
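The PnP-ADMM scheme referenced in the "Plug-and-Play Methods Provably Converge with Properly Trained Denoisers" abstract above can be illustrated with a minimal sketch. It assumes the generic PnP-ADMM iteration (closed-form proximal step for a quadratic data-fidelity term, plug-in denoiser in place of the prior's proximal step) and uses a toy moving-average denoiser; it is not the paper's real-spectral-normalized denoiser or exact code.

```python
import numpy as np

def pnp_admm(y, denoise, sigma=1.0, alpha=0.5, iters=50):
    """Generic PnP-ADMM sketch for data fidelity f(x) = ||x - y||^2 / (2 sigma^2).
    The prior's proximal step is replaced by a plug-in denoiser (assumed generic
    form of the iteration, not the paper's implementation)."""
    x = y.copy()
    z = y.copy()
    u = np.zeros_like(y)
    for _ in range(iters):
        # prox of alpha*f at (z - u): minimize alpha*||x - y||^2/(2 sigma^2) + ||x - (z - u)||^2 / 2
        x = (alpha / sigma**2 * y + (z - u)) / (alpha / sigma**2 + 1.0)
        z = denoise(x + u)   # plug-in denoiser replaces the prior's prox
        u = u + x - z        # running-residual (dual) update
    return z

def box_denoise(v, w=5):
    """Toy moving-average denoiser standing in for BM3D or a trained CNN denoiser."""
    kernel = np.ones(w) / w
    return np.convolve(v, kernel, mode="same")

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    clean = np.sin(np.linspace(0, 4 * np.pi, 200))
    noisy = clean + 0.3 * rng.standard_normal(200)
    restored = pnp_admm(noisy, box_denoise, sigma=0.3, alpha=0.3, iters=30)
    print("noisy MSE    :", np.mean((noisy - clean) ** 2))
    print("restored MSE :", np.mean((restored - clean) ** 2))
```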
Finally, we present experimental results validating the theory.}\n}", "pdf": "http://proceedings.mlr.press/v97/ryu19a/ryu19a.pdf", "supp": "", "pdf_size": 378335, "gs_citation": 469, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11121192984446474149&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Mathematics, University of California, Los Angeles, USA; Department of Mathematics, University of California, Los Angeles, USA; Department of Computer Science and Engineering, Texas A&M University, USA; Department of Computer Science and Engineering, Texas A&M University, USA; Department of Computer Science and Engineering, Texas A&M University, USA; Department of Mathematics, University of California, Los Angeles, USA", "aff_domain": "math.ucla.edu; ; ; ; ;math.ucla.edu", "email": "math.ucla.edu; ; ; ; ;math.ucla.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/ryu19a.html", "aff_unique_index": "0;0;1;1;1;0", "aff_unique_norm": "University of California, Los Angeles;Texas A&M University", "aff_unique_dep": "Department of Mathematics;Department of Computer Science and Engineering", "aff_unique_url": "https://www.ucla.edu;https://www.tamu.edu", "aff_unique_abbr": "UCLA;TAMU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Poission Subsampled R\u00e9nyi Differential Privacy", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4317", "id": "4317", "author_site": "Yuqing Zhu, Yu-Xiang Wang", "author": "Yuqing Zhu; Yu-Xiang Wang", "abstract": "We consider the problem of privacy-amplification by under the Renyi Differential Privacy framework. This is the main technique underlying the moments accountants (Abadi et al., 2016) for differentially private deep learning. Unlike previous attempts on this problem which deals with Sampling with Replacement, we consider the Poisson subsampling scheme which selects each data point independently with a coin toss. This allows us to significantly simplify and tighten the bounds for the RDP of subsampled mechanisms and derive numerically stable approximation schemes. In particular, for subsampled Gaussian mechanism and subsampled Laplace mechanism, we prove an analytical formula of their RDP that exactly matches the lower bound. The result is the first of its kind and we numerically demonstrate an order of magnitude improvement in the privacy-utility tradeoff.", "bibtex": "@InProceedings{pmlr-v97-zhu19c,\n title = \t {Poission Subsampled R\u00e9nyi Differential Privacy},\n author = {Zhu, Yuqing and Wang, Yu-Xiang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7634--7642},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhu19c/zhu19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhu19c.html},\n abstract = \t {We consider the problem of privacy-amplification by under the Renyi Differential Privacy framework. This is the main technique underlying the moments accountants (Abadi et al., 2016) for differentially private deep learning. 
Unlike previous attempts on this problem which deals with Sampling with Replacement, we consider the Poisson subsampling scheme which selects each data point independently with a coin toss. This allows us to significantly simplify and tighten the bounds for the RDP of subsampled mechanisms and derive numerically stable approximation schemes. In particular, for subsampled Gaussian mechanism and subsampled Laplace mechanism, we prove an analytical formula of their RDP that exactly matches the lower bound. The result is the first of its kind and we numerically demonstrate an order of magnitude improvement in the privacy-utility tradeoff.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhu19c/zhu19c.pdf", "supp": "", "pdf_size": 1419462, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14963858742283673257&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "UC Santa Barbara, Department of Computer Science; UC Santa Barbara, Department of Computer Science", "aff_domain": "ucsb.edu;cs.ucsb.edu", "email": "ucsb.edu;cs.ucsb.edu", "github": "https://github.com/yuxiangw/autodp", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/zhu19c.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of California, Santa Barbara", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ucsb.edu", "aff_unique_abbr": "UCSB", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Santa Barbara", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Policy Certificates: Towards Accountable Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3749", "id": "3749", "author_site": "Christoph Dann, Lihong Li, Wei Wei, Emma Brunskill", "author": "Christoph Dann; Lihong Li; Wei Wei; Emma Brunskill", "abstract": "The performance of a reinforcement learning algorithm can vary drastically during learning because of exploration. Existing algorithms provide little information about the quality of their current policy before executing it, and thus have limited use in high-stakes applications like healthcare. We address this lack of accountability by proposing that algorithms output policy certificates. These certificates bound the sub-optimality and return of the policy in the next episode, allowing humans to intervene when the certified quality is not satisfactory. We further introduce two new algorithms with certificates and present a new framework for theoretical analysis that guarantees the quality of their policies and certificates. For tabular MDPs, we show that computing certificates can even improve the sample-efficiency of optimism-based exploration. 
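A minimal sketch of the Poisson subsampling step described in the "Poission Subsampled Rényi Differential Privacy" abstract above: each record is kept independently with one coin toss of probability q. The sampling rate, the placeholder records, and the Gaussian noise added afterwards are illustrative assumptions, not the paper's experimental setup.

```python
import numpy as np

def poisson_subsample(records, q, rng):
    """Keep each record independently with probability q (one coin toss per record).
    This is the subsampling scheme whose Renyi-DP amplification the paper analyzes."""
    keep = rng.random(len(records)) < q
    return [r for r, kept in zip(records, keep) if kept]

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    data = list(range(1000))                         # placeholder records
    batch = poisson_subsample(data, q=0.01, rng=rng)
    noisy_sum = sum(batch) + rng.normal(scale=4.0)   # e.g. a subsampled Gaussian mechanism
    print(len(batch), noisy_sum)
```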
As a result, one of our algorithms is the first to achieve minimax-optimal PAC bounds up to lower-order terms, and this algorithm also matches (and in some settings slightly improves upon) existing minimax regret bounds.", "bibtex": "@InProceedings{pmlr-v97-dann19a,\n title = \t {Policy Certificates: Towards Accountable Reinforcement Learning},\n author = {Dann, Christoph and Li, Lihong and Wei, Wei and Brunskill, Emma},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1507--1516},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/dann19a/dann19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/dann19a.html},\n abstract = \t {The performance of a reinforcement learning algorithm can vary drastically during learning because of exploration. Existing algorithms provide little information about the quality of their current policy before executing it, and thus have limited use in high-stakes applications like healthcare. We address this lack of accountability by proposing that algorithms output policy certificates. These certificates bound the sub-optimality and return of the policy in the next episode, allowing humans to intervene when the certified quality is not satisfactory. We further introduce two new algorithms with certificates and present a new framework for theoretical analysis that guarantees the quality of their policies and certificates. For tabular MDPs, we show that computing certificates can even improve the sample-efficiency of optimism-based exploration. As a result, one of our algorithms is the first to achieve minimax-optimal PAC bounds up to lower-order terms, and this algorithm also matches (and in some settings slightly improves upon) existing minimax regret bounds.}\n}", "pdf": "http://proceedings.mlr.press/v97/dann19a/dann19a.pdf", "supp": "", "pdf_size": 433679, "gs_citation": 176, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11150628151442124823&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Carnegie Mellon University; Google Research; Google Research; Stanford University", "aff_domain": "cdann.net; ; ; ", "email": "cdann.net; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/dann19a.html", "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Carnegie Mellon University;Google;Stanford University", "aff_unique_dep": ";Google Research;", "aff_unique_url": "https://www.cmu.edu;https://research.google;https://www.stanford.edu", "aff_unique_abbr": "CMU;Google Research;Stanford", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Mountain View;Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Policy Consolidation for Continual Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4148", "id": "4148", "author_site": "Christos Kaplanis, Murray Shanahan, Claudia Clopath", "author": "Christos Kaplanis; Murray Shanahan; Claudia Clopath", "abstract": "We propose a method for tackling catastrophic forgetting in deep reinforcement learning that is", "bibtex": "@InProceedings{pmlr-v97-kaplanis19a,\n title = \t {Policy Consolidation for Continual Reinforcement Learning},\n author = {Kaplanis, Christos and Shanahan, Murray and Clopath, 
Claudia},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3242--3251},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kaplanis19a/kaplanis19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kaplanis19a.html},\n abstract = \t {We propose a method for tackling catastrophic forgetting in deep reinforcement learning that is", "pdf": "http://proceedings.mlr.press/v97/kaplanis19a/kaplanis19a.pdf", "supp": "", "pdf_size": 9628474, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16186153206765299155&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computing, Imperial College London + DeepMind, London; Department of Computing, Imperial College London; Department of Bioengineering, Imperial College London + DeepMind, London", "aff_domain": "imperial.ac.uk; ; ", "email": "imperial.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/kaplanis19a.html", "aff_unique_index": "0+1;0;0+1", "aff_unique_norm": "Imperial College London;DeepMind", "aff_unique_dep": "Department of Computing;", "aff_unique_url": "https://www.imperial.ac.uk;https://deepmind.com", "aff_unique_abbr": "Imperial;DeepMind", "aff_campus_unique_index": "0+0;0;0+0", "aff_campus_unique": "London", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "United Kingdom" }, { "title": "Population Based Augmentation: Efficient Learning of Augmentation Policy Schedules", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3664", "id": "3664", "author_site": "Daniel Ho, Eric Liang, Peter Chen, Ion Stoica, Pieter Abbeel", "author": "Daniel Ho; Eric Liang; Xi Chen; Ion Stoica; Pieter Abbeel", "abstract": "A key challenge in leveraging data augmentation for neural network training is choosing an effective augmentation policy from a large search space of candidate operations. Properly chosen augmentation policies can lead to significant generalization improvements; however, state-of-the-art approaches such as AutoAugment are computationally infeasible to run for the ordinary user. In this paper, we introduce a new data augmentation algorithm, Population Based Augmentation (PBA), which generates nonstationary augmentation policy schedules instead of a fixed augmentation policy. We show that PBA can match the performance of AutoAugment on CIFAR-10, CIFAR-100, and SVHN, with three orders of magnitude less overall compute. On CIFAR-10 we achieve a mean test error of 1.46%, which is a slight improvement upon the current state-of-the-art. 
The code for PBA is open source and is available at https://github.com/arcelien/pba.", "bibtex": "@InProceedings{pmlr-v97-ho19b,\n title = \t {Population Based Augmentation: Efficient Learning of Augmentation Policy Schedules},\n author = {Ho, Daniel and Liang, Eric and Chen, Xi and Stoica, Ion and Abbeel, Pieter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2731--2741},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ho19b/ho19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/ho19b.html},\n abstract = \t {A key challenge in leveraging data augmentation for neural network training is choosing an effective augmentation policy from a large search space of candidate operations. Properly chosen augmentation policies can lead to significant generalization improvements; however, state-of-the-art approaches such as AutoAugment are computationally infeasible to run for the ordinary user. In this paper, we introduce a new data augmentation algorithm, Population Based Augmentation (PBA), which generates nonstationary augmentation policy schedules instead of a fixed augmentation policy. We show that PBA can match the performance of AutoAugment on CIFAR-10, CIFAR-100, and SVHN, with three orders of magnitude less overall compute. On CIFAR-10 we achieve a mean test error of 1.46%, which is a slight improvement upon the current state-of-the-art. The code for PBA is open source and is available at https://github.com/arcelien/pba.}\n}", "pdf": "http://proceedings.mlr.press/v97/ho19b/ho19b.pdf", "supp": "", "pdf_size": 541851, "gs_citation": 547, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9297667920061606267&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "EECS, UC Berkeley, Berkeley, California, USA+X, Mountain View, California, USA; EECS, UC Berkeley, Berkeley, California, USA; EECS, UC Berkeley, Berkeley, California, USA; EECS, UC Berkeley, Berkeley, California, USA+covariant.ai, Berkeley, California, USA; EECS, UC Berkeley, Berkeley, California, USA+covariant.ai, Berkeley, California, USA", "aff_domain": "berkeley.edu; ; ; ; ", "email": "berkeley.edu; ; ; ; ", "github": "https://github.com/arcelien/pba", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/ho19b.html", "aff_unique_index": "0+1;0;0;0+2;0+2", "aff_unique_norm": "University of California, Berkeley;X;Covariant.ai", "aff_unique_dep": "Electrical Engineering and Computer Sciences;;", "aff_unique_url": "https://www.berkeley.edu;;", "aff_unique_abbr": "UC Berkeley;;", "aff_campus_unique_index": "0+1;0;0;0+0;0+0", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0+0;0;0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "Position-aware Graph Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4000", "id": "4000", "author_site": "Jiaxuan You, Rex (Zhitao) Ying, Jure Leskovec", "author": "Jiaxuan You; Rex Ying; Jure Leskovec", "abstract": "Learning node embeddings that capture a node\u2019s position within the broader graph structure is crucial for many prediction tasks on graphs. 
However, existing Graph Neural Network (GNN) architectures have limited power in capturing the position/location of a given node with respect to all other nodes of the graph. Here we propose Position-aware Graph Neural Networks (P-GNNs), a new class of GNNs for computing position-aware node embeddings. P-GNN first samples sets of anchor nodes, computes the distance of a given target node to each anchor-set, and then learns a non-linear distance-weighted aggregation scheme over the anchor-sets. This way P-GNNs can capture positions/locations of nodes with respect to the anchor nodes. P-GNNs have several advantages: they are inductive, scalable, and can incorporate node feature information. We apply P-GNNs to multiple prediction tasks including link prediction and community detection. We show that P-GNNs consistently outperform state of the art GNNs, with up to 66% improvement in terms of the ROC AUC score.", "bibtex": "@InProceedings{pmlr-v97-you19b,\n title = \t {Position-aware Graph Neural Networks},\n author = {You, Jiaxuan and Ying, Rex and Leskovec, Jure},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7134--7143},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/you19b/you19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/you19b.html},\n abstract = \t {Learning node embeddings that capture a node\u2019s position within the broader graph structure is crucial for many prediction tasks on graphs. However, existing Graph Neural Network (GNN) architectures have limited power in capturing the position/location of a given node with respect to all other nodes of the graph. Here we propose Position-aware Graph Neural Networks (P-GNNs), a new class of GNNs for computing position-aware node embeddings. P-GNN first samples sets of anchor nodes, computes the distance of a given target node to each anchor-set, and then learns a non-linear distance-weighted aggregation scheme over the anchor-sets. This way P-GNNs can capture positions/locations of nodes with respect to the anchor nodes. P-GNNs have several advantages: they are inductive, scalable, and can incorporate node feature information. We apply P-GNNs to multiple prediction tasks including link prediction and community detection. 
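The anchor-set distance computation described in the "Position-aware Graph Neural Networks" abstract above can be sketched as follows on an unweighted toy graph; the learned non-linear distance-weighted aggregation is omitted, and the anchor-set counts and sizes here are arbitrary choices rather than the paper's sampling scheme.

```python
import random
from collections import deque

def bfs_distances(adj, source):
    """Unweighted shortest-path distances from source via BFS."""
    dist = {source: 0}
    queue = deque([source])
    while queue:
        u = queue.popleft()
        for v in adj[u]:
            if v not in dist:
                dist[v] = dist[u] + 1
                queue.append(v)
    return dist

def anchor_set_features(adj, num_sets=4, set_size=2, seed=0):
    """For every node, compute its distance to each sampled anchor set
    (distance to a set = min distance to any member). The aggregation
    network that P-GNN learns on top of these distances is omitted."""
    rng = random.Random(seed)
    nodes = list(adj)
    anchor_sets = [rng.sample(nodes, set_size) for _ in range(num_sets)]
    dists = {a: bfs_distances(adj, a) for s in anchor_sets for a in s}
    feats = {v: [min(dists[a].get(v, float("inf")) for a in s) for s in anchor_sets]
             for v in nodes}
    return anchor_sets, feats

if __name__ == "__main__":
    # toy graph: two triangles joined by an edge
    adj = {0: [1, 2], 1: [0, 2], 2: [0, 1, 3], 3: [2, 4, 5], 4: [3, 5], 5: [3, 4]}
    sets, feats = anchor_set_features(adj)
    print(sets)
    print(feats)
```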
We show that P-GNNs consistently outperform state of the art GNNs, with up to 66% improvement in terms of the ROC AUC score.}\n}", "pdf": "http://proceedings.mlr.press/v97/you19b/you19b.pdf", "supp": "", "pdf_size": 508806, "gs_citation": 634, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2886623965746954945&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, Stanford University; Department of Computer Science, Stanford University; Department of Computer Science, Stanford University", "aff_domain": "cs.stanford.edu; ;cs.stanford.edu", "email": "cs.stanford.edu; ;cs.stanford.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/you19b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Power k-Means Clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4117", "id": "4117", "author_site": "Jason Xu, Kenneth Lange", "author": "Jason Xu; Kenneth Lange", "abstract": "Clustering is a fundamental task in unsupervised machine learning. Lloyd\u2019s 1957 algorithm for k-means clustering remains one of the most widely used due to its speed and simplicity, but the greedy approach is sensitive to initialization and often falls short at a poor solution. This paper explores an alternative to Lloyd\u2019s algorithm that retains its simplicity and mitigates its tendency to get trapped by local minima. Called power k-means, our method embeds the k-means problem in a continuous class of similar, better behaved problems with fewer local minima. Power k-means anneals its way toward the solution of ordinary k-means by way of majorization-minimization (MM), sharing the appealing descent property and low complexity of Lloyd\u2019s algorithm. Further, our method complements widely used seeding strategies, reaping marked improvements when used together as demonstrated on a suite of simulated and real data examples.", "bibtex": "@InProceedings{pmlr-v97-xu19a,\n title = \t {Power k-Means Clustering},\n author = {Xu, Jason and Lange, Kenneth},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6921--6931},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/xu19a/xu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/xu19a.html},\n abstract = \t {Clustering is a fundamental task in unsupervised machine learning. Lloyd\u2019s 1957 algorithm for k-means clustering remains one of the most widely used due to its speed and simplicity, but the greedy approach is sensitive to initialization and often falls short at a poor solution. This paper explores an alternative to Lloyd\u2019s algorithm that retains its simplicity and mitigates its tendency to get trapped by local minima. Called power k-means, our method embeds the k-means problem in a continuous class of similar, better behaved problems with fewer local minima. 
Power k-means anneals its way toward the solution of ordinary k-means by way of majorization-minimization (MM), sharing the appealing descent property and low complexity of Lloyd\u2019s algorithm. Further, our method complements widely used seeding strategies, reaping marked improvements when used together as demonstrated on a suite of simulated and real data examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/xu19a/xu19a.pdf", "supp": "", "pdf_size": 613515, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=344281950323969623&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Department of Statistical Science, Duke University; Departments of Biomathematics, Statistics, and Human Genetics, UCLA", "aff_domain": "duke.edu; ", "email": "duke.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/xu19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Duke University;University of California, Los Angeles", "aff_unique_dep": "Department of Statistical Science;Departments of Biomathematics, Statistics, and Human Genetics", "aff_unique_url": "https://www.duke.edu;https://www.ucla.edu", "aff_unique_abbr": "Duke;UCLA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Predicate Exchange: Inference with Declarative Knowledge", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3972", "id": "3972", "author_site": "Zenna Tavares, Javier Burroni, Edgar Minasyan, Armando Solar-Lezama, Rajesh Ranganath", "author": "Zenna Tavares; Javier Burroni; Edgar Minasyan; Armando Solar-Lezama; Rajesh Ranganath", "abstract": "Programming languages allow us to express complex predicates, but existing inference methods are unable to condition probabilistic models on most of them. To support a broader class of predicates, we develop an inference procedure called predicate exchange, which softens predicates. A soft predicate quantifies the extent to which values of model variables are consistent with its hard counterpart. We substitute the likelihood term in the Bayesian posterior with a soft predicate, and develop a variant of replica exchange MCMC to draw posterior samples. We implement predicate exchange as a language agnostic tool which performs a nonstandard execution of a probabilistic program. We demonstrate the approach on sequence models of health and inverse rendering.", "bibtex": "@InProceedings{pmlr-v97-tavares19a,\n title = \t {Predicate Exchange: Inference with Declarative Knowledge},\n author = {Tavares, Zenna and Burroni, Javier and Minasyan, Edgar and Solar-Lezama, Armando and Ranganath, Rajesh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6186--6195},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tavares19a/tavares19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tavares19a.html},\n abstract = \t {Programming languages allow us to express complex predicates, but existing inference methods are unable to condition probabilistic models on most of them. To support a broader class of predicates, we develop an inference procedure called predicate exchange, which softens predicates. 
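A rough sketch of the annealing idea described in the "Power k-Means Clustering" abstract above, assuming the annealed objective is a power mean of squared point-to-center distances and using a weighted-centroid (MM-style) update whose weights are the gradient of that power mean; the weight formula and annealing schedule here are illustrative reconstructions, not the paper's exact algorithm.

```python
import numpy as np

def power_kmeans(X, k, s0=-1.0, eta=1.05, iters=25, seed=0):
    """Sketch: minimize sum_i M_s(d_i1, ..., d_ik) with d_ij = ||x_i - theta_j||^2 and
    M_s the power mean, annealing s toward -infinity (M_s -> min recovers k-means).
    Each step is a weighted-centroid update with weights dM_s/dd_ij (illustrative)."""
    rng = np.random.default_rng(seed)
    centers = X[rng.choice(len(X), k, replace=False)].astype(float)
    s = s0
    for _ in range(iters):
        d = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1) + 1e-8   # n x k squared distances
        outer = np.mean(d ** s, axis=1, keepdims=True) ** (1.0 / s - 1.0)
        w = outer * d ** (s - 1.0) / d.shape[1]                           # dM_s/dd_ij
        centers = (w[:, :, None] * X[:, None, :]).sum(0) / w.sum(0)[:, None]
        s *= eta                                                          # anneal s toward -infinity
    return centers

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    X = np.vstack([rng.normal(m, 0.3, size=(50, 2)) for m in (-3, 0, 3)])
    print(power_kmeans(X, k=3))
```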
A soft predicate quantifies the extent to which values of model variables are consistent with its hard counterpart. We substitute the likelihood term in the Bayesian posterior with a soft predicate, and develop a variant of replica exchange MCMC to draw posterior samples. We implement predicate exchange as a language agnostic tool which performs a nonstandard execution of a probabilistic program. We demonstrate the approach on sequence models of health and inverse rendering.}\n}", "pdf": "http://proceedings.mlr.press/v97/tavares19a/tavares19a.pdf", "supp": "", "pdf_size": 977787, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1520250654548085818&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "MIT, USA; College of Information and Computer Science, University of Massachusetts, Amherst, USA; Princeton University, USA; MIT, USA; NYU, USA", "aff_domain": "mit.edu; ; ; ; ", "email": "mit.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/tavares19a.html", "aff_unique_index": "0;1;2;0;3", "aff_unique_norm": "Massachusetts Institute of Technology;University of Massachusetts Amherst;Princeton University;New York University", "aff_unique_dep": ";College of Information and Computer Science;;", "aff_unique_url": "https://web.mit.edu;https://www.umass.edu;https://www.princeton.edu;https://www.nyu.edu", "aff_unique_abbr": "MIT;UMass Amherst;Princeton;NYU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Amherst;New York", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Predictor-Corrector Policy Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3970", "id": "3970", "author_site": "Ching-An Cheng, Xinyan Yan, Nathan Ratliff, Byron Boots", "author": "Ching-An Cheng; Xinyan Yan; Nathan Ratliff; Byron Boots", "abstract": "We present a predictor-corrector framework, called PicCoLO, that can transform a first-order model-free reinforcement or imitation learning algorithm into a new hybrid method that leverages predictive models to accelerate policy learning. The new \u201cPicCoLOed\u201d algorithm optimizes a policy by recursively repeating two steps: In the Prediction Step, the learner uses a model to predict the unseen future gradient and then applies the predicted estimate to update the policy; in the Correction Step, the learner runs the updated policy in the environment, receives the true gradient, and then corrects the policy using the gradient error. Unlike previous algorithms, PicCoLO corrects for the mistakes of using imperfect predicted gradients and hence does not suffer from model bias. The development of PicCoLO is made possible by a novel reduction from predictable online learning to adversarial online learning, which provides a systematic way to modify existing first-order algorithms to achieve the optimal regret with respect to predictable information. 
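A caricature of the two-step loop described in the "Predictor-Corrector Policy Optimization" abstract above, written for a plain gradient-descent learner on a scalar objective: update with a model-predicted gradient, then correct with the error against the true gradient. The real PicCoLO wraps general online-learning updates with adaptive step sizes; the function names, step size, and toy objective below are hypothetical.

```python
import numpy as np

def predictor_corrector_updates(grad, model_grad, theta0, eta=0.1, steps=100):
    """Caricature of the prediction/correction structure: apply the model's
    predicted gradient, then correct using (true gradient - predicted gradient)."""
    theta = np.array(theta0, dtype=float)
    for _ in range(steps):
        g_hat = model_grad(theta)          # Prediction Step: model-predicted gradient
        theta = theta - eta * g_hat
        g = grad(theta)                    # Correction Step: true gradient at the new point
        theta = theta - eta * (g - g_hat)  # correct using the gradient error
    return theta

if __name__ == "__main__":
    true_grad  = lambda th: 2.0 * (th - 3.0)         # gradient of (theta - 3)^2
    model_grad = lambda th: 2.0 * (th - 3.0) + 0.5   # imperfect model (constant bias)
    print(predictor_corrector_updates(true_grad, model_grad, theta0=[0.0]))
```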
We show, in both theory and simulation, that the convergence rate of several first-order model-free algorithms can be improved by PicCoLO.", "bibtex": "@InProceedings{pmlr-v97-cheng19b,\n title = \t {Predictor-Corrector Policy Optimization},\n author = {Cheng, Ching-An and Yan, Xinyan and Ratliff, Nathan and Boots, Byron},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1151--1161},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cheng19b/cheng19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/cheng19b.html},\n abstract = \t {We present a predictor-corrector framework, called PicCoLO, that can transform a first-order model-free reinforcement or imitation learning algorithm into a new hybrid method that leverages predictive models to accelerate policy learning. The new \u201cPicCoLOed\u201d algorithm optimizes a policy by recursively repeating two steps: In the Prediction Step, the learner uses a model to predict the unseen future gradient and then applies the predicted estimate to update the policy; in the Correction Step, the learner runs the updated policy in the environment, receives the true gradient, and then corrects the policy using the gradient error. Unlike previous algorithms, PicCoLO corrects for the mistakes of using imperfect predicted gradients and hence does not suffer from model bias. The development of PicCoLO is made possible by a novel reduction from predictable online learning to adversarial online learning, which provides a systematic way to modify existing first-order algorithms to achieve the optimal regret with respect to predictable information. We show, in both theory and simulation, that the convergence rate of several first-order model-free algorithms can be improved by PicCoLO.}\n}", "pdf": "http://proceedings.mlr.press/v97/cheng19b/cheng19b.pdf", "supp": "", "pdf_size": 649755, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13913575152899689436&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Georgia Tech; Georgia Tech; NVIDIA; Georgia Tech+NVIDIA", "aff_domain": "gatech.edu; ; ; ", "email": "gatech.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/cheng19b.html", "aff_unique_index": "0;0;1;0+1", "aff_unique_norm": "Georgia Institute of Technology;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.gatech.edu;https://www.nvidia.com", "aff_unique_abbr": "Georgia Tech;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Probabilistic Neural Symbolic Models for Interpretable Visual Question Answering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3579", "id": "3579", "author_site": "Shanmukha Ramakrishna Vedantam, Karan Desai, Stefan Lee, Marcus Rohrbach, Dhruv Batra, Devi Parikh", "author": "Ramakrishna Vedantam; Karan Desai; Stefan Lee; Marcus Rohrbach; Dhruv Batra; Devi Parikh", "abstract": "We propose a new class of probabilistic neural-symbolic models, that have symbolic functional programs as a latent, stochastic variable. 
Instantiated in the context of visual question answering, our probabilistic formulation offers two key conceptual advantages over prior neural-symbolic models for VQA. Firstly, the programs generated by our model are more understandable while requiring less number of teaching examples. Secondly, we show that one can pose counterfactual scenarios to the model, to probe its beliefs on the programs that could lead to a specified answer given an image. Our results on the CLEVR and SHAPES datasets verify our hypotheses, showing that the model gets better program (and answer) prediction accuracy even in the low data regime, and allows one to probe the coherence and consistency of reasoning performed.", "bibtex": "@InProceedings{pmlr-v97-vedantam19a,\n title = \t {Probabilistic Neural Symbolic Models for Interpretable Visual Question Answering},\n author = {Vedantam, Ramakrishna and Desai, Karan and Lee, Stefan and Rohrbach, Marcus and Batra, Dhruv and Parikh, Devi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6428--6437},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/vedantam19a/vedantam19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/vedantam19a.html},\n abstract = \t {We propose a new class of probabilistic neural-symbolic models, that have symbolic functional programs as a latent, stochastic variable. Instantiated in the context of visual question answering, our probabilistic formulation offers two key conceptual advantages over prior neural-symbolic models for VQA. Firstly, the programs generated by our model are more understandable while requiring less number of teaching examples. Secondly, we show that one can pose counterfactual scenarios to the model, to probe its beliefs on the programs that could lead to a specified answer given an image. 
Our results on the CLEVR and SHAPES datasets verify our hypotheses, showing that the model gets better program (and answer) prediction accuracy even in the low data regime, and allows one to probe the coherence and consistency of reasoning performed.}\n}", "pdf": "http://proceedings.mlr.press/v97/vedantam19a/vedantam19a.pdf", "supp": "", "pdf_size": 1086424, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12432634378302033141&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Facebook AI Research+Georgia Tech; Georgia Tech; Georgia Tech; Facebook AI Research+Georgia Tech; Facebook AI Research+Georgia Tech; Facebook AI Research+Georgia Tech", "aff_domain": "fb.com; ; ; ; ; ", "email": "fb.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/vedantam19a.html", "aff_unique_index": "0+1;1;1;0+1;0+1;0+1", "aff_unique_norm": "Meta;Georgia Institute of Technology", "aff_unique_dep": "Facebook AI Research;", "aff_unique_url": "https://research.facebook.com;https://www.gatech.edu", "aff_unique_abbr": "FAIR;Georgia Tech", "aff_campus_unique_index": ";;;", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0;0+0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "Probability Functional Descent: A Unifying Perspective on GANs, Variational Inference, and Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3944", "id": "3944", "author_site": "Casey Chu, Jose Blanchet, Peter Glynn", "author": "Casey Chu; Jose Blanchet; Peter Glynn", "abstract": "The goal of this paper is to provide a unifying view of a wide range of problems of interest in machine learning by framing them as the minimization of functionals defined on the space of probability measures. In particular, we show that generative adversarial networks, variational inference, and actor-critic methods in reinforcement learning can all be seen through the lens of our framework. We then discuss a generic optimization algorithm for our formulation, called probability functional descent (PFD), and show how this algorithm recovers existing methods developed independently in the settings mentioned earlier.", "bibtex": "@InProceedings{pmlr-v97-chu19a,\n title = \t {Probability Functional Descent: A Unifying Perspective on {GAN}s, Variational Inference, and Reinforcement Learning},\n author = {Chu, Casey and Blanchet, Jose and Glynn, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1213--1222},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chu19a/chu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chu19a.html},\n abstract = \t {The goal of this paper is to provide a unifying view of a wide range of problems of interest in machine learning by framing them as the minimization of functionals defined on the space of probability measures. In particular, we show that generative adversarial networks, variational inference, and actor-critic methods in reinforcement learning can all be seen through the lens of our framework. 
We then discuss a generic optimization algorithm for our formulation, called probability functional descent (PFD), and show how this algorithm recovers existing methods developed independently in the settings mentioned earlier.}\n}", "pdf": "http://proceedings.mlr.press/v97/chu19a/chu19a.pdf", "supp": "", "pdf_size": 317694, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7567772602331370101&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Institute for Computational & Mathematical Engineering, Stanford University, Stanford, California, USA; Management Science & Engineering, Stanford University, Stanford, California, USA; Management Science & Engineering, Stanford University, Stanford, California, USA", "aff_domain": "stanford.edu; ; ", "email": "stanford.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/chu19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Institute for Computational & Mathematical Engineering", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Processing Megapixel Images with Deep Attention-Sampling Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4076", "id": "4076", "author_site": "Angelos Katharopoulos, Francois Fleuret", "author": "Angelos Katharopoulos; Francois Fleuret", "abstract": "Existing deep architectures cannot operate on very large signals such as megapixel images due to computational and memory constraints. To tackle this limitation, we propose a fully differentiable end-to-end trainable model that samples and processes only a fraction of the full resolution input image. The locations to process are sampled from an attention distribution computed from a low resolution view of the input. We refer to our method as attention sampling and it can process images of several megapixels with a standard single GPU setup. We show that sampling from the attention distribution results in an unbiased estimator of the full model with minimal variance, and we derive an unbiased estimator of the gradient that we use to train our model end-to-end with a normal SGD procedure. This new method is evaluated on three classification tasks, where we show that it allows to reduce computation and memory footprint by an order of magnitude for the same accuracy as classical architectures. We also show the consistency of the sampling that indeed focuses on informative parts of the input images.", "bibtex": "@InProceedings{pmlr-v97-katharopoulos19a,\n title = \t {Processing Megapixel Images with Deep Attention-Sampling Models},\n author = {Katharopoulos, Angelos and Fleuret, Francois},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3282--3291},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/katharopoulos19a/katharopoulos19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/katharopoulos19a.html},\n abstract = \t {Existing deep architectures cannot operate on very large signals such as megapixel images due to computational and memory constraints. 
To tackle this limitation, we propose a fully differentiable end-to-end trainable model that samples and processes only a fraction of the full resolution input image. The locations to process are sampled from an attention distribution computed from a low resolution view of the input. We refer to our method as attention sampling and it can process images of several megapixels with a standard single GPU setup. We show that sampling from the attention distribution results in an unbiased estimator of the full model with minimal variance, and we derive an unbiased estimator of the gradient that we use to train our model end-to-end with a normal SGD procedure. This new method is evaluated on three classification tasks, where we show that it allows to reduce computation and memory footprint by an order of magnitude for the same accuracy as classical architectures. We also show the consistency of the sampling that indeed focuses on informative parts of the input images.}\n}", "pdf": "http://proceedings.mlr.press/v97/katharopoulos19a/katharopoulos19a.pdf", "supp": "", "pdf_size": 1244572, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16495958235848738135&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Idiap Research Institute, Martigny, Switzerland+EPFL, Lausanne, Switzerland; Idiap Research Institute, Martigny, Switzerland+EPFL, Lausanne, Switzerland", "aff_domain": "idiap.ch; ", "email": "idiap.ch; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/katharopoulos19a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Idiap Research Institute;EPFL", "aff_unique_dep": ";", "aff_unique_url": "https://www.idiap.ch;https://www.epfl.ch", "aff_unique_abbr": "Idiap;EPFL", "aff_campus_unique_index": "0+1;0+1", "aff_campus_unique": "Martigny;Lausanne", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "Switzerland" }, { "title": "Projection onto Minkowski Sums with Application to Constrained Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4073", "id": "4073", "author_site": "Joong-Ho (Johann) Won, Jason Xu, Kenneth Lange", "author": "Joong-Ho Won; Jason Xu; Kenneth Lange", "abstract": "We introduce block descent algorithms for projecting onto Minkowski sums of sets. Projection onto such sets is a crucial step in many statistical learning problems, and may regularize complexity of solutions to an optimization problem or arise in dual formulations of penalty methods. We show that projecting onto the Minkowski sum admits simple, efficient algorithms when complications such as overlapping constraints pose challenges to existing methods. We prove that our algorithm converges linearly when sets are strongly convex or satisfy an error bound condition, and extend the theory and methods to encompass non-convex sets as well. 
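The unbiasedness property mentioned in the "Processing Megapixel Images with Deep Attention-Sampling Models" abstract above can be illustrated on a toy attention-weighted pooling: sampling patch indices from the attention distribution and averaging their features is an unbiased Monte Carlo estimate of the full weighted sum. The random feature vectors below stand in for a CNN applied to high-resolution patches; sizes and sample counts are arbitrary.

```python
import numpy as np

def attention_sampled_estimate(attention, patch_features, n_samples, rng):
    """Monte Carlo version of Phi = sum_i a_i * f_i: sample patch indices from the
    attention distribution a and average their features; unbiased for Phi."""
    idx = rng.choice(len(attention), size=n_samples, p=attention)
    return patch_features[idx].mean(axis=0)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n_patches, d = 64, 8
    feats = rng.standard_normal((n_patches, d))       # f_i for every high-res patch
    logits = rng.standard_normal(n_patches)
    attn = np.exp(logits) / np.exp(logits).sum()      # attention from a low-res view
    exact = (attn[:, None] * feats).sum(axis=0)       # full model: processes all patches
    few = attention_sampled_estimate(attn, feats, n_samples=10, rng=rng)
    many = attention_sampled_estimate(attn, feats, n_samples=200000, rng=rng)
    print(np.linalg.norm(exact - few), np.linalg.norm(exact - many))
```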
We demonstrate empirical advantages in runtime and accuracy over competitors in applications to $\\ell_{1,p}$-regularized learning, constrained lasso, and overlapping group lasso.", "bibtex": "@InProceedings{pmlr-v97-lange19a,\n title = \t {Projection onto {M}inkowski Sums with Application to Constrained Learning},\n author = {Lange, Kenneth and Won, Joong-Ho and Xu, Jason},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3642--3651},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lange19a/lange19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lange19a.html},\n abstract = \t {We introduce block descent algorithms for projecting onto Minkowski sums of sets. Projection onto such sets is a crucial step in many statistical learning problems, and may regularize complexity of solutions to an optimization problem or arise in dual formulations of penalty methods. We show that projecting onto the Minkowski sum admits simple, efficient algorithms when complications such as overlapping constraints pose challenges to existing methods. We prove that our algorithm converges linearly when sets are strongly convex or satisfy an error bound condition, and extend the theory and methods to encompass non-convex sets as well. We demonstrate empirical advantages in runtime and accuracy over competitors in applications to $\\ell_{1,p}$-regularized learning, constrained lasso, and overlapping group lasso.}\n}", "pdf": "http://proceedings.mlr.press/v97/lange19a/lange19a.pdf", "supp": "", "pdf_size": 429233, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17332636821166262206&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Statistics, Seoul National University; Department of Statistical Science, Duke University; University of California, Los Angeles", "aff_domain": "stats.snu.ac.kr;duke.edu; ", "email": "stats.snu.ac.kr;duke.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/lange19a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "Seoul National University;Duke University;University of California, Los Angeles", "aff_unique_dep": "Department of Statistics;Department of Statistical Science;", "aff_unique_url": "https://www.snu.ac.kr;https://www.duke.edu;https://www.ucla.edu", "aff_unique_abbr": "SNU;Duke;UCLA", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Seoul;;Los Angeles", "aff_country_unique_index": "0;1;1", "aff_country_unique": "South Korea;United States" }, { "title": "Projections for Approximate Policy Iteration Algorithms", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4056", "id": "4056", "author_site": "Riad Akrour, Joni Pajarinen, Jan Peters, Gerhard Neumann", "author": "Riad Akrour; Joni Pajarinen; Jan Peters; Gerhard Neumann", "abstract": "Approximate policy iteration is a class of reinforcement learning (RL) algorithms where the policy is encoded using a function approximator and which has been especially prominent in RL with continuous action spaces. In this class of RL algorithms, ensuring increase of the policy return during policy update often requires to constrain the change in action distribution. 
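A minimal block-descent sketch of projection onto a Minkowski sum, as described in the "Projection onto Minkowski Sums with Application to Constrained Learning" abstract above: alternate exact minimization over the two summands, each of which reduces to a single-set projection. The choice of an l2 ball and a box as summands is only for illustration.

```python
import numpy as np

def project_ball(x, r):
    n = np.linalg.norm(x)
    return x if n <= r else x * (r / n)

def project_box(x, c):
    return np.clip(x, -c, c)

def project_minkowski_sum(v, proj_A, proj_B, iters=100):
    """Block descent for projecting v onto A + B: minimize ||v - a - b||^2 by
    alternating exact minimization in a (a = P_A(v - b)) and in b (b = P_B(v - a))."""
    a = np.zeros_like(v)
    b = np.zeros_like(v)
    for _ in range(iters):
        a = proj_A(v - b)   # best a for fixed b
        b = proj_B(v - a)   # best b for fixed a
    return a + b, a, b

if __name__ == "__main__":
    v = np.array([3.0, -4.0, 1.0])
    p, a, b = project_minkowski_sum(v,
                                    lambda x: project_ball(x, r=1.0),
                                    lambda x: project_box(x, c=0.5))
    print(p, np.linalg.norm(v - p))
```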
Several approximations exist in the literature to solve this constrained policy update problem. In this paper, we propose to improve over such solutions by introducing a set of projections that transform the constrained problem into an unconstrained one which is then solved by standard gradient descent. Using these projections, we empirically demonstrate that our approach can improve the policy update solution and the control over exploration of existing approximate policy iteration algorithms.", "bibtex": "@InProceedings{pmlr-v97-akrour19a,\n title = \t {Projections for Approximate Policy Iteration Algorithms},\n author = {Akrour, Riad and Pajarinen, Joni and Peters, Jan and Neumann, Gerhard},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {181--190},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/akrour19a/akrour19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/akrour19a.html},\n abstract = \t {Approximate policy iteration is a class of reinforcement learning (RL) algorithms where the policy is encoded using a function approximator and which has been especially prominent in RL with continuous action spaces. In this class of RL algorithms, ensuring increase of the policy return during policy update often requires to constrain the change in action distribution. Several approximations exist in the literature to solve this constrained policy update problem. In this paper, we propose to improve over such solutions by introducing a set of projections that transform the constrained problem into an unconstrained one which is then solved by standard gradient descent. 
Using these projections, we empirically demonstrate that our approach can improve the policy update solution and the control over exploration of existing approximate policy iteration algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/akrour19a/akrour19a.pdf", "supp": "", "pdf_size": 2722738, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8777635693518523693&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 14, "aff": "IAS, TU Darmstadt, Darmstadt, Germany+Tampere University, Finland; IAS, TU Darmstadt, Darmstadt, Germany+L-CAS, University of Lincoln, Lincoln, United Kingdom+Bosch Center for Arti\ufb01cial Intelligence (BCAI), Germany; L-CAS, University of Lincoln, Lincoln, United Kingdom; Max Planck Institute for Intelligent Systems, T\u00fcbingen, Germany", "aff_domain": "robot-learning.de; ; ; ", "email": "robot-learning.de; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/akrour19a.html", "aff_unique_index": "0+1;0+2+3;2;4", "aff_unique_norm": "Technical University of Darmstadt;Tampere University;University of Lincoln;Bosch Center for Arti\ufb01cial Intelligence;Max Planck Institute for Intelligent Systems", "aff_unique_dep": "Institute for Applied Systems Engineering (IAS);;L-CAS;Artificial Intelligence;", "aff_unique_url": "https://www.tu-darmstadt.de;https://www.tuni.fi;https://www.lincoln.ac.uk;https://www.bosch-ai.com;https://www.mpi-is.mpg.de", "aff_unique_abbr": "TU Darmstadt;Tuni;UoL;BCAI;MPI-IS", "aff_campus_unique_index": "0;0+2;2;3", "aff_campus_unique": "Darmstadt;;Lincoln;T\u00fcbingen", "aff_country_unique_index": "0+1;0+2+0;2;0", "aff_country_unique": "Germany;Finland;United Kingdom" }, { "title": "Proportionally Fair Clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4228", "id": "4228", "author_site": "Xingyu Chen, Brandon Fain, Liang Lyu, Kamesh Munagala", "author": "Xingyu Chen; Brandon Fain; Liang Lyu; Kamesh Munagala", "abstract": "We extend the fair machine learning literature by considering the problem of proportional centroid clustering in a metric context. For clustering n points with k centers, we define fairness as proportionality to mean that any n/k points are entitled to form their own cluster if there is another center that is closer in distance for all n/k points. We seek clustering solutions to which there are no such justified complaints from any subsets of agents, without assuming any a priori notion of protected subsets. We present and analyze algorithms to efficiently compute, optimize, and audit proportional solutions. We conclude with an empirical examination of the tradeoff between proportional solutions and the k-means objective.", "bibtex": "@InProceedings{pmlr-v97-chen19d,\n title = \t {Proportionally Fair Clustering},\n author = {Chen, Xingyu and Fain, Brandon and Lyu, Liang and Munagala, Kamesh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1032--1041},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19d/chen19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19d.html},\n abstract = \t {We extend the fair machine learning literature by considering the problem of proportional centroid clustering in a metric context. 
For clustering n points with k centers, we define fairness as proportionality to mean that any n/k points are entitled to form their own cluster if there is another center that is closer in distance for all n/k points. We seek clustering solutions to which there are no such justified complaints from any subsets of agents, without assuming any a priori notion of protected subsets. We present and analyze algorithms to efficiently compute, optimize, and audit proportional solutions. We conclude with an empirical examination of the tradeoff between proportional solutions and the k-means objective.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19d/chen19d.pdf", "supp": "", "pdf_size": 499121, "gs_citation": 205, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10839450850263360296&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Duke University; Department of Computer Science, Duke University; Department of Computer Science, Duke University; Department of Computer Science, Duke University", "aff_domain": "cs.duke.edu; ; ; ", "email": "cs.duke.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/chen19d.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Duke University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Provable Guarantees for Gradient-Based Meta-Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3887", "id": "3887", "author_site": "Nina Balcan, Mikhail Khodak, Ameet Talwalkar", "author": "Maria-Florina Balcan; Mikhail Khodak; Ameet Talwalkar", "abstract": "We study the problem of meta-learning through the lens of online convex optimization, developing a meta-algorithm bridging the gap between popular gradient-based meta-learning and classical regularization-based multi-task transfer methods. Our method is the first to simultaneously satisfy good sample efficiency guarantees in the convex setting, with generalization bounds that improve with task-similarity, while also being computationally scalable to modern deep learning architectures and the many-task setting. Despite its simplicity, the algorithm matches, up to a constant factor, a lower bound on the performance of any such parameter-transfer method under natural task similarity assumptions. 
We use experiments in both convex and deep learning settings to verify and demonstrate the applicability of our theory.", "bibtex": "@InProceedings{pmlr-v97-balcan19a,\n title = \t {Provable Guarantees for Gradient-Based Meta-Learning},\n author = {Balcan, Maria-Florina and Khodak, Mikhail and Talwalkar, Ameet},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {424--433},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/balcan19a/balcan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/balcan19a.html},\n abstract = \t {We study the problem of meta-learning through the lens of online convex optimization, developing a meta-algorithm bridging the gap between popular gradient-based meta-learning and classical regularization-based multi-task transfer methods. Our method is the first to simultaneously satisfy good sample efficiency guarantees in the convex setting, with generalization bounds that improve with task-similarity, while also being computationally scalable to modern deep learning architectures and the many-task setting. Despite its simplicity, the algorithm matches, up to a constant factor, a lower bound on the performance of any such parameter-transfer method under natural task similarity assumptions. We use experiments in both convex and deep learning settings to verify and demonstrate the applicability of our theory.}\n}", "pdf": "http://proceedings.mlr.press/v97/balcan19a/balcan19a.pdf", "supp": "", "pdf_size": 898050, "gs_citation": 186, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18333296959440727243&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Carnegie Mellon University; Carnegie Mellon University; Carnegie Mellon University + Determined AI", "aff_domain": "cmu.edu; ; ", "email": "cmu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/balcan19a.html", "aff_unique_index": "0;0;0+1", "aff_unique_norm": "Carnegie Mellon University;Determined AI", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://determined.ai", "aff_unique_abbr": "CMU;Determined AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "United States" }, { "title": "Provably Efficient Imitation Learning from Observation Alone", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3927", "id": "3927", "author_site": "Wen Sun, Anirudh Vemula, Byron Boots, Drew Bagnell", "author": "Wen Sun; Anirudh Vemula; Byron Boots; Drew Bagnell", "abstract": "We study Imitation Learning (IL) from Observations alone (ILFO) in large-scale MDPs. While most IL algorithms rely on an expert to directly provide actions to the learner, in this setting the expert only supplies sequences of observations. We design a new model-free algorithm for ILFO, Forward Adversarial Imitation Learning (FAIL), which learns a sequence of time-dependent policies by minimizing an Integral Probability Metric between the observation distributions of the expert policy and the learner. FAIL provably learns a near-optimal policy with a number of samples that is polynomial in all relevant parameters but independent of the number of unique observations. 
The resulting theory extends the domain of provably sample efficient learning algorithms beyond existing results that typically only consider tabular RL settings or settings that require access to a near-optimal reset distribution. We also demonstrate the efficacy of FAIL on multiple OpenAI Gym control tasks.", "bibtex": "@InProceedings{pmlr-v97-sun19b,\n title = \t {Provably Efficient Imitation Learning from Observation Alone},\n author = {Sun, Wen and Vemula, Anirudh and Boots, Byron and Bagnell, Drew},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6036--6045},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/sun19b/sun19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/sun19b.html},\n abstract = \t {We study Imitation Learning (IL) from Observations alone (ILFO) in large-scale MDPs. While most IL algorithms rely on an expert to directly provide actions to the learner, in this setting the expert only supplies sequences of observations. We design a new model-free algorithm for ILFO, Forward Adversarial Imitation Learning (FAIL), which learns a sequence of time-dependent policies by minimizing an Integral Probability Metric between the observation distributions of the expert policy and the learner. FAIL provably learns a near-optimal policy with a number of samples that is polynomial in all relevant parameters but independent of the number of unique observations. The resulting theory extends the domain of provably sample efficient learning algorithms beyond existing results that typically only consider tabular RL settings or settings that require access to a near-optimal reset distribution. We also demonstrate the efficacy of FAIL on multiple OpenAI Gym control tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/sun19b/sun19b.pdf", "supp": "", "pdf_size": 904538, "gs_citation": 114, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12068954688266237988&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Robotics Institute, Carnegie Mellon University, USA; College of Computing, Georgia Institute of Technology, USA; College of Computing, Georgia Institute of Technology, USA; Aurora Innovation, USA", "aff_domain": "cs.cmu.edu; ; ; ", "email": "cs.cmu.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/sun19b.html", "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Carnegie Mellon University;Georgia Institute of Technology;Aurora Innovation", "aff_unique_dep": "Robotics Institute;College of Computing;", "aff_unique_url": "https://www.cmu.edu;https://www.gatech.edu;https://aurora.tech", "aff_unique_abbr": "CMU;Georgia Tech;Aurora", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Provably Efficient Maximum Entropy Exploration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4322", "id": "4322", "author_site": "Elad Hazan, Sham Kakade, Karan Singh, Abby Van Soest", "author": "Elad Hazan; Sham Kakade; Karan Singh; Abby Van Soest", "abstract": "Suppose an agent is in a (possibly unknown) Markov Decision Process in the absence of a reward signal, what might we hope that an agent can efficiently learn to do? 
This work studies a broad class of objectives that are defined solely as functions of the state-visitation frequencies that are induced by how the agent behaves. For example, one natural, intrinsically defined, objective problem is for the agent to learn a policy which induces a distribution over state space that is as uniform as possible, which can be measured in an entropic sense. We provide an efficient algorithm to optimize such intrinsically defined objectives, when given access to a black box planning oracle (which is robust to function approximation). Furthermore, when restricted to the tabular setting where we have sample based access to the MDP, our proposed algorithm is provably efficient, both in terms of its sample and computational complexities. Key to our algorithmic methodology is utilizing the conditional gradient method (a.k.a. the Frank-Wolfe algorithm) which utilizes an approximate MDP solver.", "bibtex": "@InProceedings{pmlr-v97-hazan19a,\n title = \t {Provably Efficient Maximum Entropy Exploration},\n author = {Hazan, Elad and Kakade, Sham and Singh, Karan and Van Soest, Abby},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2681--2691},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hazan19a/hazan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hazan19a.html},\n abstract = \t {Suppose an agent is in a (possibly unknown) Markov Decision Process in the absence of a reward signal, what might we hope that an agent can efficiently learn to do? This work studies a broad class of objectives that are defined solely as functions of the state-visitation frequencies that are induced by how the agent behaves. For example, one natural, intrinsically defined, objective problem is for the agent to learn a policy which induces a distribution over state space that is as uniform as possible, which can be measured in an entropic sense. We provide an efficient algorithm to optimize such intrinsically defined objectives, when given access to a black box planning oracle (which is robust to function approximation). Furthermore, when restricted to the tabular setting where we have sample based access to the MDP, our proposed algorithm is provably efficient, both in terms of its sample and computational complexities. Key to our algorithmic methodology is utilizing the conditional gradient method (a.k.a. 
the Frank-Wolfe algorithm) which utilizes an approximate MDP solver.}\n}", "pdf": "http://proceedings.mlr.press/v97/hazan19a/hazan19a.pdf", "supp": "", "pdf_size": 535353, "gs_citation": 370, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7107307515820944527&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 15, "aff": "Department of Computer Science, Princeton University + Google AI Princeton; Allen School of Computer Science and Engineering, University of Washington + Department of Statistics, University of Washington; Department of Computer Science, Princeton University + Google AI Princeton; Department of Computer Science, Princeton University + Google AI Princeton", "aff_domain": "google.com;cs.washington.edu;princeton.edu;princeton.edu", "email": "google.com;cs.washington.edu;princeton.edu;princeton.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/hazan19a.html", "aff_unique_index": "0+1;2+2;0+1;0+1", "aff_unique_norm": "Princeton University;Google;University of Washington", "aff_unique_dep": "Department of Computer Science;Google AI;Allen School of Computer Science and Engineering", "aff_unique_url": "https://www.princeton.edu;https://ai.google;https://www.cs.washington.edu", "aff_unique_abbr": "Princeton;Google AI;UW", "aff_campus_unique_index": "1;2+2;1;1", "aff_campus_unique": ";Princeton;Seattle", "aff_country_unique_index": "0+0;0+0;0+0;0+0", "aff_country_unique": "United States" }, { "title": "Provably efficient RL with Rich Observations via Latent State Decoding", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3757", "id": "3757", "author_site": "Simon Du, Akshay Krishnamurthy, Nan Jiang, Alekh Agarwal, Miroslav Dudik, John Langford", "author": "Simon Du; Akshay Krishnamurthy; Nan Jiang; Alekh Agarwal; Miroslav Dudik; John Langford", "abstract": "We study the exploration problem in episodic MDPs with rich observations generated from a small number of latent states. Under certain identifiability assumptions, we demonstrate how to estimate a mapping from the observations to latent states inductively through a sequence of regression and clustering steps\u2014where previously decoded latent states provide labels for later regression problems\u2014and use it to construct good exploration policies. We provide finite-sample guarantees on the quality of the learned state decoding function and exploration policies, and complement our theory with an empirical evaluation on a class of hard exploration problems. Our method exponentially improves over $Q$-learning with na\u00efve exploration, even when $Q$-learning has cheating access to latent states.", "bibtex": "@InProceedings{pmlr-v97-du19b,\n title = \t {Provably efficient {RL} with Rich Observations via Latent State Decoding},\n author = {Du, Simon and Krishnamurthy, Akshay and Jiang, Nan and Agarwal, Alekh and Dudik, Miroslav and Langford, John},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1665--1674},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/du19b/du19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/du19b.html},\n abstract = \t {We study the exploration problem in episodic MDPs with rich observations generated from a small number of latent states. 
Under certain identifiability assumptions, we demonstrate how to estimate a mapping from the observations to latent states inductively through a sequence of regression and clustering steps\u2014where previously decoded latent states provide labels for later regression problems\u2014and use it to construct good exploration policies. We provide finite-sample guarantees on the quality of the learned state decoding function and exploration policies, and complement our theory with an empirical evaluation on a class of hard exploration problems. Our method exponentially improves over $Q$-learning with na\u00efve exploration, even when $Q$-learning has cheating access to latent states.}\n}", "pdf": "http://proceedings.mlr.press/v97/du19b/du19b.pdf", "supp": "", "pdf_size": 769774, "gs_citation": 293, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17139201255005810211&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Carnegie Mellon University; Microsoft Research, New York; University of Illinois at Urbana-Champaign; Microsoft Research, Redmond; Microsoft Research, New York; Microsoft Research, New York", "aff_domain": "cs.cmu.edu; ; ; ; ; ", "email": "cs.cmu.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/du19b.html", "aff_unique_index": "0;1;2;1;1;1", "aff_unique_norm": "Carnegie Mellon University;Microsoft;University of Illinois Urbana-Champaign", "aff_unique_dep": ";Microsoft Research;", "aff_unique_url": "https://www.cmu.edu;https://www.microsoft.com/en-us/research;https://illinois.edu", "aff_unique_abbr": "CMU;MSR;UIUC", "aff_campus_unique_index": "1;2;3;1;1", "aff_campus_unique": ";New York;Urbana-Champaign;Redmond", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "QTRAN: Learning to Factorize with Transformation for Cooperative Multi-Agent Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3987", "id": "3987", "author_site": "Kyunghwan Son, Daewoo Kim, Wan Ju Kang, David Earl Hostallero, Yung Yi", "author": "Kyunghwan Son; Daewoo Kim; Wan Ju Kang; David Earl Hostallero; Yung Yi", "abstract": "We explore value-based solutions for multi-agent reinforcement learning (MARL) tasks in the centralized training with decentralized execution (CTDE) regime popularized recently. However, VDN and QMIX are representative examples that use the idea of factorization of the joint action-value function into individual ones for decentralized execution. VDN and QMIX address only a fraction of factorizable MARL tasks due to their structural constraint in factorization such as additivity and monotonicity. In this paper, we propose a new factorization method for MARL, QTRAN, which is free from such structural constraints and takes on a new approach to transforming the original joint action-value function into an easily factorizable one, with the same optimal actions. QTRAN guarantees more general factorization than VDN or QMIX, thus covering a much wider class of MARL tasks than does previous methods. 
Our experiments for the tasks of multi-domain Gaussian-squeeze and modified predator-prey demonstrate QTRAN\u2019s superior performance with especially larger margins in games whose payoffs penalize non-cooperative behavior more aggressively.", "bibtex": "@InProceedings{pmlr-v97-son19a,\n title = \t {{QTRAN}: Learning to Factorize with Transformation for Cooperative Multi-Agent Reinforcement Learning},\n author = {Son, Kyunghwan and Kim, Daewoo and Kang, Wan Ju and Hostallero, David Earl and Yi, Yung},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5887--5896},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/son19a/son19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/son19a.html},\n abstract = \t {We explore value-based solutions for multi-agent reinforcement learning (MARL) tasks in the centralized training with decentralized execution (CTDE) regime popularized recently. However, VDN and QMIX are representative examples that use the idea of factorization of the joint action-value function into individual ones for decentralized execution. VDN and QMIX address only a fraction of factorizable MARL tasks due to their structural constraint in factorization such as additivity and monotonicity. In this paper, we propose a new factorization method for MARL, QTRAN, which is free from such structural constraints and takes on a new approach to transforming the original joint action-value function into an easily factorizable one, with the same optimal actions. QTRAN guarantees more general factorization than VDN or QMIX, thus covering a much wider class of MARL tasks than does previous methods. 
Our experiments for the tasks of multi-domain Gaussian-squeeze and modified predator-prey demonstrate QTRAN\u2019s superior performance with especially larger margins in games whose payoffs penalize non-cooperative behavior more aggressively.}\n}", "pdf": "http://proceedings.mlr.press/v97/son19a/son19a.pdf", "supp": "", "pdf_size": 944921, "gs_citation": 1087, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8081563128599106489&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea", "aff_domain": "kaist.ac.kr; ; ; ;kaist.edu", "email": "kaist.ac.kr; ; ; ;kaist.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/son19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "KAIST", "aff_unique_dep": "School of Electrical Engineering", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Daejeon", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "title": "Quantifying Generalization in Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3742", "id": "3742", "author_site": "Karl Cobbe, Oleg Klimov, Chris Hesse, Taehoon Kim, John Schulman", "author": "Karl Cobbe; Oleg Klimov; Chris Hesse; Taehoon Kim; John Schulman", "abstract": "In this paper, we investigate the problem of overfitting in deep reinforcement learning. Among the most common benchmarks in RL, it is customary to use the same environments for both training and testing. This practice offers relatively little insight into an agent\u2019s ability to generalize. We address this issue by using procedurally generated environments to construct distinct training and test sets. Most notably, we introduce a new environment called CoinRun, designed as a benchmark for generalization in RL. Using CoinRun, we find that agents overfit to surprisingly large training sets. We then show that deeper convolutional architectures improve generalization, as do methods traditionally found in supervised learning, including L2 regularization, dropout, data augmentation and batch normalization.", "bibtex": "@InProceedings{pmlr-v97-cobbe19a,\n title = \t {Quantifying Generalization in Reinforcement Learning},\n author = {Cobbe, Karl and Klimov, Oleg and Hesse, Chris and Kim, Taehoon and Schulman, John},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1282--1289},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cobbe19a/cobbe19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cobbe19a.html},\n abstract = \t {In this paper, we investigate the problem of overfitting in deep reinforcement learning. Among the most common benchmarks in RL, it is customary to use the same environments for both training and testing. This practice offers relatively little insight into an agent\u2019s ability to generalize. 
We address this issue by using procedurally generated environments to construct distinct training and test sets. Most notably, we introduce a new environment called CoinRun, designed as a benchmark for generalization in RL. Using CoinRun, we find that agents overfit to surprisingly large training sets. We then show that deeper convolutional architectures improve generalization, as do methods traditionally found in supervised learning, including L2 regularization, dropout, data augmentation and batch normalization.}\n}", "pdf": "http://proceedings.mlr.press/v97/cobbe19a/cobbe19a.pdf", "supp": "", "pdf_size": 3567185, "gs_citation": 831, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9870113474300692969&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "OpenAI; OpenAI; OpenAI; OpenAI; OpenAI", "aff_domain": "openai.com; ; ; ; ", "email": "openai.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/cobbe19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "OpenAI", "aff_unique_dep": "", "aff_unique_url": "https://openai.com", "aff_unique_abbr": "OpenAI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Quantile Stein Variational Gradient Descent for Batch Bayesian Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3796", "id": "3796", "author_site": "Chengyue Gong, Jian Peng, Qiang Liu", "author": "Chengyue Gong; Jian Peng; Qiang Liu", "abstract": "Batch Bayesian optimization has been shown to be an efficient and successful approach for black-box function optimization, especially when the evaluation of cost function is highly expensive but can be efficiently parallelized. In this paper, we introduce a novel variational framework for batch query optimization, based on the argument that the query batch should be selected to have both high diversity and good worst case performance. This motivates us to introduce a variational objective that combines a quantile-based risk measure (for worst case performance) and entropy regularization (for enforcing diversity). We derive a gradient-based particle-based algorithm for solving our quantile-based variational objective, which generalizes Stein variational gradient descent (SVGD). We evaluate our method on a number of real-world applications and show that it consistently outperforms other recent state-of-the-art batch Bayesian optimization methods. 
Extensive experimental results indicate that our method achieves better or comparable performance, compared to the existing methods.", "bibtex": "@InProceedings{pmlr-v97-gong19b,\n title = \t {Quantile Stein Variational Gradient Descent for Batch {B}ayesian Optimization},\n author = {Gong, Chengyue and Peng, Jian and Liu, Qiang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2347--2356},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gong19b/gong19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/gong19b.html},\n abstract = \t {Batch Bayesian optimization has been shown to be an efficient and successful approach for black-box function optimization, especially when the evaluation of cost function is highly expensive but can be efficiently parallelized. In this paper, we introduce a novel variational framework for batch query optimization, based on the argument that the query batch should be selected to have both high diversity and good worst case performance. This motivates us to introduce a variational objective that combines a quantile-based risk measure (for worst case performance) and entropy regularization (for enforcing diversity). We derive a gradient-based particle-based algorithm for solving our quantile-based variational objective, which generalizes Stein variational gradient descent (SVGD). We evaluate our method on a number of real-world applications and show that it consistently outperforms other recent state-of-the-art batch Bayesian optimization methods. Extensive experimental results indicate that our method achieves better or comparable performance, compared to the existing methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/gong19b/gong19b.pdf", "supp": "", "pdf_size": 582984, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16701896448454967074&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, UT Austin; University of Illinois at Urbana Champaign, IL; Department of Computer Science, UT Austin", "aff_domain": "cs.utexas.edu; ;cs.utexas.edu", "email": "cs.utexas.edu; ;cs.utexas.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/gong19b.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Texas at Austin;University of Illinois Urbana-Champaign", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.utexas.edu;https://illinois.edu", "aff_unique_abbr": "UT Austin;UIUC", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Austin;Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "RaFM: Rank-Aware Factorization Machines", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3709", "id": "3709", "author_site": "Xiaoshuang Chen, Yin Zheng, Jiaxing Wang, Wenye Ma, Junzhou Huang", "author": "Xiaoshuang Chen; Yin Zheng; Jiaxing Wang; Wenye Ma; Junzhou Huang", "abstract": "Factorization machines (FM) are a popular model class to learn pairwise interactions by a low-rank approximation. 
Different from existing FM-based approaches which use a fixed rank for all features, this paper proposes a Rank-Aware FM (RaFM) model which adopts pairwise interactions from embeddings with different ranks. The proposed model achieves a better performance on real-world datasets where different features have significantly varying frequencies of occurrences. Moreover, we prove that the RaFM model can be stored, evaluated, and trained as efficiently as one single FM, and under some reasonable conditions it can be even significantly more efficient than FM. RaFM improves the performance of FMs in both regression tasks and classification tasks while incurring less computational burden, therefore also has attractive potential in industrial applications.", "bibtex": "@InProceedings{pmlr-v97-chen19n,\n title = \t {{R}a{FM}: Rank-Aware Factorization Machines},\n author = {Chen, Xiaoshuang and Zheng, Yin and Wang, Jiaxing and Ma, Wenye and Huang, Junzhou},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1132--1140},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19n/chen19n.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19n.html},\n abstract = \t {Factorization machines (FM) are a popular model class to learn pairwise interactions by a low-rank approximation. Different from existing FM-based approaches which use a fixed rank for all features, this paper proposes a Rank-Aware FM (RaFM) model which adopts pairwise interactions from embeddings with different ranks. The proposed model achieves a better performance on real-world datasets where different features have significantly varying frequencies of occurrences. Moreover, we prove that the RaFM model can be stored, evaluated, and trained as efficiently as one single FM, and under some reasonable conditions it can be even significantly more efficient than FM. 
RaFM improves the performance of FMs in both regression tasks and classification tasks while incurring less computational burden, therefore also has attractive potential in industrial applications.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19n/chen19n.pdf", "supp": "", "pdf_size": 377357, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9961787920931572726&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electrical Engineering, Tsinghua University; Tencent AI Lab; Institute of Automation, Chinese Academy of Sciences, and University of Chinese Academy of Sciences; Tencent AI Lab; Tencent AI Lab", "aff_domain": "tsinghua.edu.cn;tencent.com;ucas.ac.cn;tencent.com;tencent.com", "email": "tsinghua.edu.cn;tencent.com;ucas.ac.cn;tencent.com;tencent.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/chen19n.html", "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "Tsinghua University;Tencent;Chinese Academy of Sciences", "aff_unique_dep": "Department of Electrical Engineering;Tencent AI Lab;Institute of Automation", "aff_unique_url": "https://www.tsinghua.edu.cn;https://ai.tencent.com;http://www.ia.cas.cn", "aff_unique_abbr": "THU;Tencent AI Lab;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Rademacher Complexity for Adversarially Robust Generalization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3688", "id": "3688", "author_site": "Dong Yin, Kannan Ramchandran, Peter Bartlett", "author": "Dong Yin; Ramchandran Kannan; Peter Bartlett", "abstract": "Many machine learning models are vulnerable to adversarial attacks; for example, adding adversarial perturbations that are imperceptible to humans can often make machine learning models produce wrong predictions with high confidence; moreover, although we may obtain robust models on the training dataset via adversarial training, in some problems the learned models cannot generalize well to the test data. In this paper, we focus on $\\ell_\\infty$ attacks, and study the adversarially robust generalization problem through the lens of Rademacher complexity. For binary linear classifiers, we prove tight bounds for the adversarial Rademacher complexity, and show that the adversarial Rademacher complexity is never smaller than its natural counterpart, and it has an unavoidable dimension dependence, unless the weight vector has bounded $\\ell_1$ norm, and our results also extend to multi-class linear classifiers; in addition, for (nonlinear) neural networks, we show that the dimension dependence in the adversarial Rademacher complexity also exists. We further consider a surrogate adversarial loss for one-hidden layer ReLU network and prove margin bounds for this setting. Our results indicate that having $\\ell_1$ norm constraints on the weight matrices might be a potential way to improve generalization in the adversarial setting. 
We demonstrate experimental results that validate our theoretical findings.", "bibtex": "@InProceedings{pmlr-v97-yin19b,\n title = \t {Rademacher Complexity for Adversarially Robust Generalization},\n author = {Yin, Dong and Kannan, Ramchandran and Bartlett, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7085--7094},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yin19b/yin19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/yin19b.html},\n abstract = \t {Many machine learning models are vulnerable to adversarial attacks; for example, adding adversarial perturbations that are imperceptible to humans can often make machine learning models produce wrong predictions with high confidence; moreover, although we may obtain robust models on the training dataset via adversarial training, in some problems the learned models cannot generalize well to the test data. In this paper, we focus on $\\ell_\\infty$ attacks, and study the adversarially robust generalization problem through the lens of Rademacher complexity. For binary linear classifiers, we prove tight bounds for the adversarial Rademacher complexity, and show that the adversarial Rademacher complexity is never smaller than its natural counterpart, and it has an unavoidable dimension dependence, unless the weight vector has bounded $\\ell_1$ norm, and our results also extend to multi-class linear classifiers; in addition, for (nonlinear) neural networks, we show that the dimension dependence in the adversarial Rademacher complexity also exists. We further consider a surrogate adversarial loss for one-hidden layer ReLU network and prove margin bounds for this setting. Our results indicate that having $\\ell_1$ norm constraints on the weight matrices might be a potential way to improve generalization in the adversarial setting. 
We demonstrate experimental results that validate our theoretical findings.}\n}", "pdf": "http://proceedings.mlr.press/v97/yin19b/yin19b.pdf", "supp": "", "pdf_size": 602410, "gs_citation": 341, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3771850404643054723&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Electrical Engineering and Computer Sciences, UC Berkeley, Berkeley, CA, USA+Department of Statistics, UC Berkeley, Berkeley, CA, USA; Department of Electrical Engineering and Computer Sciences, UC Berkeley, Berkeley, CA, USA; Department of Electrical Engineering and Computer Sciences, UC Berkeley, Berkeley, CA, USA+Department of Statistics, UC Berkeley, Berkeley, CA, USA", "aff_domain": "eecs.berkeley.edu; ; ", "email": "eecs.berkeley.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/yin19b.html", "aff_unique_index": "0+0;0;0+0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Electrical Engineering and Computer Sciences", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0+0;0;0+0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "United States" }, { "title": "Random Expert Distillation: Imitation Learning via Expert Policy Support Estimation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4038", "id": "4038", "author_site": "Ruohan Wang, Carlo Ciliberto, Pierluigi Vito Amadori, Yiannis Demiris", "author": "Ruohan Wang; Carlo Ciliberto; Pierluigi Vito Amadori; Yiannis Demiris", "abstract": "We consider the problem of imitation learning from a finite set of expert trajectories, without access to reinforcement signals. The classical approach of extracting the expert\u2019s reward function via inverse reinforcement learning, followed by reinforcement learning is indirect and may be computationally expensive. Recent generative adversarial methods based on matching the policy distribution between the expert and the agent could be unstable during training. We propose a new framework for imitation learning by estimating the support of the expert policy to compute a fixed reward function, which allows us to re-frame imitation learning within the standard reinforcement learning setting. We demonstrate the efficacy of our reward function on both discrete and continuous domains, achieving comparable or better performance than the state of the art under different reinforcement learning algorithms.", "bibtex": "@InProceedings{pmlr-v97-wang19d,\n title = \t {Random Expert Distillation: Imitation Learning via Expert Policy Support Estimation},\n author = {Wang, Ruohan and Ciliberto, Carlo and Amadori, Pierluigi Vito and Demiris, Yiannis},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6536--6544},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19d/wang19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19d.html},\n abstract = \t {We consider the problem of imitation learning from a finite set of expert trajectories, without access to reinforcement signals. 
The classical approach of extracting the expert\u2019s reward function via inverse reinforcement learning, followed by reinforcement learning is indirect and may be computationally expensive. Recent generative adversarial methods based on matching the policy distribution between the expert and the agent could be unstable during training. We propose a new framework for imitation learning by estimating the support of the expert policy to compute a fixed reward function, which allows us to re-frame imitation learning within the standard reinforcement learning setting. We demonstrate the efficacy of our reward function on both discrete and continuous domains, achieving comparable or better performance than the state of the art under different reinforcement learning algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19d/wang19d.pdf", "supp": "", "pdf_size": 5145017, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2838461363780817206&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/wang19d.html" }, { "title": "Random Function Priors for Correlation Modeling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3657", "id": "3657", "author_site": "Aonan Zhang, John Paisley", "author": "Aonan Zhang; John Paisley", "abstract": "The likelihood model of high dimensional data $X_n$ can often be expressed as $p(X_n|Z_n,\\theta)$, where $\\theta\\mathrel{\\mathop:}=(\\theta_k)_{k\\in[K]}$ is a collection of hidden features shared across objects, indexed by $n$, and $Z_n$ is a non-negative factor loading vector with $K$ entries where $Z_{nk}$ indicates the strength of $\\theta_k$ used to express $X_n$. In this paper, we introduce random function priors for $Z_n$ for modeling correlations among its $K$ dimensions $Z_{n1}$ through $Z_{nK}$, which we call", "bibtex": "@InProceedings{pmlr-v97-zhang19k,\n title = \t {Random Function Priors for Correlation Modeling},\n author = {Zhang, Aonan and Paisley, John},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7424--7433},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19k/zhang19k.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19k.html},\n abstract = \t {The likelihood model of high dimensional data $X_n$ can often be expressed as $p(X_n|Z_n,\\theta)$, where $\\theta\\mathrel{\\mathop:}=(\\theta_k)_{k\\in[K]}$ is a collection of hidden features shared across objects, indexed by $n$, and $Z_n$ is a non-negative factor loading vector with $K$ entries where $Z_{nk}$ indicates the strength of $\\theta_k$ used to express $X_n$. 
In this paper, we introduce random function priors for $Z_n$ for modeling correlations among its $K$ dimensions $Z_{n1}$ through $Z_{nK}$, which we call", "pdf": "http://proceedings.mlr.press/v97/zhang19k/zhang19k.pdf", "supp": "", "pdf_size": 5958163, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7970715264595524454&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electrical Engineering & Data Science Institute, Columbia University, New York, USA; Department of Electrical Engineering & Data Science Institute, Columbia University, New York, USA", "aff_domain": "columbia.edu; ", "email": "columbia.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/zhang19k.html", "aff_unique_index": "0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "Department of Electrical Engineering & Data Science Institute", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Random Matrix Improved Covariance Estimation for a Large Class of Metrics", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3740", "id": "3740", "author_site": "Malik TIOMOKO A, Romain Couillet, Florent BOUCHARD, Guillaume GINOLHAC", "author": "Malik Tiomoko; Romain Couillet; Florent Bouchard; Guillaume Ginolhac", "abstract": "Relying on recent advances in statistical estimation of covariance distances based on random matrix theory, this article proposes an improved covariance and precision matrix estimation for a wide family of metrics. The method is shown to largely outperform the sample covariance matrix estimate and to compete with state-of-the-art methods, while at the same time being computationally simpler and faster. Applications to linear and quadratic discriminant analyses also show significant gains, therefore suggesting practical interest to statistical machine learning.", "bibtex": "@InProceedings{pmlr-v97-tiomoko19a,\n title = \t {Random Matrix Improved Covariance Estimation for a Large Class of Metrics},\n author = {Tiomoko, Malik and Couillet, Romain and Bouchard, Florent and Ginolhac, Guillaume},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6254--6263},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tiomoko19a/tiomoko19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tiomoko19a.html},\n abstract = \t {Relying on recent advances in statistical estimation of covariance distances based on random matrix theory, this article proposes an improved covariance and precision matrix estimation for a wide family of metrics. The method is shown to largely outperform the sample covariance matrix estimate and to compete with state-of-the-art methods, while at the same time being computationally simpler and faster. 
Applications to linear and quadratic discriminant analyses also show significant gains, therefore suggesting practical interest to statistical machine learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/tiomoko19a/tiomoko19a.pdf", "supp": "", "pdf_size": 370167, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14563895050572661736&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 22, "aff": "CentraleSup\u00e9lec, University ParisSaclay, France+GIPSA-lab, University Grenoble-Alpes, France; LISTIC, University Savoie Mont-Blanc, France; LISTIC, University Savoie Mont-Blanc, France; CentraleSup\u00e9lec, University ParisSaclay, France+GIPSA-lab, University Grenoble-Alpes, France", "aff_domain": "gipsa-lab.grenoble-inp.fr;univ-smb.fr;univ-smb.fr;gipsa-lab.grenoble-inp.fr", "email": "gipsa-lab.grenoble-inp.fr;univ-smb.fr;univ-smb.fr;gipsa-lab.grenoble-inp.fr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/tiomoko19a.html", "aff_unique_index": "0+1;2;2;0+1", "aff_unique_norm": "CentraleSup\u00e9lec;University Grenoble-Alpes;University Savoie Mont-Blanc", "aff_unique_dep": ";GIPSA-lab;LISTIC", "aff_unique_url": "https://www.centralesupelec.fr;https://www.univ-grenoble-alpes.fr;https://www.univ-smb.fr", "aff_unique_abbr": "CS;;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "University Paris-Saclay;", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "France" }, { "title": "Random Shuffling Beats SGD after Finite Epochs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4126", "id": "4126", "author_site": "Jeff HaoChen, Suvrit Sra", "author": "Jeff Haochen; Suvrit Sra", "abstract": "A long-standing problem in stochastic optimization is proving that \\rsgd, the without-replacement version of \\sgd, converges faster than the usual with-replacement \\sgd. Building upon\u00a0\\citep{gurbuzbalaban2015random}, we present the", "bibtex": "@InProceedings{pmlr-v97-haochen19a,\n title = \t {Random Shuffling Beats {SGD} after Finite Epochs},\n author = {Haochen, Jeff and Sra, Suvrit},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2624--2633},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/haochen19a/haochen19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/haochen19a.html},\n abstract = \t {A long-standing problem in stochastic optimization is proving that \\rsgd, the without-replacement version of \\sgd, converges faster than the usual with-replacement \\sgd. 
Building upon\u00a0\\citep{gurbuzbalaban2015random}, we present the", "pdf": "http://proceedings.mlr.press/v97/haochen19a/haochen19a.pdf", "supp": "", "pdf_size": 315108, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18118400213565478652&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Institute for Interdisciplinary Information Sciences, Tsinghua University; Massachusetts Institute of Technology", "aff_domain": "mails.tsinghua.edu.cn;mit.edu", "email": "mails.tsinghua.edu.cn;mit.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/haochen19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Tsinghua University;Massachusetts Institute of Technology", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://web.mit.edu", "aff_unique_abbr": "Tsinghua;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "title": "Random Walks on Hypergraphs with Edge-Dependent Vertex Weights", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4240", "id": "4240", "author_site": "Uthsav Chitra, Benjamin Raphael", "author": "Uthsav Chitra; Benjamin Raphael", "abstract": "Hypergraphs are used in machine learning to model higher-order relationships in data. While spectral methods for graphs are well-established, spectral theory for hypergraphs remains an active area of research. In this paper, we use random walks to develop a spectral theory for hypergraphs with edge-dependent vertex weights: hypergraphs where every vertex v has a weight $\\gamma_e(v)$ for each incident hyperedge e that describes the contribution of v to the hyperedge e. We derive a random walk-based hypergraph Laplacian, and bound the mixing time of random walks on such hypergraphs. Moreover, we give conditions under which random walks on such hypergraphs are equivalent to random walks on graphs. As a corollary, we show that current machine learning methods that rely on Laplacians derived from random walks on hypergraphs with edge-independent vertex weights do not utilize higher-order relationships in the data. Finally, we demonstrate the advantages of hypergraphs with edge-dependent vertex weights on ranking applications using real-world datasets.", "bibtex": "@InProceedings{pmlr-v97-chitra19a,\n title = \t {Random Walks on Hypergraphs with Edge-Dependent Vertex Weights},\n author = {Chitra, Uthsav and Raphael, Benjamin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1172--1181},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chitra19a/chitra19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chitra19a.html},\n abstract = \t {Hypergraphs are used in machine learning to model higher-order relationships in data. While spectral methods for graphs are well-established, spectral theory for hypergraphs remains an active area of research. In this paper, we use random walks to develop a spectral theory for hypergraphs with edge-dependent vertex weights: hypergraphs where every vertex v has a weight $\\gamma_e(v)$ for each incident hyperedge e that describes the contribution of v to the hyperedge e. 
We derive a random walk-based hypergraph Laplacian, and bound the mixing time of random walks on such hypergraphs. Moreover, we give conditions under which random walks on such hypergraphs are equivalent to random walks on graphs. As a corollary, we show that current machine learning methods that rely on Laplacians derived from random walks on hypergraphs with edge-independent vertex weights do not utilize higher-order relationships in the data. Finally, we demonstrate the advantages of hypergraphs with edge-dependent vertex weights on ranking applications using real-world datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/chitra19a/chitra19a.pdf", "supp": "", "pdf_size": 337041, "gs_citation": 144, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1402527715954592074&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Department of Computer Science, Princeton University; Department of Computer Science, Princeton University", "aff_domain": "cs.princeton.edu;cs.princeton.edu", "email": "cs.princeton.edu;cs.princeton.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/chitra19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Rao-Blackwellized Stochastic Gradients for Discrete Distributions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4277", "id": "4277", "author_site": "Runjing Liu, Jeffrey Regier, Nilesh Tripuraneni, Michael Jordan, Jon McAuliffe", "author": "Runjing Liu; Jeffrey Regier; Nilesh Tripuraneni; Michael Jordan; Jon Mcauliffe", "abstract": "We wish to compute the gradient of an expectation over a finite or countably infinite sample space having K $\\leq$ $\\infty$ categories. When K is indeed infinite, or finite but very large, the relevant summation is intractable. Accordingly, various stochastic gradient estimators have been proposed. In this paper, we describe a technique that can be applied to reduce the variance of any such estimator, without changing its bias{\u2014}in particular, unbiasedness is retained. We show that our technique is an instance of Rao-Blackwellization, and we demonstrate the improvement it yields on a semi-supervised classification problem and a pixel attention task.", "bibtex": "@InProceedings{pmlr-v97-liu19c,\n title = \t {Rao-Blackwellized Stochastic Gradients for Discrete Distributions},\n author = {Liu, Runjing and Regier, Jeffrey and Tripuraneni, Nilesh and Jordan, Michael and Mcauliffe, Jon},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4023--4031},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liu19c/liu19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/liu19c.html},\n abstract = \t {We wish to compute the gradient of an expectation over a finite or countably infinite sample space having K $\\leq$ $\\infty$ categories. When K is indeed infinite, or finite but very large, the relevant summation is intractable. 
Accordingly, various stochastic gradient estimators have been proposed. In this paper, we describe a technique that can be applied to reduce the variance of any such estimator, without changing its bias{\u2014}in particular, unbiasedness is retained. We show that our technique is an instance of Rao-Blackwellization, and we demonstrate the improvement it yields on a semi-supervised classification problem and a pixel attention task.}\n}", "pdf": "http://proceedings.mlr.press/v97/liu19c/liu19c.pdf", "supp": "", "pdf_size": 468944, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12116217648667930393&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Statistics, University of California, Berkeley + Department of Electrical Engineering and Computer Sciences, University of California, Berkeley; Department of Electrical Engineering and Computer Sciences, University of California, Berkeley; Department of Electrical Engineering and Computer Sciences, University of California, Berkeley; Department of Statistics, University of California, Berkeley + Department of Electrical Engineering and Computer Sciences, University of California, Berkeley + The Voleon Group; Department of Statistics, University of California, Berkeley + The Voleon Group", "aff_domain": "berkeley.edu; ; ; ; ", "email": "berkeley.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/liu19c.html", "aff_unique_index": "0+0;0;0;0+0+1;0+1", "aff_unique_norm": "University of California, Berkeley;Voleon Group", "aff_unique_dep": "Department of Statistics;", "aff_unique_url": "https://www.berkeley.edu;", "aff_unique_abbr": "UC Berkeley;", "aff_campus_unique_index": "0+0;0;0;0+0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0+0;0;0;0+0+0;0+0", "aff_country_unique": "United States" }, { "title": "Rate Distortion For Model Compression:From Theory To Practice", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3765", "id": "3765", "author_site": "Weihao Gao, Yu-Han Liu, Chong Wang, Sewoong Oh", "author": "Weihao Gao; Yu-Han Liu; Chong Wang; Sewoong Oh", "abstract": "The enormous size of modern deep neural networks makes it challenging to deploy those models in memory and communication limited scenarios. Thus, compressing a trained model without a significant loss in performance has become an increasingly important task. Tremendous advances have been made recently, where the main technical building blocks are pruning, quantization, and low-rank factorization. In this paper, we propose principled approaches to improve upon the common heuristics used in those building blocks, by studying the fundamental limit for model compression via the rate distortion theory. We prove a lower bound for the rate distortion function for model compression and prove its achievability for linear models. Although this achievable compression scheme is intractable in practice, this analysis motivates a novel objective function for model compression, which can be used to improve classes of model compressor such as pruning or quantization. Theoretically, we prove that the proposed scheme is optimal for compressing one-hidden-layer ReLU neural networks. 
Empirically, we show that the proposed scheme improves upon the baseline in the compression-accuracy tradeoff.", "bibtex": "@InProceedings{pmlr-v97-gao19c,\n title = \t {Rate Distortion For Model {C}ompression:{F}rom Theory To Practice},\n author = {Gao, Weihao and Liu, Yu-Han and Wang, Chong and Oh, Sewoong},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2102--2111},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gao19c/gao19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/gao19c.html},\n abstract = \t {The enormous size of modern deep neural networks makes it challenging to deploy those models in memory and communication limited scenarios. Thus, compressing a trained model without a significant loss in performance has become an increasingly important task. Tremendous advances have been made recently, where the main technical building blocks are pruning, quantization, and low-rank factorization. In this paper, we propose principled approaches to improve upon the common heuristics used in those building blocks, by studying the fundamental limit for model compression via the rate distortion theory. We prove a lower bound for the rate distortion function for model compression and prove its achievability for linear models. Although this achievable compression scheme is intractable in practice, this analysis motivates a novel objective function for model compression, which can be used to improve classes of model compressor such as pruning or quantization. Theoretically, we prove that the proposed scheme is optimal for compressing one-hidden-layer ReLU neural networks. 
Empirically, we show that the proposed scheme improves upon the baseline in the compression-accuracy tradeoff.}\n}", "pdf": "http://proceedings.mlr.press/v97/gao19c/gao19c.pdf", "supp": "", "pdf_size": 378646, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6363968792782918425&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Electrical and Computer Engineering, University of Illinois at Urbana-Champaign + Google; Google, Inc.; Bytedance, Inc.; Department of Computer Science, University of Washington", "aff_domain": "illinois.edu;google.com;gmail.com;cs.washington.edu", "email": "illinois.edu;google.com;gmail.com;cs.washington.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/gao19c.html", "aff_unique_index": "0+1;1;2;3", "aff_unique_norm": "University of Illinois Urbana-Champaign;Google;ByteDance;University of Washington", "aff_unique_dep": "Department of Electrical and Computer Engineering;Google;;Department of Computer Science", "aff_unique_url": "https://illinois.edu;https://www.google.com;https://www.bytedance.com;https://www.washington.edu", "aff_unique_abbr": "UIUC;Google;Bytedance;UW", "aff_campus_unique_index": "0+1;1;3", "aff_campus_unique": "Urbana-Champaign;Mountain View;;Seattle", "aff_country_unique_index": "0+0;0;1;0", "aff_country_unique": "United States;China" }, { "title": "Rates of Convergence for Sparse Variational Gaussian Process Regression", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3737", "id": "3737", "author_site": "David Burt, Carl E Rasmussen, Mark van der Wilk", "author": "David Burt; Carl Edward Rasmussen; Mark Van Der Wilk", "abstract": "Excellent variational approximations to Gaussian process posteriors have been developed which avoid the $\\mathcal{O}\\left(N^3\\right)$ scaling with dataset size $N$. They reduce the computational cost to $\\mathcal{O}\\left(NM^2\\right)$, with $M\\ll N$ the number of", "bibtex": "@InProceedings{pmlr-v97-burt19a,\n title = \t {Rates of Convergence for Sparse Variational {G}aussian Process Regression},\n author = {Burt, David and Rasmussen, Carl Edward and Van Der Wilk, Mark},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {862--871},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/burt19a/burt19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/burt19a.html},\n abstract = \t {Excellent variational approximations to Gaussian process posteriors have been developed which avoid the $\\mathcal{O}\\left(N^3\\right)$ scaling with dataset size $N$. 
They reduce the computational cost to $\\mathcal{O}\\left(NM^2\\right)$, with $M\\ll N$ the number of", "pdf": "http://proceedings.mlr.press/v97/burt19a/burt19a.pdf", "supp": "", "pdf_size": 775816, "gs_citation": 218, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4719880575444629087&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "University of Cambridge, Cambridge, UK+PROWLER.io, Cambridge, UK; University of Cambridge, Cambridge, UK+PROWLER.io, Cambridge, UK; PROWLER.io, Cambridge, UK", "aff_domain": "cam.ac.uk; ; ", "email": "cam.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/burt19a.html", "aff_unique_index": "0+1;0+1;1", "aff_unique_norm": "University of Cambridge;PROWLER.io", "aff_unique_dep": ";", "aff_unique_url": "https://www.cam.ac.uk;https://prowler.io", "aff_unique_abbr": "Cambridge;", "aff_campus_unique_index": "0+0;0+0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "United Kingdom" }, { "title": "Recurrent Kalman Networks: Factorized Inference in High-Dimensional Deep Feature Spaces", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3721", "id": "3721", "author_site": "Philipp Becker, Harit Pandya, Gregor Gebhardt, Cheng Zhao, C. James Taylor, Gerhard Neumann", "author": "Philipp Becker; Harit Pandya; Gregor Gebhardt; Cheng Zhao; C. James Taylor; Gerhard Neumann", "abstract": "In order to integrate uncertainty estimates into deep time-series modelling, Kalman Filters (KFs) (Kalman et al., 1960) have been integrated with deep learning models, however, such approaches typically rely on approximate inference techniques such as variational inference which makes learning more complex and often less scalable due to approximation errors. We propose a new deep approach to Kalman filtering which can be learned directly in an end-to-end manner using backpropagation without additional approximations. Our approach uses a high-dimensional factorized latent state representation for which the Kalman updates simplify to scalar operations and thus avoids hard to backpropagate, computationally heavy and potentially unstable matrix inversions. Moreover, we use locally linear dynamic models to efficiently propagate the latent state to the next time step. The resulting network architecture, which we call Recurrent Kalman Network (RKN), can be used for any time-series data, similar to a LSTM (Hochreiter & Schmidhuber, 1997) but uses an explicit representation of uncertainty. As shown by our experiments, the RKN obtains much more accurate uncertainty estimates than an LSTM or Gated Recurrent Units (GRUs) (Cho et al., 2014) while also showing a slightly improved prediction performance and outperforms various recent generative models on an image imputation task.", "bibtex": "@InProceedings{pmlr-v97-becker19a,\n title = \t {Recurrent Kalman Networks: Factorized Inference in High-Dimensional Deep Feature Spaces},\n author = {Becker, Philipp and Pandya, Harit and Gebhardt, Gregor and Zhao, Cheng and Taylor, C. 
James and Neumann, Gerhard},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {544--552},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/becker19a/becker19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/becker19a.html},\n abstract = \t {In order to integrate uncertainty estimates into deep time-series modelling, Kalman Filters (KFs) (Kalman et al., 1960) have been integrated with deep learning models, however, such approaches typically rely on approximate inference techniques such as variational inference which makes learning more complex and often less scalable due to approximation errors. We propose a new deep approach to Kalman filtering which can be learned directly in an end-to-end manner using backpropagation without additional approximations. Our approach uses a high-dimensional factorized latent state representation for which the Kalman updates simplify to scalar operations and thus avoids hard to backpropagate, computationally heavy and potentially unstable matrix inversions. Moreover, we use locally linear dynamic models to efficiently propagate the latent state to the next time step. The resulting network architecture, which we call Recurrent Kalman Network (RKN), can be used for any time-series data, similar to a LSTM (Hochreiter & Schmidhuber, 1997) but uses an explicit representation of uncertainty. As shown by our experiments, the RKN obtains much more accurate uncertainty estimates than an LSTM or Gated Recurrent Units (GRUs) (Cho et al., 2014) while also showing a slightly improved prediction performance and outperforms various recent generative models on an image imputation task.}\n}", "pdf": "http://proceedings.mlr.press/v97/becker19a/becker19a.pdf", "supp": "", "pdf_size": 534875, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1178874295303321595&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Computational Learning for Autonomous Systems, TU Darmstadt, Darmstadt, Germany+Bosch Center for Artificial Intelligence, Renningen, Germany+University of T\u00fcbingen, T\u00fcbingen, Germany; Lincoln Center for Autonomous Systems, University of Lincoln, Lincoln, UK; Computational Learning for Autonomous Systems, TU Darmstadt, Darmstadt, Germany; Extreme Robotics Lab, University of Birmingham, Birmingham, UK; Engineering Department, Lancaster University, Lancaster, UK; Lincoln Center for Autonomous Systems, University of Lincoln, Lincoln, UK+Bosch Center for Artificial Intelligence, Renningen, Germany+University of T\u00fcbingen, T\u00fcbingen, Germany", "aff_domain": "googlemail.com; ; ; ; ; ", "email": "googlemail.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/becker19a.html", "aff_unique_index": "0+1+2;3;0;4;5;3+1+2", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt;Bosch Center for Artificial Intelligence;University of T\u00fcbingen;University of Lincoln;University of Birmingham;Lancaster University", "aff_unique_dep": "Computational Learning for Autonomous Systems;Artificial Intelligence;;Lincoln Center for Autonomous Systems;Extreme Robotics Lab;Engineering Department", "aff_unique_url": 
"https://www.tu-darmstadt.de;https://www.bosch-ai.com;https://www.uni-tuebingen.de/;https://www.lincoln.ac.uk;https://www.birmingham.ac.uk;https://www.lancaster.ac.uk", "aff_unique_abbr": "TU Darmstadt;BCAI;Uni T\u00fcbingen;UoL;UoB;", "aff_campus_unique_index": "0+1+2;3;0;4;5;3+1+2", "aff_campus_unique": "Darmstadt;Renningen;T\u00fcbingen;Lincoln;Birmingham;Lancaster", "aff_country_unique_index": "0+0+0;1;0;1;1;1+0+0", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Recursive Sketches for Modular Deep Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3654", "id": "3654", "author_site": "Badih Ghazi, Rina Panigrahy, Joshua R. Wang", "author": "Badih Ghazi; Rina Panigrahy; Joshua Wang", "abstract": "We present a mechanism to compute a sketch (succinct summary) of how a complex modular deep network processes its inputs. The sketch summarizes essential information about the inputs and outputs of the network and can be used to quickly identify key components and summary statistics of the inputs. Furthermore, the sketch is recursive and can be unrolled to identify sub-components of these components and so forth, capturing a potentially complicated DAG structure. These sketches erase gracefully; even if we erase a fraction of the sketch at random, the remainder still retains the \u201chigh-weight\u201d information present in the original sketch. The sketches can also be organized in a repository to implicitly form a \u201cknowledge graph\u201d; it is possible to quickly retrieve sketches in the repository that are related to a sketch of interest; arranged in this fashion, the sketches can also be used to learn emerging concepts by looking for new clusters in sketch space. Finally, in the scenario where we want to learn a ground truth deep network, we show that augmenting input/output pairs with these sketches can theoretically make it easier to do so.", "bibtex": "@InProceedings{pmlr-v97-ghazi19a,\n title = \t {Recursive Sketches for Modular Deep Learning},\n author = {Ghazi, Badih and Panigrahy, Rina and Wang, Joshua},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2211--2220},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ghazi19a/ghazi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ghazi19a.html},\n abstract = \t {We present a mechanism to compute a sketch (succinct summary) of how a complex modular deep network processes its inputs. The sketch summarizes essential information about the inputs and outputs of the network and can be used to quickly identify key components and summary statistics of the inputs. Furthermore, the sketch is recursive and can be unrolled to identify sub-components of these components and so forth, capturing a potentially complicated DAG structure. These sketches erase gracefully; even if we erase a fraction of the sketch at random, the remainder still retains the \u201chigh-weight\u201d information present in the original sketch. 
The sketches can also be organized in a repository to implicitly form a \u201cknowledge graph\u201d; it is possible to quickly retrieve sketches in the repository that are related to a sketch of interest; arranged in this fashion, the sketches can also be used to learn emerging concepts by looking for new clusters in sketch space. Finally, in the scenario where we want to learn a ground truth deep network, we show that augmenting input/output pairs with these sketches can theoretically make it easier to do so.}\n}", "pdf": "http://proceedings.mlr.press/v97/ghazi19a/ghazi19a.pdf", "supp": "", "pdf_size": 345837, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16706646732256077451&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Google Research, Mountain View, CA, USA; Google Research, Mountain View, CA, USA; Google Research, Mountain View, CA, USA", "aff_domain": "google.com;google.com;google.com", "email": "google.com;google.com;google.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ghazi19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Refined Complexity of PCA with Outliers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3827", "id": "3827", "author_site": "Kirill Simonov, Fedor Fomin, Petr Golovach, Fahad Panolan", "author": "Kirill Simonov; Fedor Fomin; Petr Golovach; Fahad Panolan", "abstract": "Principal component analysis (PCA) is one of the most fundamental procedures in exploratory data analysis and is the basic step in applications ranging from quantitative finance and bioinformatics to image analysis and neuroscience. However, it is well-documented that the applicability of PCA in many real scenarios could be constrained by an \"immune deficiency\" to outliers such as corrupted observations. We consider the following algorithmic question about the PCA with outliers. For a set of $n$ points in $\\mathbb{R}^{d}$, how to learn a subset of points, say 1% of the total number of points, such that the remaining part of the points is best fit into some unknown $r$-dimensional subspace? We provide a rigorous algorithmic analysis of the problem. We show that the problem is solvable in time $n^{O(d^2)}$. In particular, for constant dimension the problem is solvable in polynomial time. 
We complement the algorithmic result by the lower bound, showing that unless Exponential Time Hypothesis fails, in time $f(d)n^{o(d)}$, for any function $f$ of $d$, it is impossible not only to solve the problem exactly but even to approximate it within a constant factor.", "bibtex": "@InProceedings{pmlr-v97-simonov19a,\n title = \t {Refined Complexity of {PCA} with Outliers},\n author = {Simonov, Kirill and Fomin, Fedor and Golovach, Petr and Panolan, Fahad},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5818--5826},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/simonov19a/simonov19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/simonov19a.html},\n abstract = \t {Principal component analysis (PCA) is one of the most fundamental procedures in exploratory data analysis and is the basic step in applications ranging from quantitative finance and bioinformatics to image analysis and neuroscience. However, it is well-documented that the applicability of PCA in many real scenarios could be constrained by an \"immune deficiency\" to outliers such as corrupted observations. We consider the following algorithmic question about the PCA with outliers. For a set of $n$ points in $\\mathbb{R}^{d}$, how to learn a subset of points, say 1% of the total number of points, such that the remaining part of the points is best fit into some unknown $r$-dimensional subspace? We provide a rigorous algorithmic analysis of the problem. We show that the problem is solvable in time $n^{O(d^2)}$. In particular, for constant dimension the problem is solvable in polynomial time. 
We complement the algorithmic result by the lower bound, showing that unless Exponential Time Hypothesis fails, in time $f(d)n^{o(d)}$, for any function $f$ of $d$, it is impossible not only to solve the problem exactly but even to approximate it within a constant factor.}\n}", "pdf": "http://proceedings.mlr.press/v97/simonov19a/simonov19a.pdf", "supp": "", "pdf_size": 328965, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14272893377890545039&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff": "Department of Informatics, University of Bergen, Norway; Department of Informatics, University of Bergen, Norway; Department of Informatics, University of Bergen, Norway; Department of Informatics, University of Bergen, Norway", "aff_domain": "ii.uib.no;uib.no;uib.no;uib.no", "email": "ii.uib.no;uib.no;uib.no;uib.no", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/simonov19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Bergen", "aff_unique_dep": "Department of Informatics", "aff_unique_url": "https://www.uib.no", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Norway" }, { "title": "Regret Circuits: Composability of Regret Minimizers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3856", "id": "3856", "author_site": "Gabriele Farina, Christian Kroer, Tuomas Sandholm", "author": "Gabriele Farina; Christian Kroer; Tuomas Sandholm", "abstract": "Regret minimization is a powerful tool for solving large-scale problems; it was recently used in breakthrough results for large-scale extensive-form game solving. This was achieved by composing simplex regret minimizers into an overall regret-minimization framework for extensive-form game strategy spaces. In this paper we study the general composability of regret minimizers. We derive a calculus for constructing regret minimizers for composite convex sets that are obtained from convexity-preserving operations on simpler convex sets. We show that local regret minimizers for the simpler sets can be combined with additional regret minimizers into an aggregate regret minimizer for the composite set. As one application, we show that the CFR framework can be constructed easily from our framework. We also show ways to include curtailing (constraining) operations into our framework. For one, they enable the construction of CFR generalization for extensive-form games with general convex strategy constraints that can cut across decision points.", "bibtex": "@InProceedings{pmlr-v97-farina19b,\n title = \t {Regret Circuits: Composability of Regret Minimizers},\n author = {Farina, Gabriele and Kroer, Christian and Sandholm, Tuomas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1863--1872},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/farina19b/farina19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/farina19b.html},\n abstract = \t {Regret minimization is a powerful tool for solving large-scale problems; it was recently used in breakthrough results for large-scale extensive-form game solving. 
This was achieved by composing simplex regret minimizers into an overall regret-minimization framework for extensive-form game strategy spaces. In this paper we study the general composability of regret minimizers. We derive a calculus for constructing regret minimizers for composite convex sets that are obtained from convexity-preserving operations on simpler convex sets. We show that local regret minimizers for the simpler sets can be combined with additional regret minimizers into an aggregate regret minimizer for the composite set. As one application, we show that the CFR framework can be constructed easily from our framework. We also show ways to include curtailing (constraining) operations into our framework. For one, they enable the construction of CFR generalization for extensive-form games with general convex strategy constraints that can cut across decision points.}\n}", "pdf": "http://proceedings.mlr.press/v97/farina19b/farina19b.pdf", "supp": "", "pdf_size": 529931, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17975701751697098615&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Computer Science Department, Carnegie Mellon University, Pittsburgh PA 15213+Strategic Machine, Inc.+Strategy Robot, Inc.+Optimized Markets, Inc.; IEOR Department, Columbia University, New York NY 10027; Computer Science Department, Carnegie Mellon University, Pittsburgh PA 15213+Strategic Machine, Inc.+Strategy Robot, Inc.+Optimized Markets, Inc.", "aff_domain": "cs.cmu.edu;columbia.edu;cs.cmu.edu", "email": "cs.cmu.edu;columbia.edu;cs.cmu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/farina19b.html", "aff_unique_index": "0+1+2+3;4;0+1+2+3", "aff_unique_norm": "Carnegie Mellon University;Strategic Machine, Inc.;Strategy Robot, Inc.;Optimized Markets, Inc.;Columbia University", "aff_unique_dep": "Computer Science Department;;;;IEOR Department", "aff_unique_url": "https://www.cmu.edu;;;;https://www.columbia.edu", "aff_unique_abbr": "CMU;;;;Columbia", "aff_campus_unique_index": "0;2;0", "aff_campus_unique": "Pittsburgh;;New York", "aff_country_unique_index": "0+0+0+0;0;0+0+0+0", "aff_country_unique": "United States" }, { "title": "Regularization in directable environments with application to Tetris", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3930", "id": "3930", "author_site": "Jan Malte Lichtenberg, Ozgur Simsek", "author": "Jan Malte Lichtenberg; \u00d6zg\u00fcr \u015eim\u015fek", "abstract": "Learning from small data sets is difficult in the absence of specific domain knowledge. We present a regularized linear model called STEW that benefits from a generic and prevalent form of prior knowledge: feature directions. STEW shrinks weights toward each other, converging to an equal-weights solution in the limit of infinite regularization. We provide theoretical results on the equal-weights solution that explains how STEW can productively trade-off bias and variance. Across a wide range of learning problems, including Tetris, STEW outperformed existing linear models, including ridge regression, the Lasso, and the non-negative Lasso, when feature directions were known. The model proved to be robust to unreliable (or absent) feature directions, still outperforming alternative models under diverse conditions. 
Our results in Tetris were obtained by using a novel approach to learning in sequential decision environments based on multinomial logistic regression.", "bibtex": "@InProceedings{pmlr-v97-lichtenberg19a,\n title = \t {Regularization in directable environments with application to Tetris},\n author = {Lichtenberg, Jan Malte and {\\c{S}im\\c{s}ek}, {\\\"{O}}zg\\\"{u}r},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3953--3962},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lichtenberg19a/lichtenberg19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lichtenberg19a.html},\n abstract = \t {Learning from small data sets is difficult in the absence of specific domain knowledge. We present a regularized linear model called STEW that benefits from a generic and prevalent form of prior knowledge: feature directions. STEW shrinks weights toward each other, converging to an equal-weights solution in the limit of infinite regularization. We provide theoretical results on the equal-weights solution that explains how STEW can productively trade-off bias and variance. Across a wide range of learning problems, including Tetris, STEW outperformed existing linear models, including ridge regression, the Lasso, and the non-negative Lasso, when feature directions were known. The model proved to be robust to unreliable (or absent) feature directions, still outperforming alternative models under diverse conditions. Our results in Tetris were obtained by using a novel approach to learning in sequential decision environments based on multinomial logistic regression.}\n}", "pdf": "http://proceedings.mlr.press/v97/lichtenberg19a/lichtenberg19a.pdf", "supp": "", "pdf_size": 430748, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3730086703392279743&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, University of Bath, Bath, United Kingdom; Department of Computer Science, University of Bath, Bath, United Kingdom", "aff_domain": "bath.ac.uk; ", "email": "bath.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/lichtenberg19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "University of Bath", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.bath.ac.uk", "aff_unique_abbr": "Bath", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Bath", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Rehashing Kernel Evaluation in High Dimensions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4232", "id": "4232", "author_site": "Paris Siminelakis, Kexin Rong, Peter Bailis, Moses Charikar, Philip Levis", "author": "Paris Siminelakis; Kexin Rong; Peter Bailis; Moses Charikar; Philip Levis", "abstract": "Kernel methods are effective but do not scale well to large scale data, especially in high dimensions where the geometric data structures used to accelerate kernel evaluation suffer from the curse of dimensionality. Recent theoretical advances have proposed fast kernel evaluation algorithms leveraging hashing techniques with worst-case asymptotic improvements. 
However, these advances are largely confined to the theoretical realm due to concerns such as super-linear preprocessing time and diminishing gains in non-worst case datasets. In this paper, we close the gap between theory and practice by addressing these challenges via provable and practical procedures for adaptive sample size selection, preprocessing time reduction, and refined variance bounds that quantify the data-dependent performance of random sampling and hashing-based kernel evaluation methods. Our experiments show that these new tools offer up to $10\\times$ improvement in evaluation time on a range of synthetic and real-world datasets.", "bibtex": "@InProceedings{pmlr-v97-siminelakis19a,\n title = \t {Rehashing Kernel Evaluation in High Dimensions},\n author = {Siminelakis, Paris and Rong, Kexin and Bailis, Peter and Charikar, Moses and Levis, Philip},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5789--5798},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/siminelakis19a/siminelakis19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/siminelakis19a.html},\n abstract = \t {Kernel methods are effective but do not scale well to large scale data, especially in high dimensions where the geometric data structures used to accelerate kernel evaluation suffer from the curse of dimensionality. Recent theoretical advances have proposed fast kernel evaluation algorithms leveraging hashing techniques with worst-case asymptotic improvements. However, these advances are largely confined to the theoretical realm due to concerns such as super-linear preprocessing time and diminishing gains in non-worst case datasets. In this paper, we close the gap between theory and practice by addressing these challenges via provable and practical procedures for adaptive sample size selection, preprocessing time reduction, and refined variance bounds that quantify the data-dependent performance of random sampling and hashing-based kernel evaluation methods. 
Our experiments show that these new tools offer up to $10\\times$ improvement in evaluation time on a range of synthetic and real-world datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/siminelakis19a/siminelakis19a.pdf", "supp": "", "pdf_size": 1464327, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7887061389019880539&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Stanford University; Stanford University; Stanford University; Stanford University; Stanford University", "aff_domain": "stanford.edu;stanford.edu; ; ; ", "email": "stanford.edu;stanford.edu; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/siminelakis19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Reinforcement Learning in Configurable Continuous Environments", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4037", "id": "4037", "author_site": "Alberto Maria Metelli, Emanuele Ghelfi, Marcello Restelli", "author": "Alberto Maria Metelli; Emanuele Ghelfi; Marcello Restelli", "abstract": "Configurable Markov Decision Processes (Conf-MDPs) have been recently introduced as an extension of the usual MDP model to account for the possibility of configuring the environment to improve the agent\u2019s performance. Currently, there is still no suitable algorithm to solve the learning problem for real-world Conf-MDPs. In this paper, we fill this gap by proposing a trust-region method, Relative Entropy Model Policy Search (REMPS), able to learn both the policy and the MDP configuration in continuous domains without requiring the knowledge of the true model of the environment. After introducing our approach and providing a finite-sample analysis, we empirically evaluate REMPS on both benchmark and realistic environments by comparing our results with those of the gradient methods.", "bibtex": "@InProceedings{pmlr-v97-metelli19a,\n title = \t {Reinforcement Learning in Configurable Continuous Environments},\n author = {Metelli, Alberto Maria and Ghelfi, Emanuele and Restelli, Marcello},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4546--4555},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/metelli19a/metelli19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/metelli19a.html},\n abstract = \t {Configurable Markov Decision Processes (Conf-MDPs) have been recently introduced as an extension of the usual MDP model to account for the possibility of configuring the environment to improve the agent\u2019s performance. Currently, there is still no suitable algorithm to solve the learning problem for real-world Conf-MDPs. In this paper, we fill this gap by proposing a trust-region method, Relative Entropy Model Policy Search (REMPS), able to learn both the policy and the MDP configuration in continuous domains without requiring the knowledge of the true model of the environment. 
After introducing our approach and providing a finite-sample analysis, we empirically evaluate REMPS on both benchmark and realistic environments by comparing our results with those of the gradient methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/metelli19a/metelli19a.pdf", "supp": "", "pdf_size": 861116, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11874845117841299302&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Politecnico di Milano; Politecnico di Milano; Politecnico di Milano", "aff_domain": "polimi.it; ; ", "email": "polimi.it; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/metelli19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Politecnico di Milano", "aff_unique_dep": "", "aff_unique_url": "https://www.polimi.it", "aff_unique_abbr": "Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Italy" }, { "title": "Relational Pooling for Graph Representations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4298", "id": "4298", "author_site": "Ryan Murphy, Balasubramaniam Srinivasan, Vinayak A Rao, Bruno Ribeiro", "author": "Ryan Murphy; Balasubramaniam Srinivasan; Vinayak Rao; Bruno Ribeiro", "abstract": "This work generalizes graph neural networks (GNNs) beyond those based on the Weisfeiler-Lehman (WL) algorithm, graph Laplacians, and diffusions. Our approach, denoted Relational Pooling (RP), draws from the theory of finite partial exchangeability to provide a framework with maximal representation power for graphs. RP can work with existing graph representation models and, somewhat counterintuitively, can make them even more powerful than the original WL isomorphism test. Additionally, RP allows architectures like Recurrent Neural Networks and Convolutional Neural Networks to be used in a theoretically sound approach for graph classification. We demonstrate improved performance of RP-based graph representations over state-of-the-art methods on a number of tasks.", "bibtex": "@InProceedings{pmlr-v97-murphy19a,\n title = \t {Relational Pooling for Graph Representations},\n author = {Murphy, Ryan and Srinivasan, Balasubramaniam and Rao, Vinayak and Ribeiro, Bruno},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4663--4673},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/murphy19a/murphy19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/murphy19a.html},\n abstract = \t {This work generalizes graph neural networks (GNNs) beyond those based on the Weisfeiler-Lehman (WL) algorithm, graph Laplacians, and diffusions. Our approach, denoted Relational Pooling (RP), draws from the theory of finite partial exchangeability to provide a framework with maximal representation power for graphs. RP can work with existing graph representation models and, somewhat counterintuitively, can make them even more powerful than the original WL isomorphism test. Additionally, RP allows architectures like Recurrent Neural Networks and Convolutional Neural Networks to be used in a theoretically sound approach for graph classification. 
We demonstrate improved performance of RP-based graph representations over state-of-the-art methods on a number of tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/murphy19a/murphy19a.pdf", "supp": "", "pdf_size": 463756, "gs_citation": 325, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6145744994249893945&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Statistics, Purdue University; Department of Computer Science, Purdue University; Department of Statistics, Purdue University; Department of Computer Science, Purdue University", "aff_domain": "purdue.edu; ; ; ", "email": "purdue.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/murphy19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "Department of Statistics", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Remember and Forget for Experience Replay", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4287", "id": "4287", "author_site": "Guido Novati, Petros Koumoutsakos", "author": "Guido Novati; Petros Koumoutsakos", "abstract": "Experience replay (ER) is a fundamental component of off-policy deep reinforcement learning (RL). ER recalls experiences from past iterations to compute gradient estimates for the current policy, increasing data-efficiency. However, the accuracy of such updates may deteriorate when the policy diverges from past behaviors and can undermine the performance of ER. Many algorithms mitigate this issue by tuning hyper-parameters to slow down policy changes. An alternative is to actively enforce the similarity between policy and the experiences in the replay memory. We introduce Remember and Forget Experience Replay (ReF-ER), a novel method that can enhance RL algorithms with parameterized policies. ReF-ER (1) skips gradients computed from experiences that are too unlikely with the current policy and (2) regulates policy changes within a trust region of the replayed behaviors. We couple ReF-ER with Q-learning, deterministic policy gradient and off-policy gradient methods. We find that ReF-ER consistently improves the performance of continuous-action, off-policy RL on fully observable benchmarks and partially observable flow control problems.", "bibtex": "@InProceedings{pmlr-v97-novati19a,\n title = \t {Remember and Forget for Experience Replay},\n author = {Novati, Guido and Koumoutsakos, Petros},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4851--4860},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/novati19a/novati19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/novati19a.html},\n abstract = \t {Experience replay (ER) is a fundamental component of off-policy deep reinforcement learning (RL). ER recalls experiences from past iterations to compute gradient estimates for the current policy, increasing data-efficiency. However, the accuracy of such updates may deteriorate when the policy diverges from past behaviors and can undermine the performance of ER. 
Many algorithms mitigate this issue by tuning hyper-parameters to slow down policy changes. An alternative is to actively enforce the similarity between policy and the experiences in the replay memory. We introduce Remember and Forget Experience Replay (ReF-ER), a novel method that can enhance RL algorithms with parameterized policies. ReF-ER (1) skips gradients computed from experiences that are too unlikely with the current policy and (2) regulates policy changes within a trust region of the replayed behaviors. We couple ReF-ER with Q-learning, deterministic policy gradient and off-policy gradient methods. We find that ReF-ER consistently improves the performance of continuous-action, off-policy RL on fully observable benchmarks and partially observable flow control problems.}\n}", "pdf": "http://proceedings.mlr.press/v97/novati19a/novati19a.pdf", "supp": "", "pdf_size": 1315497, "gs_citation": 129, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13050806613216384530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Computational Science & Engineering Laboratory, ETH Zurich, Zurich, Switzerland; Computational Science & Engineering Laboratory, ETH Zurich, Zurich, Switzerland", "aff_domain": "ethz.ch;ethz.ch", "email": "ethz.ch;ethz.ch", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/novati19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Computational Science & Engineering Laboratory", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Zurich", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "title": "Repairing without Retraining: Avoiding Disparate Impact with Counterfactual Distributions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3898", "id": "3898", "author_site": "Hao Wang, Berk Ustun, Flavio Calmon", "author": "Hao Wang; Berk Ustun; Flavio Calmon", "abstract": "When the performance of a machine learning model varies over groups defined by sensitive attributes (e.g., gender or ethnicity), the performance disparity can be expressed in terms of the probability distributions of the input and output variables over each group. In this paper, we exploit this fact to reduce the disparate impact of a fixed classification model over a population of interest. Given a black-box classifier, we aim to eliminate the performance gap by perturbing the distribution of input variables for the disadvantaged group. We refer to the perturbed distribution as a counterfactual distribution, and characterize its properties for common fairness criteria. We introduce a descent algorithm to learn a counterfactual distribution from data. We then discuss how the estimated distribution can be used to build a data preprocessor that can reduce disparate impact without training a new model. 
We validate our approach through experiments on real-world datasets, showing that it can repair different forms of disparity without a significant drop in accuracy.", "bibtex": "@InProceedings{pmlr-v97-wang19l,\n title = \t {Repairing without Retraining: Avoiding Disparate Impact with Counterfactual Distributions},\n author = {Wang, Hao and Ustun, Berk and Calmon, Flavio},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6618--6627},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19l/wang19l.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19l.html},\n abstract = \t {When the performance of a machine learning model varies over groups defined by sensitive attributes (e.g., gender or ethnicity), the performance disparity can be expressed in terms of the probability distributions of the input and output variables over each group. In this paper, we exploit this fact to reduce the disparate impact of a fixed classification model over a population of interest. Given a black-box classifier, we aim to eliminate the performance gap by perturbing the distribution of input variables for the disadvantaged group. We refer to the perturbed distribution as a counterfactual distribution, and characterize its properties for common fairness criteria. We introduce a descent algorithm to learn a counterfactual distribution from data. We then discuss how the estimated distribution can be used to build a data preprocessor that can reduce disparate impact without training a new model. We validate our approach through experiments on real-world datasets, showing that it can repair different forms of disparity without a significant drop in accuracy.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19l/wang19l.pdf", "supp": "", "pdf_size": 475487, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16561986856093629430&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Harvard University, MA, USA; Harvard University, MA, USA; Harvard University, MA, USA", "aff_domain": "g.harvard.edu;seas.harvard.edu;seas.harvard.edu", "email": "g.harvard.edu;seas.harvard.edu;seas.harvard.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/wang19l.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Replica Conditional Sequential Monte Carlo", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3766", "id": "3766", "author_site": "Alex Shestopaloff, Arnaud Doucet", "author": "Alex Shestopaloff; Arnaud Doucet", "abstract": "We propose a Markov chain Monte Carlo (MCMC) scheme to perform state inference in non-linear non-Gaussian state-space models. Current state-of-the-art methods to address this problem rely on particle MCMC techniques and its variants, such as the iterated conditional Sequential Monte Carlo (cSMC) scheme, which uses a Sequential Monte Carlo (SMC) type proposal within MCMC. 
A deficiency of standard SMC proposals is that they only use observations up to time $t$ to propose states at time $t$ when an entire observation sequence is available. More sophisticated SMC based on lookahead techniques could be used but they can be difficult to put in practice. We propose here replica cSMC where we build SMC proposals for one replica using information from the entire observation sequence by conditioning on the states of the other replicas. This approach is easily parallelizable and we demonstrate its excellent empirical performance when compared to the standard iterated cSMC scheme at fixed computational complexity.", "bibtex": "@InProceedings{pmlr-v97-shestopaloff19a,\n title = \t {Replica Conditional Sequential {M}onte {C}arlo},\n author = {Shestopaloff, Alex and Doucet, Arnaud},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5749--5757},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shestopaloff19a/shestopaloff19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/shestopaloff19a.html},\n abstract = \t {We propose a Markov chain Monte Carlo (MCMC) scheme to perform state inference in non-linear non-Gaussian state-space models. Current state-of-the-art methods to address this problem rely on particle MCMC techniques and its variants, such as the iterated conditional Sequential Monte Carlo (cSMC) scheme, which uses a Sequential Monte Carlo (SMC) type proposal within MCMC. A deficiency of standard SMC proposals is that they only use observations up to time $t$ to propose states at time $t$ when an entire observation sequence is available. More sophisticated SMC based on lookahead techniques could be used but they can be difficult to put in practice. We propose here replica cSMC where we build SMC proposals for one replica using information from the entire observation sequence by conditioning on the states of the other replicas. 
This approach is easily parallelizable and we demonstrate its excellent empirical performance when compared to the standard iterated cSMC scheme at fixed computational complexity.}\n}", "pdf": "http://proceedings.mlr.press/v97/shestopaloff19a/shestopaloff19a.pdf", "supp": "", "pdf_size": 657939, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8937563905514647283&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "School of Mathematics, University of Edinburgh, Edinburgh, UK + The Alan Turing Institute, London, UK; Department of Statistics, University of Oxford, Oxford, UK + The Alan Turing Institute, London, UK", "aff_domain": "turing.ac.uk; ", "email": "turing.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/shestopaloff19a.html", "aff_unique_index": "0+1;2+1", "aff_unique_norm": "University of Edinburgh;Alan Turing Institute;University of Oxford", "aff_unique_dep": "School of Mathematics;;Department of Statistics", "aff_unique_url": "https://www.ed.ac.uk;https://www.turing.ac.uk;https://www.ox.ac.uk", "aff_unique_abbr": "Edinburgh;ATI;Oxford", "aff_campus_unique_index": "0+1;2+1", "aff_campus_unique": "Edinburgh;London;Oxford", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "United Kingdom" }, { "title": "Rethinking Lossy Compression: The Rate-Distortion-Perception Tradeoff", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3643", "id": "3643", "author_site": "Yochai Blau, Tomer Michaeli", "author": "Yochai Blau; Tomer Michaeli", "abstract": "Lossy compression algorithms are typically designed and analyzed through the lens of Shannon\u2019s rate-distortion theory, where the goal is to achieve the lowest possible distortion (e.g., low MSE or high SSIM) at any given bit rate. However, in recent years, it has become increasingly accepted that \"low distortion\" is not a synonym for \"high perceptual quality\", and in fact optimization of one often comes at the expense of the other. In light of this understanding, it is natural to seek for a generalization of rate-distortion theory which takes perceptual quality into account. In this paper, we adopt the mathematical definition of perceptual quality recently proposed by Blau & Michaeli (2018), and use it to study the three-way tradeoff between rate, distortion, and perception. We show that restricting the perceptual quality to be high, generally leads to an elevation of the rate-distortion curve, thus necessitating a sacrifice in either rate or distortion. 
We prove several fundamental properties of this triple-tradeoff, calculate it in closed form for a Bernoulli source, and illustrate it visually on a toy MNIST example.", "bibtex": "@InProceedings{pmlr-v97-blau19a,\n title = \t {Rethinking Lossy Compression: The Rate-Distortion-Perception Tradeoff},\n author = {Blau, Yochai and Michaeli, Tomer},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {675--685},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/blau19a/blau19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/blau19a.html},\n abstract = \t {Lossy compression algorithms are typically designed and analyzed through the lens of Shannon\u2019s rate-distortion theory, where the goal is to achieve the lowest possible distortion (e.g., low MSE or high SSIM) at any given bit rate. However, in recent years, it has become increasingly accepted that \"low distortion\" is not a synonym for \"high perceptual quality\", and in fact optimization of one often comes at the expense of the other. In light of this understanding, it is natural to seek for a generalization of rate-distortion theory which takes perceptual quality into account. In this paper, we adopt the mathematical definition of perceptual quality recently proposed by Blau & Michaeli (2018), and use it to study the three-way tradeoff between rate, distortion, and perception. We show that restricting the perceptual quality to be high, generally leads to an elevation of the rate-distortion curve, thus necessitating a sacrifice in either rate or distortion. We prove several fundamental properties of this triple-tradeoff, calculate it in closed form for a Bernoulli source, and illustrate it visually on a toy MNIST example.}\n}", "pdf": "http://proceedings.mlr.press/v97/blau19a/blau19a.pdf", "supp": "", "pdf_size": 1346340, "gs_citation": 375, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15392997649995963632&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Technion\u2013Israel Institute of Technology, Haifa, Israel; Technion\u2013Israel Institute of Technology, Haifa, Israel", "aff_domain": "campus.technion.ac.il;ee.technion.ac.il", "email": "campus.technion.ac.il;ee.technion.ac.il", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/blau19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Technion\u2013Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Haifa", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Revisiting precision recall definition for generative modeling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3584", "id": "3584", "author_site": "Loic Simon, Ryan Webster, Julien Rabin", "author": "Loic Simon; Ryan Webster; Julien Rabin", "abstract": "In this article we revisit the definition of Precision-Recall (PR) curves for generative models proposed by (Sajjadi et al., 2018). Rather than providing a scalar for generative quality, PR curves distinguish mode-collapse (poor recall) and bad quality (poor precision). 
We first generalize their formulation to arbitrary measures hence removing any restriction to finite support. We also expose a bridge between PR curves and type I and type II error (a.k.a. false detection and rejection) rates of likelihood ratio classifiers on the task of discriminating between samples of the two distributions. Building upon this new perspective, we propose a novel algorithm to approximate precision-recall curves, that shares some interesting methodological properties with the hypothesis testing technique from (Lopez-Paz & Oquab, 2017). We demonstrate the interest of the proposed formulation over the original approach on controlled multi-modal datasets.", "bibtex": "@InProceedings{pmlr-v97-simon19a,\n title = \t {Revisiting precision recall definition for generative modeling},\n author = {Simon, Loic and Webster, Ryan and Rabin, Julien},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5799--5808},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/simon19a/simon19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/simon19a.html},\n abstract = \t {In this article we revisit the definition of Precision-Recall (PR) curves for generative models proposed by (Sajjadi et al., 2018). Rather than providing a scalar for generative quality, PR curves distinguish mode-collapse (poor recall) and bad quality (poor precision). We first generalize their formulation to arbitrary measures hence removing any restriction to finite support. We also expose a bridge between PR curves and type I and type II error (a.k.a. false detection and rejection) rates of likelihood ratio classifiers on the task of discriminating between samples of the two distributions. Building upon this new perspective, we propose a novel algorithm to approximate precision-recall curves, that shares some interesting methodological properties with the hypothesis testing technique from (Lopez-Paz & Oquab, 2017). 
We demonstrate the interest of the proposed formulation over the original approach on controlled multi-modal datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/simon19a/simon19a.pdf", "supp": "", "pdf_size": 2428893, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1628286305913966761&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "Normandie Univ, UNICAEN, ENSICAEN, CNRS, GREYC; Normandie Univ, UNICAEN, ENSICAEN, CNRS, GREYC; Normandie Univ, UNICAEN, ENSICAEN, CNRS, GREYC", "aff_domain": "ensicaen.fr; ; ", "email": "ensicaen.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/simon19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Normandie University", "aff_unique_dep": "", "aff_unique_url": "https://www.unicaen.fr", "aff_unique_abbr": "UNICAEN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Revisiting the Softmax Bellman Operator: New Benefits and New Perspective", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4170", "id": "4170", "author_site": "Zhao Song, Ron Parr, Lawrence Carin", "author": "Zhao Song; Ron Parr; Lawrence Carin", "abstract": "The impact of softmax on the value function itself in reinforcement learning (RL) is often viewed as problematic because it leads to sub-optimal value (or Q) functions and interferes with the contraction properties of the Bellman operator. Surprisingly, despite these concerns, and independent of its effect on exploration, the softmax Bellman operator when combined with Deep Q-learning, leads to Q-functions with superior policies in practice, even outperforming its double Q-learning counterpart. To better understand how and why this occurs, we revisit theoretical properties of the softmax Bellman operator, and prove that (i) it converges to the standard Bellman operator exponentially fast in the inverse temperature parameter, and (ii) the distance of its Q function from the optimal one can be bounded. These alone do not explain its superior performance, so we also show that the softmax operator can reduce the overestimation error, which may give some insight into why a sub-optimal operator leads to better performance in the presence of value function approximation. A comparison among different Bellman operators is then presented, showing the trade-offs when selecting them.", "bibtex": "@InProceedings{pmlr-v97-song19c,\n title = \t {Revisiting the Softmax {B}ellman Operator: New Benefits and New Perspective},\n author = {Song, Zhao and Parr, Ron and Carin, Lawrence},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5916--5925},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/song19c/song19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/song19c.html},\n abstract = \t {The impact of softmax on the value function itself in reinforcement learning (RL) is often viewed as problematic because it leads to sub-optimal value (or Q) functions and interferes with the contraction properties of the Bellman operator. 
Surprisingly, despite these concerns, and independent of its effect on exploration, the softmax Bellman operator when combined with Deep Q-learning, leads to Q-functions with superior policies in practice, even outperforming its double Q-learning counterpart. To better understand how and why this occurs, we revisit theoretical properties of the softmax Bellman operator, and prove that (i) it converges to the standard Bellman operator exponentially fast in the inverse temperature parameter, and (ii) the distance of its Q function from the optimal one can be bounded. These alone do not explain its superior performance, so we also show that the softmax operator can reduce the overestimation error, which may give some insight into why a sub-optimal operator leads to better performance in the presence of value function approximation. A comparison among different Bellman operators is then presented, showing the trade-offs when selecting them.}\n}", "pdf": "http://proceedings.mlr.press/v97/song19c/song19c.pdf", "supp": "", "pdf_size": 1344106, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12009633864988483522&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Duke University; Duke University; Duke University", "aff_domain": "alumni.duke.edu;cs.duke.edu;duke.edu", "email": "alumni.duke.edu;cs.duke.edu;duke.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/song19c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Duke University", "aff_unique_dep": "", "aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Riemannian adaptive stochastic gradient algorithms on matrix manifolds", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3989", "id": "3989", "author_site": "Hiroyuki Kasai, Pratik Kumar Jawanpuria, Bamdev Mishra", "author": "Hiroyuki Kasai; Pratik Jawanpuria; Bamdev Mishra", "abstract": "Adaptive stochastic gradient algorithms in the Euclidean space have attracted much attention lately. Such explorations on Riemannian manifolds, on the other hand, are relatively new, limited, and challenging. This is because of the intrinsic non-linear structure of the underlying manifold and the absence of a canonical coordinate system. In machine learning applications, however, most manifolds of interest are represented as matrices with notions of row and column subspaces. In addition, the implicit manifold-related constraints may also lie on such subspaces. For example, the Grassmann manifold is the set of column subspaces. To this end, such a rich structure should not be lost by transforming matrices to just a stack of vectors while developing optimization algorithms on manifolds. We propose novel stochastic gradient algorithms for problems on Riemannian matrix manifolds by adapting the row and column subspaces of gradients. Our algorithms are provably convergent and they achieve the convergence rate of order $O(log(T)/sqrt(T))$, where $T$ is the number of iterations. 
Our experiments illustrate that the proposed algorithms outperform existing Riemannian adaptive stochastic algorithms.", "bibtex": "@InProceedings{pmlr-v97-kasai19a,\n title = \t {{R}iemannian adaptive stochastic gradient algorithms on matrix manifolds},\n author = {Kasai, Hiroyuki and Jawanpuria, Pratik and Mishra, Bamdev},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3262--3271},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kasai19a/kasai19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kasai19a.html},\n abstract = \t {Adaptive stochastic gradient algorithms in the Euclidean space have attracted much attention lately. Such explorations on Riemannian manifolds, on the other hand, are relatively new, limited, and challenging. This is because of the intrinsic non-linear structure of the underlying manifold and the absence of a canonical coordinate system. In machine learning applications, however, most manifolds of interest are represented as matrices with notions of row and column subspaces. In addition, the implicit manifold-related constraints may also lie on such subspaces. For example, the Grassmann manifold is the set of column subspaces. To this end, such a rich structure should not be lost by transforming matrices to just a stack of vectors while developing optimization algorithms on manifolds. We propose novel stochastic gradient algorithms for problems on Riemannian matrix manifolds by adapting the row and column subspaces of gradients. Our algorithms are provably convergent and they achieve the convergence rate of order $O(log(T)/sqrt(T))$, where $T$ is the number of iterations. 
Our experiments illustrate that the proposed algorithms outperform existing Riemannian adaptive stochastic algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/kasai19a/kasai19a.pdf", "supp": "", "pdf_size": 1781221, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11814345447980112497&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Graduate School of Informatics and Engineering, The University of Electro-Communications, Tokyo, Japan; Microsoft, India; Microsoft, India", "aff_domain": "is.uec.ac.jp; ; ", "email": "is.uec.ac.jp; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/kasai19a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "University of Electro-Communications;Microsoft", "aff_unique_dep": "Graduate School of Informatics and Engineering;Microsoft Corporation", "aff_unique_url": "https://www.uec.ac.jp;https://www.microsoft.com/en-in", "aff_unique_abbr": "UEC;Microsoft", "aff_campus_unique_index": "0", "aff_campus_unique": "Tokyo;", "aff_country_unique_index": "0;1;1", "aff_country_unique": "Japan;India" }, { "title": "Robust Decision Trees Against Adversarial Examples", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4169", "id": "4169", "author_site": "Hongge Chen, Huan Zhang, Duane Boning, Cho-Jui Hsieh", "author": "Hongge Chen; Huan Zhang; Duane Boning; Cho-Jui Hsieh", "abstract": "Although adversarial examples and model robust-ness have been extensively studied in the context of neural networks, research on this issue in tree-based models and how to make tree-based models robust against adversarial examples is still limited. In this paper, we show that tree-based models are also vulnerable to adversarial examples and develop a novel algorithm to learn robust trees. At its core, our method aims to optimize the performance under the worst-case perturbation of input features, which leads to a max-min saddle point problem. Incorporating this saddle point objective into the decision tree building procedure is non-trivial due to the discrete nature of trees{\u2014}a naive approach to finding the best split according to this saddle point objective will take exponential time. To make our approach practical and scalable, we propose efficient tree building algorithms by approximating the inner minimizer in the saddlepoint problem, and present efficient implementations for classical information gain based trees as well as state-of-the-art tree boosting systems such as XGBoost. 
Experimental results on real world datasets demonstrate that the proposed algorithms can significantly improve the robustness of tree-based models against adversarial examples.", "bibtex": "@InProceedings{pmlr-v97-chen19m,\n title = \t {Robust Decision Trees Against Adversarial Examples},\n author = {Chen, Hongge and Zhang, Huan and Boning, Duane and Hsieh, Cho-Jui},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1122--1131},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19m/chen19m.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19m.html},\n abstract = \t {Although adversarial examples and model robust-ness have been extensively studied in the context of neural networks, research on this issue in tree-based models and how to make tree-based models robust against adversarial examples is still limited. In this paper, we show that tree-based models are also vulnerable to adversarial examples and develop a novel algorithm to learn robust trees. At its core, our method aims to optimize the performance under the worst-case perturbation of input features, which leads to a max-min saddle point problem. Incorporating this saddle point objective into the decision tree building procedure is non-trivial due to the discrete nature of trees{\u2014}a naive approach to finding the best split according to this saddle point objective will take exponential time. To make our approach practical and scalable, we propose efficient tree building algorithms by approximating the inner minimizer in the saddlepoint problem, and present efficient implementations for classical information gain based trees as well as state-of-the-art tree boosting systems such as XGBoost. Experimental results on real world datasets demonstrate that the proposed algorithms can significantly improve the robustness of tree-based models against adversarial examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19m/chen19m.pdf", "supp": "", "pdf_size": 1105409, "gs_citation": 162, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18298482644739407816&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "MIT; UCLA; MIT; UCLA", "aff_domain": "mit.edu;huan-zhang.com; ; ", "email": "mit.edu;huan-zhang.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/chen19m.html", "aff_unique_index": "0;1;0;1", "aff_unique_norm": "Massachusetts Institute of Technology;University of California, Los Angeles", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.ucla.edu", "aff_unique_abbr": "MIT;UCLA", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Robust Estimation of Tree Structured Gaussian Graphical Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3767", "id": "3767", "author_site": "Ashish Katiyar, Jessica Hoffmann, Constantine Caramanis", "author": "Ashish Katiyar; Jessica Hoffmann; Constantine Caramanis", "abstract": "Consider jointly Gaussian random variables whose conditional independence structure is specified by a graphical model. 
If we observe realizations of the variables, we can compute the covariance matrix, and it is well known that the support of the inverse covariance matrix corresponds to the edges of the graphical model. Instead, suppose we only have noisy observations. If the noise at each node is independent, we can compute the sum of the covariance matrix and an unknown diagonal. The inverse of this sum is (in general) dense. We ask: can the original independence structure be recovered? We address this question for tree structured graphical models. We prove that this problem is unidentifiable, but show that this unidentifiability is limited to a small class of candidate trees. We further present additional constraints under which the problem is identifiable. Finally, we provide an O(n^3) algorithm to find this equivalence class of trees.", "bibtex": "@InProceedings{pmlr-v97-katiyar19a,\n title = \t {Robust Estimation of Tree Structured {G}aussian Graphical Models},\n author = {Katiyar, Ashish and Hoffmann, Jessica and Caramanis, Constantine},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3292--3300},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/katiyar19a/katiyar19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/katiyar19a.html},\n abstract = \t {Consider jointly Gaussian random variables whose conditional independence structure is specified by a graphical model. If we observe realizations of the variables, we can compute the covariance matrix, and it is well known that the support of the inverse covariance matrix corresponds to the edges of the graphical model. Instead, suppose we only have noisy observations. If the noise at each node is independent, we can compute the sum of the covariance matrix and an unknown diagonal. The inverse of this sum is (in general) dense. We ask: can the original independence structure be recovered? We address this question for tree structured graphical models. We prove that this problem is unidentifiable, but show that this unidentifiability is limited to a small class of candidate trees. We further present additional constraints under which the problem is identifiable. 
Finally, we provide an O(n^3) algorithm to find this equivalence class of trees.}\n}", "pdf": "http://proceedings.mlr.press/v97/katiyar19a/katiyar19a.pdf", "supp": "", "pdf_size": 1396318, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7239380105804712342&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Electrical and Computer Engineering, The University of Texas at Austin, Texas, USA; Department of Electrical and Computer Engineering, The University of Texas at Austin, Texas, USA; Department of Electrical and Computer Engineering, The University of Texas at Austin, Texas, USA", "aff_domain": "utexas.edu; ;utexas.edu", "email": "utexas.edu; ;utexas.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/katiyar19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Robust Inference via Generative Classifiers for Handling Noisy Labels", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3968", "id": "3968", "author_site": "Kimin Lee, Sukmin Yun, Kibok Lee, Honglak Lee, Bo Li, Jinwoo Shin", "author": "Kimin Lee; Sukmin Yun; Kibok Lee; Honglak Lee; Bo Li; Jinwoo Shin", "abstract": "Large-scale datasets may contain significant proportions of noisy (incorrect) class labels, and it is well-known that modern deep neural networks (DNNs) poorly generalize from such noisy training datasets. To mitigate the issue, we propose a novel inference method, termed Robust Generative classifier (RoG), applicable to any discriminative (e.g., softmax) neural classifier pre-trained on noisy datasets. In particular, we induce a generative classifier on top of hidden feature spaces of the pre-trained DNNs, for obtaining a more robust decision boundary. By estimating the parameters of generative classifier using the minimum covariance determinant estimator, we significantly improve the classification accuracy with neither re-training of the deep model nor changing its architectures. With the assumption of Gaussian distribution for features, we prove that RoG generalizes better than baselines under noisy labels. Finally, we propose the ensemble version of RoG to improve its performance by investigating the layer-wise characteristics of DNNs. 
Our extensive experimental results demonstrate the superiority of RoG given different learning models optimized by several training techniques to handle diverse scenarios of noisy labels.", "bibtex": "@InProceedings{pmlr-v97-lee19f,\n title = \t {Robust Inference via Generative Classifiers for Handling Noisy Labels},\n author = {Lee, Kimin and Yun, Sukmin and Lee, Kibok and Lee, Honglak and Li, Bo and Shin, Jinwoo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3763--3772},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lee19f/lee19f.pdf},\n url = \t {https://proceedings.mlr.press/v97/lee19f.html},\n abstract = \t {Large-scale datasets may contain significant proportions of noisy (incorrect) class labels, and it is well-known that modern deep neural networks (DNNs) poorly generalize from such noisy training datasets. To mitigate the issue, we propose a novel inference method, termed Robust Generative classifier (RoG), applicable to any discriminative (e.g., softmax) neural classifier pre-trained on noisy datasets. In particular, we induce a generative classifier on top of hidden feature spaces of the pre-trained DNNs, for obtaining a more robust decision boundary. By estimating the parameters of generative classifier using the minimum covariance determinant estimator, we significantly improve the classification accuracy with neither re-training of the deep model nor changing its architectures. With the assumption of Gaussian distribution for features, we prove that RoG generalizes better than baselines under noisy labels. Finally, we propose the ensemble version of RoG to improve its performance by investigating the layer-wise characteristics of DNNs. 
Our extensive experimental results demonstrate the superiority of RoG given different learning models optimized by several training techniques to handle diverse scenarios of noisy labels.}\n}", "pdf": "http://proceedings.mlr.press/v97/lee19f/lee19f.pdf", "supp": "", "pdf_size": 867665, "gs_citation": 173, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14567604075585438767&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 12, "aff": "KAIST; KAIST; University of Michigan Ann Arbor; Google Brain; University of Illinois at Urbana Champaign; AItrics", "aff_domain": "kaist.ac.kr; ; ; ; ; ", "email": "kaist.ac.kr; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/lee19f.html", "aff_unique_index": "0;0;1;2;3;4", "aff_unique_norm": "Korea Advanced Institute of Science and Technology;University of Michigan;Google;University of Illinois Urbana-Champaign;AITRICS", "aff_unique_dep": ";;Google Brain;;", "aff_unique_url": "https://www.kaist.ac.kr;https://www.umich.edu;https://brain.google.com;https://illinois.edu;", "aff_unique_abbr": "KAIST;UM;Google Brain;UIUC;", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Ann Arbor;Mountain View;Urbana-Champaign", "aff_country_unique_index": "0;0;1;1;1", "aff_country_unique": "South Korea;United States;" }, { "title": "Robust Influence Maximization for Hyperparametric Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3771", "id": "3771", "author_site": "Dimitrios Kalimeris, Gal Kaplun, Yaron Singer", "author": "Dimitris Kalimeris; Gal Kaplun; Yaron Singer", "abstract": "In this paper we study the problem of robust influence maximization in the independent cascade model under a hyperparametric assumption. In social networks users influence and are influenced by individuals with similar characteristics and as such they are associated with some features. A recent surging research direction in influence maximization focuses on the case where the edge probabilities on the graph are not arbitrary but are generated as a function of the features of the users and a global hyperparameter. We propose a model where the objective is to maximize the worst-case number of influenced users for any possible value of that hyperparameter. We provide theoretical results showing that proper robust solution in our model is NP-hard and an algorithm that achieves improper robust optimization. We make-use of sampling based techniques and of the renowned multiplicative weight updates algorithm. Additionally we validate our method empirically and prove that it outperforms the state-of-the-art robust influence maximization techniques.", "bibtex": "@InProceedings{pmlr-v97-kalimeris19a,\n title = \t {Robust Influence Maximization for Hyperparametric Models},\n author = {Kalimeris, Dimitris and Kaplun, Gal and Singer, Yaron},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3192--3200},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kalimeris19a/kalimeris19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kalimeris19a.html},\n abstract = \t {In this paper we study the problem of robust influence maximization in the independent cascade model under a hyperparametric assumption. 
In social networks users influence and are influenced by individuals with similar characteristics and as such they are associated with some features. A recent surging research direction in influence maximization focuses on the case where the edge probabilities on the graph are not arbitrary but are generated as a function of the features of the users and a global hyperparameter. We propose a model where the objective is to maximize the worst-case number of influenced users for any possible value of that hyperparameter. We provide theoretical results showing that proper robust solution in our model is NP-hard and an algorithm that achieves improper robust optimization. We make-use of sampling based techniques and of the renowned multiplicative weight updates algorithm. Additionally we validate our method empirically and prove that it outperforms the state-of-the-art robust influence maximization techniques.}\n}", "pdf": "http://proceedings.mlr.press/v97/kalimeris19a/kalimeris19a.pdf", "supp": "", "pdf_size": 1922688, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11470028929550404338&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Harvard University, Cambridge, MA, USA; Department of Computer Science, Harvard University, Cambridge, MA, USA; Department of Computer Science, Harvard University, Cambridge, MA, USA", "aff_domain": "g.harvard.edu;g.harvard.edu; ", "email": "g.harvard.edu;g.harvard.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/kalimeris19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Harvard University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.harvard.edu", "aff_unique_abbr": "Harvard", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Robust Learning from Untrusted Sources", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3816", "id": "3816", "author_site": "Nikola Konstantinov, Christoph H. Lampert", "author": "Nikola Konstantinov; Christoph Lampert", "abstract": "Modern machine learning methods often require more data for training than a single expert can provide. Therefore, it has become a standard procedure to collect data from multiple external sources, \\eg via crowdsourcing. Unfortunately, the quality of these sources is not always guaranteed. As further complications, the data might be stored in a distributed way, or might even have to remain private. In this work, we address the question of how to learn robustly in such scenarios. Studying the problem through the lens of statistical learning theory, we derive a procedure that allows for learning from all available sources, yet automatically suppresses irrelevant or corrupted data. 
We show by extensive experiments that our method provides significant improvements over alternative approaches from robust statistics and distributed optimization.", "bibtex": "@InProceedings{pmlr-v97-konstantinov19a,\n title = \t {Robust Learning from Untrusted Sources},\n author = {Konstantinov, Nikola and Lampert, Christoph},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3488--3498},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/konstantinov19a/konstantinov19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/konstantinov19a.html},\n abstract = \t {Modern machine learning methods often require more data for training than a single expert can provide. Therefore, it has become a standard procedure to collect data from multiple external sources, \\eg via crowdsourcing. Unfortunately, the quality of these sources is not always guaranteed. As further complications, the data might be stored in a distributed way, or might even have to remain private. In this work, we address the question of how to learn robustly in such scenarios. Studying the problem through the lens of statistical learning theory, we derive a procedure that allows for learning from all available sources, yet automatically suppresses irrelevant or corrupted data. We show by extensive experiments that our method provides significant improvements over alternative approaches from robust statistics and distributed optimization.}\n}", "pdf": "http://proceedings.mlr.press/v97/konstantinov19a/konstantinov19a.pdf", "supp": "", "pdf_size": 2945597, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4366540847036601471&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Institute of Science and Technology, Klosterneuburg, Austria; Institute of Science and Technology, Klosterneuburg, Austria", "aff_domain": "ist.ac.at; ", "email": "ist.ac.at; ", "github": "https://github.com/NikolaKon1994/Robust-Learning-from-Untrusted-Sources", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/konstantinov19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Institute of Science and Technology Austria", "aff_unique_dep": "", "aff_unique_url": "https://www.ist.ac.at", "aff_unique_abbr": "IST Austria", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Klosterneuburg", "aff_country_unique_index": "0;0", "aff_country_unique": "Austria" }, { "title": "Robustly Disentangled Causal Mechanisms: Validating Deep Representations for Interventional Robustness", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4055", "id": "4055", "author_site": "Raphael Suter, Djordje Miladinovic, Bernhard Sch\u00f6lkopf, Stefan Bauer", "author": "Raphael Suter; Djordje Miladinovic; Bernhard Sch\u00f6lkopf; Stefan Bauer", "abstract": "The ability to learn disentangled representations that split underlying sources of variation in high dimensional, unstructured data is important for data efficient and robust use of neural networks. While various approaches aiming towards this goal have been proposed in recent times, a commonly accepted definition and validation procedure is missing. We provide a causal perspective on representation learning which covers disentanglement and domain shift robustness as special cases. 
Our causal framework allows us to introduce a new metric for the quantitative evaluation of deep latent variable models. We show how this metric can be estimated from labeled observational data and further provide an efficient estimation algorithm that scales linearly in the dataset size.", "bibtex": "@InProceedings{pmlr-v97-suter19a,\n title = \t {Robustly Disentangled Causal Mechanisms: Validating Deep Representations for Interventional Robustness},\n author = {Suter, Raphael and Miladinovic, Djordje and Sch{\\\"o}lkopf, Bernhard and Bauer, Stefan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6056--6065},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/suter19a/suter19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/suter19a.html},\n abstract = \t {The ability to learn disentangled representations that split underlying sources of variation in high dimensional, unstructured data is important for data efficient and robust use of neural networks. While various approaches aiming towards this goal have been proposed in recent times, a commonly accepted definition and validation procedure is missing. We provide a causal perspective on representation learning which covers disentanglement and domain shift robustness as special cases. Our causal framework allows us to introduce a new metric for the quantitative evaluation of deep latent variable models. We show how this metric can be estimated from labeled observational data and further provide an efficient estimation algorithm that scales linearly in the dataset size.}\n}", "pdf": "http://proceedings.mlr.press/v97/suter19a/suter19a.pdf", "supp": "", "pdf_size": 1217409, "gs_citation": 192, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10781334169107419114&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/suter19a.html" }, { "title": "Rotation Invariant Householder Parameterization for Bayesian PCA", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3805", "id": "3805", "author_site": "Rajbir-Singh Nirwan, Nils Bertschinger", "author": "Rajbir Nirwan; Nils Bertschinger", "abstract": "We consider probabilistic PCA and related factor models from a Bayesian perspective. These models are in general not identifiable as the likelihood has a rotational symmetry. This gives rise to complicated posterior distributions with continuous subspaces of equal density and thus hinders efficiency of inference as well as interpretation of obtained parameters. In particular, posterior averages over factor loadings become meaningless and only model predictions are unambiguous. Here, we propose a parameterization based on Householder transformations, which remove the rotational symmetry of the posterior. Furthermore, by relying on results from random matrix theory, we establish the parameter distribution which leaves the model unchanged compared to the original rotationally symmetric formulation. In particular, we avoid the need to compute the Jacobian determinant of the parameter transformation. This allows us to efficiently implement probabilistic PCA in a rotation invariant fashion in any state of the art toolbox. 
Here, we implemented our model in the probabilistic programming language Stan and illustrate it on several examples.", "bibtex": "@InProceedings{pmlr-v97-nirwan19a,\n title = \t {Rotation Invariant Householder Parameterization for {B}ayesian {PCA}},\n author = {Nirwan, Rajbir and Bertschinger, Nils},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4820--4828},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nirwan19a/nirwan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nirwan19a.html},\n abstract = \t {We consider probabilistic PCA and related factor models from a Bayesian perspective. These models are in general not identifiable as the likelihood has a rotational symmetry. This gives rise to complicated posterior distributions with continuous subspaces of equal density and thus hinders efficiency of inference as well as interpretation of obtained parameters. In particular, posterior averages over factor loadings become meaningless and only model predictions are unambiguous. Here, we propose a parameterization based on Householder transformations, which remove the rotational symmetry of the posterior. Furthermore, by relying on results from random matrix theory, we establish the parameter distribution which leaves the model unchanged compared to the original rotationally symmetric formulation. In particular, we avoid the need to compute the Jacobian determinant of the parameter transformation. This allows us to efficiently implement probabilistic PCA in a rotation invariant fashion in any state of the art toolbox. Here, we implemented our model in the probabilistic programming language Stan and illustrate it on several examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/nirwan19a/nirwan19a.pdf", "supp": "", "pdf_size": 4581273, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6089302904183911614&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Goethe University, Frankfurt, Germany+Frankfurt Institute for Advanced Studies, Frankfurt, Germany; Department of Computer Science, Goethe University, Frankfurt, Germany+Frankfurt Institute for Advanced Studies, Frankfurt, Germany", "aff_domain": "fias.uni-frankfurt.de;fias.uni-frankfurt.de", "email": "fias.uni-frankfurt.de;fias.uni-frankfurt.de", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/nirwan19a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Goethe University;Frankfurt Institute for Advanced Studies", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.uni-frankfurt.de;https://www.fias.uni-frankfurt.de/", "aff_unique_abbr": "GU;FIAS", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Frankfurt", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "Germany" }, { "title": "SAGA with Arbitrary Sampling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3969", "id": "3969", "author_site": "Xun Qian, Zheng Qu, Peter Richtarik", "author": "Xun Qian; Zheng Qu; Peter Richt\u00e1rik", "abstract": "We study the problem of minimizing the average of a very large number of smooth functions, which is of key importance in training supervised learning models. 
One of the most celebrated methods in this context is the SAGA algorithm of Defazio et al. (2014). Despite years of research on the topic, a general-purpose version of SAGA\u2014one that would include arbitrary importance sampling and minibatching schemes\u2014does not exist. We remedy this situation and propose a general and flexible variant of SAGA following the arbitrary sampling paradigm. We perform an iteration complexity analysis of the method, largely possible due to the construction of new stochastic Lyapunov functions. We establish linear convergence rates in the smooth and strongly convex regime, and under certain error bound conditions also in a regime without strong convexity. Our rates match those of the primal-dual method Quartz (Qu et al., 2015) for which an arbitrary sampling analysis is available, which makes a significant step towards closing the gap in our understanding of complexity of primal and dual methods for finite sum problems. Finally, we show through experiments that specific variants of our general SAGA method can perform better in practice than other competing methods.", "bibtex": "@InProceedings{pmlr-v97-qian19a,\n title = \t {{SAGA} with Arbitrary Sampling},\n author = {Qian, Xun and Qu, Zheng and Richt{\\'a}rik, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5190--5199},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/qian19a/qian19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/qian19a.html},\n abstract = \t {We study the problem of minimizing the average of a very large number of smooth functions, which is of key importance in training supervised learning models. One of the most celebrated methods in this context is the SAGA algorithm of Defazio et al. (2014). Despite years of research on the topic, a general-purpose version of SAGA\u2014one that would include arbitrary importance sampling and minibatching schemes\u2014does not exist. We remedy this situation and propose a general and flexible variant of SAGA following the arbitrary sampling paradigm. We perform an iteration complexity analysis of the method, largely possible due to the construction of new stochastic Lyapunov functions. We establish linear convergence rates in the smooth and strongly convex regime, and under certain error bound conditions also in a regime without strong convexity. Our rates match those of the primal-dual method Quartz (Qu et al., 2015) for which an arbitrary sampling analysis is available, which makes a significant step towards closing the gap in our understanding of complexity of primal and dual methods for finite sum problems. 
Finally, we show through experiments that specific variants of our general SAGA method can perform better in practice than other competing methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/qian19a/qian19a.pdf", "supp": "", "pdf_size": 406179, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9204824938139784202&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": "King Abdullah University of Science and Technology, Thuwal, Kingdom of Saudi Arabia+Moscow Institute of Physics and Technology, Dolgoprudny, Moscow Region, Russia; University of Hong Kong, Hong Kong; King Abdullah University of Science and Technology, Thuwal, Kingdom of Saudi Arabia", "aff_domain": "kaust.edu.sa;hku.hk;kaust.edu.sa", "email": "kaust.edu.sa;hku.hk;kaust.edu.sa", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/qian19a.html", "aff_unique_index": "0+1;2;0", "aff_unique_norm": "King Abdullah University of Science and Technology;Moscow Institute of Physics and Technology;University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kast.kau.edu.sa;https://www.mipt.ru;https://www.hku.hk", "aff_unique_abbr": "KAUST;MIPT;HKU", "aff_campus_unique_index": "0+1;2;0", "aff_campus_unique": "Thuwal;Dolgoprudny;Hong Kong SAR", "aff_country_unique_index": "0+1;2;0", "aff_country_unique": "Kingdom of Saudi Arabia;Russian Federation;China" }, { "title": "SATNet: Bridging deep learning and logical reasoning using a differentiable satisfiability solver", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3947", "id": "3947", "author_site": "Po-Wei Wang, Priya Donti, Bryan Wilder, Zico Kolter", "author": "Po-Wei Wang; Priya Donti; Bryan Wilder; Zico Kolter", "abstract": "Integrating logical reasoning within deep learning architectures has been a major goal of modern AI systems. In this paper, we propose a new direction toward this goal by introducing a differentiable (smoothed) maximum satisfiability (MAXSAT) solver that can be integrated into the loop of larger deep learning systems. Our (approximate) solver is based upon a fast coordinate descent approach to solving the semidefinite program (SDP) associated with the MAXSAT problem. We show how to analytically differentiate through the solution to this SDP and efficiently solve the associated backward pass. We demonstrate that by integrating this solver into end-to-end learning systems, we can learn the logical structure of challenging problems in a minimally supervised fashion. In particular, we show that we can learn the parity function using single-bit supervision (a traditionally hard task for deep networks) and learn how to play 9x9 Sudoku solely from examples. We also solve a \u201cvisual Sudoku\u201d problem that maps images of Sudoku puzzles to their associated logical solutions by combining our MAXSAT solver with a traditional convolutional architecture. 
Our approach thus shows promise in integrating logical structures within deep learning.", "bibtex": "@InProceedings{pmlr-v97-wang19e,\n title = \t {{SATN}et: Bridging deep learning and logical reasoning using a differentiable satisfiability solver},\n author = {Wang, Po-Wei and Donti, Priya and Wilder, Bryan and Kolter, Zico},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6545--6554},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19e/wang19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19e.html},\n abstract = \t {Integrating logical reasoning within deep learning architectures has been a major goal of modern AI systems. In this paper, we propose a new direction toward this goal by introducing a differentiable (smoothed) maximum satisfiability (MAXSAT) solver that can be integrated into the loop of larger deep learning systems. Our (approximate) solver is based upon a fast coordinate descent approach to solving the semidefinite program (SDP) associated with the MAXSAT problem. We show how to analytically differentiate through the solution to this SDP and efficiently solve the associated backward pass. We demonstrate that by integrating this solver into end-to-end learning systems, we can learn the logical structure of challenging problems in a minimally supervised fashion. In particular, we show that we can learn the parity function using single-bit supervision (a traditionally hard task for deep networks) and learn how to play 9x9 Sudoku solely from examples. We also solve a \u201cvisual Sudoku\u201d problem that maps images of Sudoku puzzles to their associated logical solutions by combining our MAXSAT solver with a traditional convolutional architecture. 
Our approach thus shows promise in integrating logical structures within deep learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19e/wang19e.pdf", "supp": "", "pdf_size": 1493800, "gs_citation": 335, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6370111040782984805&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "School of Computer Science, Carnegie Mellon University; School of Computer Science, Carnegie Mellon University + Department of Engineering & Public Policy, Carnegie Mellon University; Department of Computer Science, University of Southern California; School of Computer Science, Carnegie Mellon University + Bosch Center for Arti\ufb01cial Intelligence", "aff_domain": "cs.cmu.edu;cmu.edu;usc.edu;cs.cmu.edu", "email": "cs.cmu.edu;cmu.edu;usc.edu;cs.cmu.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/wang19e.html", "aff_unique_index": "0;0+0;1;0+2", "aff_unique_norm": "Carnegie Mellon University;University of Southern California;Bosch Center for Arti\ufb01cial Intelligence", "aff_unique_dep": "School of Computer Science;Department of Computer Science;Artificial Intelligence", "aff_unique_url": "https://www.cmu.edu;https://www.usc.edu;https://www.bosch-ai.com", "aff_unique_abbr": "CMU;USC;BCAI", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Pittsburgh;;Los Angeles", "aff_country_unique_index": "0;0+0;0;0+1", "aff_country_unique": "United States;Germany" }, { "title": "SELFIE: Refurbishing Unclean Samples for Robust Deep Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3582", "id": "3582", "author_site": "Hwanjun Song, Minseok Kim, Jae-Gil Lee", "author": "Hwanjun Song; Minseok Kim; Jae-Gil Lee", "abstract": "Owing to the extremely high expressive power of deep neural networks, their side effect is to totally memorize training data even when the labels are extremely noisy. To overcome overfitting on the noisy labels, we propose a novel robust training method called SELFIE. Our key idea is to selectively refurbish and exploit unclean samples that can be corrected with high precision, thereby gradually increasing the number of available training samples. Taking advantage of this design, SELFIE effectively prevents the risk of noise accumulation from the false correction and fully exploits the training data. To validate the superiority of SELFIE, we conducted extensive experimentation using four real-world or synthetic data sets. The result showed that SELFIE remarkably improved absolute test error compared with two state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v97-song19b,\n title = \t {{SELFIE}: Refurbishing Unclean Samples for Robust Deep Learning},\n author = {Song, Hwanjun and Kim, Minseok and Lee, Jae-Gil},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5907--5915},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/song19b/song19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/song19b.html},\n abstract = \t {Owing to the extremely high expressive power of deep neural networks, their side effect is to totally memorize training data even when the labels are extremely noisy. To overcome overfitting on the noisy labels, we propose a novel robust training method called SELFIE. 
Our key idea is to selectively refurbish and exploit unclean samples that can be corrected with high precision, thereby gradually increasing the number of available training samples. Taking advantage of this design, SELFIE effectively prevents the risk of noise accumulation from the false correction and fully exploits the training data. To validate the superiority of SELFIE, we conducted extensive experimentation using four real-world or synthetic data sets. The result showed that SELFIE remarkably improved absolute test error compared with two state-of-the-art methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/song19b/song19b.pdf", "supp": "", "pdf_size": 449051, "gs_citation": 522, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15529381766683515507&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Graduate School of Knowledge Service Engineering, KAIST, Daejeon, Korea; Graduate School of Knowledge Service Engineering, KAIST, Daejeon, Korea; Graduate School of Knowledge Service Engineering, KAIST, Daejeon, Korea", "aff_domain": "kaist.ac.kr; ; ", "email": "kaist.ac.kr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/song19b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "KAIST", "aff_unique_dep": "Graduate School of Knowledge Service Engineering", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Daejeon", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "SGD without Replacement: Sharper Rates for General Smooth Convex Functions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3814", "id": "3814", "author_site": "Dheeraj Nagaraj, Prateek Jain, Praneeth Netrapalli", "author": "Dheeraj Nagaraj; Prateek Jain; Praneeth Netrapalli", "abstract": "We study stochastic gradient descent", "bibtex": "@InProceedings{pmlr-v97-nagaraj19a,\n title = \t {{SGD} without Replacement: Sharper Rates for General Smooth Convex Functions},\n author = {Nagaraj, Dheeraj and Jain, Prateek and Netrapalli, Praneeth},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4703--4711},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nagaraj19a/nagaraj19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nagaraj19a.html},\n abstract = \t {We study stochastic gradient descent", "pdf": "http://proceedings.mlr.press/v97/nagaraj19a/nagaraj19a.pdf", "supp": "", "pdf_size": 403686, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10242898150506844810&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Massachusetts Institute of Technology, Cambridge, Massachusetts, USA; Microsoft Research, Bengaluru, Karnataka, India; Microsoft Research, Bengaluru, Karnataka, India", "aff_domain": "mit.edu;microsoft.com;microsoft.com", "email": "mit.edu;microsoft.com;microsoft.com", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/nagaraj19a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": 
"https://www.mit.edu;https://www.microsoft.com/en-us/research/group/microsoft-research-india", "aff_unique_abbr": "MIT;MSR India", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Cambridge;Bengaluru", "aff_country_unique_index": "0;1;1", "aff_country_unique": "United States;India" }, { "title": "SGD: General Analysis and Improved Rates", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4155", "id": "4155", "author_site": "Robert Gower, Nicolas Loizou, Xun Qian, Alibek Sailanbayev, Egor Shulgin, Peter Richtarik", "author": "Robert Mansel Gower; Nicolas Loizou; Xun Qian; Alibek Sailanbayev; Egor Shulgin; Peter Richt\u00e1rik", "abstract": "We propose a general yet simple theorem describing the convergence of SGD under the arbitrary sampling paradigm. Our theorem describes the convergence of an infinite array of variants of SGD, each of which is associated with a specific probability law governing the data selection rule used to form minibatches. This is the first time such an analysis is performed, and most of our variants of SGD were never explicitly considered in the literature before. Our analysis relies on the recently introduced notion of expected smoothness and does not rely on a uniform bound on the variance of the stochastic gradients. By specializing our theorem to different mini-batching strategies, such as sampling with replacement and independent sampling, we derive exact expressions for the stepsize as a function of the mini-batch size. With this we can also determine the mini-batch size that optimizes the total complexity, and show explicitly that as the variance of the stochastic gradient evaluated at the minimum grows, so does the optimal mini-batch size. For zero variance, the optimal mini-batch size is one. Moreover, we prove insightful stepsize-switching rules which describe when one should switch from a constant to a decreasing stepsize regime.", "bibtex": "@InProceedings{pmlr-v97-qian19b,\n title = \t {{SGD}: General Analysis and Improved Rates},\n author = {Gower, Robert Mansel and Loizou, Nicolas and Qian, Xun and Sailanbayev, Alibek and Shulgin, Egor and Richt{\\'a}rik, Peter},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5200--5209},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/qian19b/qian19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/qian19b.html},\n abstract = \t {We propose a general yet simple theorem describing the convergence of SGD under the arbitrary sampling paradigm. Our theorem describes the convergence of an infinite array of variants of SGD, each of which is associated with a specific probability law governing the data selection rule used to form minibatches. This is the first time such an analysis is performed, and most of our variants of SGD were never explicitly considered in the literature before. Our analysis relies on the recently introduced notion of expected smoothness and does not rely on a uniform bound on the variance of the stochastic gradients. By specializing our theorem to different mini-batching strategies, such as sampling with replacement and independent sampling, we derive exact expressions for the stepsize as a function of the mini-batch size. 
With this we can also determine the mini-batch size that optimizes the total complexity, and show explicitly that as the variance of the stochastic gradient evaluated at the minimum grows, so does the optimal mini-batch size. For zero variance, the optimal mini-batch size is one. Moreover, we prove insightful stepsize-switching rules which describe when one should switch from a constant to a decreasing stepsize regime.}\n}", "pdf": "http://proceedings.mlr.press/v97/qian19b/qian19b.pdf", "supp": "", "pdf_size": 919988, "gs_citation": 557, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12515131175955494573&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 16, "aff": "T\u00e9l\u00e9com ParisTech, LTCI, Universit\u00e9 Paris-Saclay, France; University of Edinburgh, United Kingdom; King Abdullah University of Science and Technology, Kingdom of Saudi Arabia; King Abdullah University of Science and Technology, Kingdom of Saudi Arabia; Moscow Institute of Physics and Technology, Russian Federation; King Abdullah University of Science and Technology, Kingdom of Saudi Arabia + University of Edinburgh, United Kingdom + Moscow Institute of Physics and Technology, Russian Federation", "aff_domain": "gmail.com; ; ; ; ; ", "email": "gmail.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/qian19b.html", "aff_unique_index": "0;1;2;2;3;2+1+3", "aff_unique_norm": "T\u00e9l\u00e9com ParisTech;University of Edinburgh;King Abdullah University of Science and Technology;Moscow Institute of Physics and Technology", "aff_unique_dep": "LTCI;;;", "aff_unique_url": "https://www.telecom-paristech.fr;https://www.ed.ac.uk;https://www.kaust.edu.sa;https://www.mipt.ru/en", "aff_unique_abbr": "TP;Edinburgh;KAUST;MIPT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;2;3;2+1+3", "aff_country_unique": "France;United Kingdom;Saudi Arabia;Russian Federation" }, { "title": "SOLAR: Deep Structured Representations for Model-Based Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4307", "id": "4307", "author_site": "Marvin Zhang, Sharad Vikram, Laura Smith, Pieter Abbeel, Matthew Johnson, Sergey Levine", "author": "Marvin Zhang; Sharad Vikram; Laura Smith; Pieter Abbeel; Matthew Johnson; Sergey Levine", "abstract": "Model-based reinforcement learning (RL) has proven to be a data efficient approach for learning control tasks but is difficult to utilize in domains with complex observations such as images. In this paper, we present a method for learning representations that are suitable for iterative model-based policy improvement, even when the underlying dynamical system has complex dynamics and image observations, in that these representations are optimized for inferring simple dynamics and cost models given data from the current policy. This enables a model-based RL method based on the linear-quadratic regulator (LQR) to be used for systems with image observations. We evaluate our approach on a range of robotics tasks, including manipulation with a real-world robotic arm directly from images. 
We find that our method produces substantially better final performance than other model-based RL methods while being significantly more efficient than model-free RL.", "bibtex": "@InProceedings{pmlr-v97-zhang19m,\n title = \t {{SOLAR}: Deep Structured Representations for Model-Based Reinforcement Learning},\n author = {Zhang, Marvin and Vikram, Sharad and Smith, Laura and Abbeel, Pieter and Johnson, Matthew and Levine, Sergey},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7444--7453},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19m/zhang19m.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19m.html},\n abstract = \t {Model-based reinforcement learning (RL) has proven to be a data efficient approach for learning control tasks but is difficult to utilize in domains with complex observations such as images. In this paper, we present a method for learning representations that are suitable for iterative model-based policy improvement, even when the underlying dynamical system has complex dynamics and image observations, in that these representations are optimized for inferring simple dynamics and cost models given data from the current policy. This enables a model-based RL method based on the linear-quadratic regulator (LQR) to be used for systems with image observations. We evaluate our approach on a range of robotics tasks, including manipulation with a real-world robotic arm directly from images. We find that our method produces substantially better final performance than other model-based RL methods while being significantly more efficient than model-free RL.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19m/zhang19m.pdf", "supp": "", "pdf_size": 3665683, "gs_citation": 321, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3160286257401504607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "University of California, Berkeley; University of California, San Diego; University of California, Berkeley; University of California, Berkeley; Google; University of California, Berkeley", "aff_domain": "eecs.berkeley.edu; ; ; ; ; ", "email": "eecs.berkeley.edu; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/zhang19m.html", "aff_unique_index": "0;1;0;0;2;0", "aff_unique_norm": "University of California, Berkeley;University of California, San Diego;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.berkeley.edu;https://www.ucsd.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;UCSD;Google", "aff_campus_unique_index": "0;1;0;0;2;0", "aff_campus_unique": "Berkeley;San Diego;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "SWALP : Stochastic Weight Averaging in Low Precision Training", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3602", "id": "3602", "author_site": "Guandao Yang, Tianyi Zhang, Polina Kirichenko, Junwen Bai, Andrew Wilson, Christopher De Sa", "author": "Guandao Yang; Tianyi Zhang; Polina Kirichenko; Junwen Bai; Andrew Gordon Wilson; Chris De Sa", "abstract": "Low precision operations can provide scalability, memory savings, portability, and energy efficiency. 
This paper proposes SWALP, an approach to low precision training that averages low-precision SGD iterates with a modified learning rate schedule. SWALP is easy to implement and can match the performance of full-precision SGD even with all numbers quantized down to 8 bits, including the gradient accumulators. Additionally, we show that SWALP converges arbitrarily close to the optimal solution for quadratic objectives, and to a noise ball asymptotically smaller than low precision SGD in strongly convex settings.", "bibtex": "@InProceedings{pmlr-v97-yang19d,\n title = \t {{SWALP} : Stochastic Weight Averaging in Low Precision Training},\n author = {Yang, Guandao and Zhang, Tianyi and Kirichenko, Polina and Bai, Junwen and Wilson, Andrew Gordon and De Sa, Chris},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7015--7024},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yang19d/yang19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/yang19d.html},\n abstract = \t {Low precision operations can provide scalability, memory savings, portability, and energy efficiency. This paper proposes SWALP, an approach to low precision training that averages low-precision SGD iterates with a modified learning rate schedule. SWALP is easy to implement and can match the performance of full-precision SGD even with all numbers quantized down to 8 bits, including the gradient accumulators. Additionally, we show that SWALP converges arbitrarily close to the optimal solution for quadratic objectives, and to a noise ball asymptotically smaller than low precision SGD in strongly convex settings.}\n}", "pdf": "http://proceedings.mlr.press/v97/yang19d/yang19d.pdf", "supp": "", "pdf_size": 808993, "gs_citation": 121, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2750552851394751845&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Cornell University; Cornell University; Cornell University; Cornell University; Cornell University; Cornell University", "aff_domain": "cornell.edu;cornell.edu; ; ; ; ", "email": "cornell.edu;cornell.edu; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/yang19d.html", "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Safe Grid Search with Optimal Complexity", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3812", "id": "3812", "author_site": "Eugene Ndiaye, Tam Le, Olivier Fercoq, Joseph Salmon, Ichiro Takeuchi", "author": "Eugene Ndiaye; Tam Le; Olivier Fercoq; Joseph Salmon; Ichiro Takeuchi", "abstract": "Popular machine learning estimators involve regularization parameters that can be challenging to tune, and standard strategies rely on grid search for this task. 
In this paper, we revisit the techniques of approximating the regularization path up to predefined tolerance $\\epsilon$ in a unified framework and show that its complexity is $O(1/\\sqrt[d]{\\epsilon})$ for uniformly convex loss of order $d \\geq 2$ and $O(1/\\sqrt{\\epsilon})$ for Generalized Self-Concordant functions. This framework encompasses least-squares but also logistic regression, a case that as far as we know was not handled as precisely in previous works. We leverage our technique to provide refined bounds on the validation error as well as a practical algorithm for hyperparameter tuning. The latter has global convergence guarantee when targeting a prescribed accuracy on the validation set. Last but not least, our approach helps relieving the practitioner from the (often neglected) task of selecting a stopping criterion when optimizing over the training set: our method automatically calibrates this criterion based on the targeted accuracy on the validation set.", "bibtex": "@InProceedings{pmlr-v97-ndiaye19a,\n title = \t {Safe Grid Search with Optimal Complexity},\n author = {Ndiaye, Eugene and Le, Tam and Fercoq, Olivier and Salmon, Joseph and Takeuchi, Ichiro},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4771--4780},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ndiaye19a/ndiaye19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ndiaye19a.html},\n abstract = \t {Popular machine learning estimators involve regularization parameters that can be challenging to tune, and standard strategies rely on grid search for this task. In this paper, we revisit the techniques of approximating the regularization path up to predefined tolerance $\\epsilon$ in a unified framework and show that its complexity is $O(1/\\sqrt[d]{\\epsilon})$ for uniformly convex loss of order $d \\geq 2$ and $O(1/\\sqrt{\\epsilon})$ for Generalized Self-Concordant functions. This framework encompasses least-squares but also logistic regression, a case that as far as we know was not handled as precisely in previous works. We leverage our technique to provide refined bounds on the validation error as well as a practical algorithm for hyperparameter tuning. The latter has global convergence guarantee when targeting a prescribed accuracy on the validation set. 
Last but not least, our approach helps relieving the practitioner from the (often neglected) task of selecting a stopping criterion when optimizing over the training set: our method automatically calibrates this criterion based on the targeted accuracy on the validation set.}\n}", "pdf": "http://proceedings.mlr.press/v97/ndiaye19a/ndiaye19a.pdf", "supp": "", "pdf_size": 561346, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1378644094816844028&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/ndiaye19a.html" }, { "title": "Safe Policy Improvement with Baseline Bootstrapping", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3681", "id": "3681", "author_site": "Romain Laroche, Paul TRICHELAIR, Remi Tachet des Combes", "author": "Romain Laroche; Paul Trichelair; Remi Tachet Des Combes", "abstract": "This paper considers Safe Policy Improvement (SPI) in Batch Reinforcement Learning (Batch RL): from a fixed dataset and without direct access to the true environment, train a policy that is guaranteed to perform at least as well as the baseline policy used to collect the data. \t Our approach, called SPI with Baseline Bootstrapping (SPIBB), is inspired by the knows-what-it-knows paradigm: it bootstraps the trained policy with the baseline when the uncertainty is high. \t Our first algorithm, $\\Pi_b$-SPIBB, comes with SPI theoretical guarantees. \t We also implement a variant, $\\Pi_{\\leq b}$-SPIBB, that is even more efficient in practice. \t We apply our algorithms to a motivational stochastic gridworld domain and further demonstrate on randomly generated MDPs the superiority of SPIBB with respect to existing algorithms, not only in safety but also in mean performance. \t Finally, we implement a model-free version of SPIBB and show its benefits on a navigation task with deep RL implementation called SPIBB-DQN, which is, to the best of our knowledge, the first RL algorithm relying on a neural network representation able to train efficiently and reliably from batch data, without any interaction with the environment.", "bibtex": "@InProceedings{pmlr-v97-laroche19a,\n title = \t {Safe Policy Improvement with Baseline Bootstrapping},\n author = {Laroche, Romain and Trichelair, Paul and Combes, Remi Tachet Des},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3652--3661},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/laroche19a/laroche19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/laroche19a.html},\n abstract = \t {This paper considers Safe Policy Improvement (SPI) in Batch Reinforcement Learning (Batch RL): from a fixed dataset and without direct access to the true environment, train a policy that is guaranteed to perform at least as well as the baseline policy used to collect the data. \t Our approach, called SPI with Baseline Bootstrapping (SPIBB), is inspired by the knows-what-it-knows paradigm: it bootstraps the trained policy with the baseline when the uncertainty is high. \t Our first algorithm, $\\Pi_b$-SPIBB, comes with SPI theoretical guarantees. 
\t We also implement a variant, $\\Pi_{\\leq b}$-SPIBB, that is even more efficient in practice. \t We apply our algorithms to a motivational stochastic gridworld domain and further demonstrate on randomly generated MDPs the superiority of SPIBB with respect to existing algorithms, not only in safety but also in mean performance. \t Finally, we implement a model-free version of SPIBB and show its benefits on a navigation task with deep RL implementation called SPIBB-DQN, which is, to the best of our knowledge, the first RL algorithm relying on a neural network representation able to train efficiently and reliably from batch data, without any interaction with the environment.}\n}", "pdf": "http://proceedings.mlr.press/v97/laroche19a/laroche19a.pdf", "supp": "", "pdf_size": 842092, "gs_citation": 252, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5058123810223044287&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Microsoft Research, Montr\u00e9al, Canada; Microsoft Research, Montr\u00e9al, Canada; Microsoft Research, Montr\u00e9al, Canada", "aff_domain": "microsoft.com; ; ", "email": "microsoft.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/laroche19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Microsoft", "aff_unique_dep": "Microsoft Research", "aff_unique_url": "https://www.microsoft.com/en-us/research", "aff_unique_abbr": "MSR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Montr\u00e9al", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "title": "Same, Same But Different: Recovering Neural Network Quantization Error Through Weight Factorization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4099", "id": "4099", "author_site": "Eldad Meller, Alexander Finkelstein, Uri Almog, Mark Grobman", "author": "Eldad Meller; Alexander Finkelstein; Uri Almog; Mark Grobman", "abstract": "Quantization of neural networks has become common practice, driven by the need for efficient implementations of deep neural networks on embedded devices. In this paper, we exploit an oft-overlooked degree of freedom in most networks - for a given layer, individual output channels can be scaled by any factor provided that the corresponding weights of the next layer are inversely scaled. Therefore, a given network has many factorizations which change the weights of the network without changing its function. We present a conceptually simple and easy to implement method that uses this property and show that proper factorizations significantly decrease the degradation caused by quantization. We show improvement on a wide variety of networks and achieve state-of-the-art degradation results for MobileNets. 
While our focus is on quantization, this type of factorization is applicable to other domains such as network-pruning, neural nets regularization and network interpretability.", "bibtex": "@InProceedings{pmlr-v97-meller19a,\n title = \t {Same, Same But Different: Recovering Neural Network Quantization Error Through Weight Factorization},\n author = {Meller, Eldad and Finkelstein, Alexander and Almog, Uri and Grobman, Mark},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4486--4495},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/meller19a/meller19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/meller19a.html},\n abstract = \t {Quantization of neural networks has become common practice, driven by the need for efficient implementations of deep neural networks on embedded devices. In this paper, we exploit an oft-overlooked degree of freedom in most networks - for a given layer, individual output channels can be scaled by any factor provided that the corresponding weights of the next layer are inversely scaled. Therefore, a given network has many factorizations which change the weights of the network without changing its function. We present a conceptually simple and easy to implement method that uses this property and show that proper factorizations significantly decrease the degradation caused by quantization. We show improvement on a wide variety of networks and achieve state-of-the-art degradation results for MobileNets. While our focus is on quantization, this type of factorization is applicable to other domains such as network-pruning, neural nets regularization and network interpretability.}\n}", "pdf": "http://proceedings.mlr.press/v97/meller19a/meller19a.pdf", "supp": "", "pdf_size": 607280, "gs_citation": 122, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9808269715917629964&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "Hailo technologies, Tel Aviv, Israel; Hailo technologies, Tel Aviv, Israel; Hailo technologies, Tel Aviv, Israel; Hailo technologies, Tel Aviv, Israel", "aff_domain": "hailotech.com; ; ;hailotech.com", "email": "hailotech.com; ; ;hailotech.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/meller19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Hailo Technologies", "aff_unique_dep": "", "aff_unique_url": "https://www.hailo.ai", "aff_unique_abbr": "Hailo", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Tel Aviv", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Israel" }, { "title": "Sample-Optimal Parametric Q-Learning Using Linearly Additive Features", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3991", "id": "3991", "author_site": "Lin Yang, Mengdi Wang", "author": "Lin Yang; Mengdi Wang", "abstract": "Consider a Markov decision process (MDP) that admits a set of state-action features, which can linearly express the process\u2019s probabilistic transition model. We propose a parametric Q-learning algorithm that finds an approximate-optimal policy using a sample size proportional to the feature dimension $K$ and invariant with respect to the size of the state space. 
To further improve its sample efficiency, we exploit the monotonicity property and intrinsic noise structure of the Bellman operator, provided the existence of anchor state-actions that imply implicit non-negativity in the feature space. We augment the algorithm using techniques of variance reduction, monotonicity preservation, and confidence bounds. It is proved to find a policy which is $\\epsilon$-optimal from any initial state with high probability using $\\widetilde{O}(K/\\epsilon^2(1-\\gamma)^3)$ sample transitions for arbitrarily large-scale MDP with a discount factor $\\gamma\\in(0,1)$. A matching information-theoretical lower bound is proved, confirming the sample optimality of the proposed method with respect to all parameters (up to polylog factors).", "bibtex": "@InProceedings{pmlr-v97-yang19b,\n title = \t {Sample-Optimal Parametric Q-Learning Using Linearly Additive Features},\n author = {Yang, Lin and Wang, Mengdi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6995--7004},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yang19b/yang19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/yang19b.html},\n abstract = \t {Consider a Markov decision process (MDP) that admits a set of state-action features, which can linearly express the process\u2019s probabilistic transition model. We propose a parametric Q-learning algorithm that finds an approximate-optimal policy using a sample size proportional to the feature dimension $K$ and invariant with respect to the size of the state space. To further improve its sample efficiency, we exploit the monotonicity property and intrinsic noise structure of the Bellman operator, provided the existence of anchor state-actions that imply implicit non-negativity in the feature space. We augment the algorithm using techniques of variance reduction, monotonicity preservation, and confidence bounds. It is proved to find a policy which is $\\epsilon$-optimal from any initial state with high probability using $\\widetilde{O}(K/\\epsilon^2(1-\\gamma)^3)$ sample transitions for arbitrarily large-scale MDP with a discount factor $\\gamma\\in(0,1)$. 
A matching information-theoretical lower bound is proved, confirming the sample optimality of the proposed method with respect to all parameters (up to polylog factors).}\n}", "pdf": "http://proceedings.mlr.press/v97/yang19b/yang19b.pdf", "supp": "", "pdf_size": 351458, "gs_citation": 377, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5925912571515825669&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 9, "aff": "Department of Operations Research and Financial Engineering, Princeton University; Department of Operations Research and Financial Engineering, Princeton University", "aff_domain": "princeton.edu;princeton.edu", "email": "princeton.edu;princeton.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/yang19b.html", "aff_unique_index": "0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "Department of Operations Research and Financial Engineering", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Scalable Fair Clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3962", "id": "3962", "author_site": "Arturs Backurs, Piotr Indyk, Krzysztof Onak, Baruch Schieber, Ali Vakilian, Tal Wagner", "author": "Arturs Backurs; Piotr Indyk; Krzysztof Onak; Baruch Schieber; Ali Vakilian; Tal Wagner", "abstract": "We study the fair variant of the classic k-median problem introduced by (Chierichetti et al., NeurIPS 2017) in which the points are colored, and the goal is to minimize the same average distance objective as in the standard $k$-median problem while ensuring that all clusters have an \u201capproximately equal\u201d number of points of each color. (Chierichetti et al., NeurIPS 2017) proposed a two-phase algorithm for fair $k$-clustering. In the first step, the pointset is partitioned into subsets called fairlets that satisfy the fairness requirement and approximately preserve the k-median objective. In the second step, fairlets are merged into k clusters by one of the existing k-median algorithms. The running time of this algorithm is dominated by the first step, which takes super-quadratic time. In this paper, we present a practical approximate fairlet decomposition algorithm that runs in nearly linear time.", "bibtex": "@InProceedings{pmlr-v97-backurs19a,\n title = \t {Scalable Fair Clustering},\n author = {Backurs, Arturs and Indyk, Piotr and Onak, Krzysztof and Schieber, Baruch and Vakilian, Ali and Wagner, Tal},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {405--413},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/backurs19a/backurs19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/backurs19a.html},\n abstract = \t {We study the fair variant of the classic k-median problem introduced by (Chierichetti et al., NeurIPS 2017) in which the points are colored, and the goal is to minimize the same average distance objective as in the standard $k$-median problem while ensuring that all clusters have an \u201capproximately equal\u201d number of points of each color. 
(Chierichetti et al., NeurIPS 2017) proposed a two-phase algorithm for fair $k$-clustering. In the first step, the pointset is partitioned into subsets called fairlets that satisfy the fairness requirement and approximately preserve the k-median objective. In the second step, fairlets are merged into k clusters by one of the existing k-median algorithms. The running time of this algorithm is dominated by the first step, which takes super-quadratic time. In this paper, we present a practical approximate fairlet decomposition algorithm that runs in nearly linear time.}\n}", "pdf": "http://proceedings.mlr.press/v97/backurs19a/backurs19a.pdf", "supp": "", "pdf_size": 1533719, "gs_citation": 277, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16665021693225941817&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/backurs19a.html" }, { "title": "Scalable Learning in Reproducing Kernel Krein Spaces", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3763", "id": "3763", "author_site": "Dino Oglic, Thomas Gaertner", "author": "Dino Oglic; Thomas G\u00e4rtner", "abstract": "We provide the first mathematically complete derivation of the Nystr{\u00f6}m method for low-rank approximation of indefinite kernels and propose an efficient method for finding an approximate eigendecomposition of such kernel matrices. Building on this result, we devise highly scalable methods for learning in reproducing kernel Krein spaces. The devised approaches provide a principled and theoretically well-founded means to tackle large scale learning problems with indefinite kernels. The main motivation for our work comes from problems with structured representations (e.g., graphs, strings, time-series), where it is relatively easy to devise a pairwise (dis)similarity function based on intuition and/or knowledge of domain experts. Such functions are typically not positive definite and it is often well beyond the expertise of practitioners to verify this condition. The effectiveness of the devised approaches is evaluated empirically using indefinite kernels defined on structured and vectorial data representations.", "bibtex": "@InProceedings{pmlr-v97-oglic19a,\n title = \t {Scalable Learning in Reproducing Kernel Krein Spaces},\n author = {Oglic, Dino and G{\\\"a}rtner, Thomas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4912--4921},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/oglic19a/oglic19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/oglic19a.html},\n abstract = \t {We provide the first mathematically complete derivation of the Nystr{\u00f6}m method for low-rank approximation of indefinite kernels and propose an efficient method for finding an approximate eigendecomposition of such kernel matrices. Building on this result, we devise highly scalable methods for learning in reproducing kernel Krein spaces. The devised approaches provide a principled and theoretically well-founded means to tackle large scale learning problems with indefinite kernels. 
The main motivation for our work comes from problems with structured representations (e.g., graphs, strings, time-series), where it is relatively easy to devise a pairwise (dis)similarity function based on intuition and/or knowledge of domain experts. Such functions are typically not positive definite and it is often well beyond the expertise of practitioners to verify this condition. The effectiveness of the devised approaches is evaluated empirically using indefinite kernels defined on structured and vectorial data representations.}\n}", "pdf": "http://proceedings.mlr.press/v97/oglic19a/oglic19a.pdf", "supp": "", "pdf_size": 442150, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17308665770377403997&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Informatics, King\u2019s College London, UK; School of Computer Science, University of Nottingham, UK", "aff_domain": "uni-bonn.de; ", "email": "uni-bonn.de; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/oglic19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "King\u2019s College London;University of Nottingham", "aff_unique_dep": "Department of Informatics;School of Computer Science", "aff_unique_url": "https://www.kcl.ac.uk;https://www.nottingham.ac.uk", "aff_unique_abbr": "KCL;UoN", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Scalable Metropolis-Hastings for Exact Bayesian Inference with Large Datasets", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4131", "id": "4131", "author_site": "Rob Cornish, Paul Vanetti, Alexandre Bouchard-C\u00f4t\u00e9, George Deligiannidis, Arnaud Doucet", "author": "Rob Cornish; Paul Vanetti; Alexandre Bouchard-Cote; George Deligiannidis; Arnaud Doucet", "abstract": "Bayesian inference via standard Markov Chain Monte Carlo (MCMC) methods such as Metropolis-Hastings is too computationally intensive to handle large datasets, since the cost per step usually scales like $O(n)$ in the number of data points $n$. We propose the Scalable Metropolis-Hastings (SMH) kernel that only requires processing on average $O(1)$ or even $O(1/\\sqrt{n})$ data points per step. This scheme is based on a combination of factorized acceptance probabilities, procedures for fast simulation of Bernoulli processes, and control variate ideas. Contrary to many MCMC subsampling schemes such as fixed step-size Stochastic Gradient Langevin Dynamics, our approach is exact insofar as the invariant distribution is the true posterior and not an approximation to it. We characterise the performance of our algorithm theoretically, and give realistic and verifiable conditions under which it is geometrically ergodic. 
This theory is borne out by empirical results that demonstrate overall performance benefits over standard Metropolis-Hastings and various subsampling algorithms.", "bibtex": "@InProceedings{pmlr-v97-cornish19a,\n title = \t {Scalable {M}etropolis-{H}astings for Exact {B}ayesian Inference with Large Datasets},\n author = {Cornish, Rob and Vanetti, Paul and Bouchard-Cote, Alexandre and Deligiannidis, George and Doucet, Arnaud},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1351--1360},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cornish19a/cornish19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cornish19a.html},\n abstract = \t {Bayesian inference via standard Markov Chain Monte Carlo (MCMC) methods such as Metropolis-Hastings is too computationally intensive to handle large datasets, since the cost per step usually scales like $O(n)$ in the number of data points $n$. We propose the Scalable Metropolis-Hastings (SMH) kernel that only requires processing on average $O(1)$ or even $O(1/\\sqrt{n})$ data points per step. This scheme is based on a combination of factorized acceptance probabilities, procedures for fast simulation of Bernoulli processes, and control variate ideas. Contrary to many MCMC subsampling schemes such as fixed step-size Stochastic Gradient Langevin Dynamics, our approach is exact insofar as the invariant distribution is the true posterior and not an approximation to it. We characterise the performance of our algorithm theoretically, and give realistic and verifiable conditions under which it is geometrically ergodic. This theory is borne out by empirical results that demonstrate overall performance benefits over standard Metropolis-Hastings and various subsampling algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/cornish19a/cornish19a.pdf", "supp": "", "pdf_size": 529728, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10400262915897387298&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "University of Oxford; University of Oxford; University of British Columbia; University of Oxford + The Alan Turing Institute; University of Oxford + The Alan Turing Institute", "aff_domain": "robots.ox.ac.uk; ; ; ; ", "email": "robots.ox.ac.uk; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/cornish19a.html", "aff_unique_index": "0;0;1;0+2;0+2", "aff_unique_norm": "University of Oxford;University of British Columbia;Alan Turing Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ox.ac.uk;https://www.ubc.ca;https://www.turing.ac.uk", "aff_unique_abbr": "Oxford;UBC;ATI", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0+0;0+0", "aff_country_unique": "United Kingdom;Canada" }, { "title": "Scalable Nonparametric Sampling from Multimodal Posteriors with the Posterior Bootstrap", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4150", "id": "4150", "author_site": "Edwin Fong, Simon Lyddon, Christopher Holmes", "author": "Edwin Fong; Simon Lyddon; Chris Holmes", "abstract": "Increasingly complex datasets pose a number of challenges for Bayesian inference. 
Conventional posterior sampling based on Markov chain Monte Carlo can be too computationally intensive, is serial in nature and mixes poorly between posterior modes. Furthermore, all models are misspecified, which brings into question the validity of the conventional Bayesian update. We present a scalable Bayesian nonparametric learning routine that enables posterior sampling through the optimization of suitably randomized objective functions. A Dirichlet process prior on the unknown data distribution accounts for model misspecification, and admits an embarrassingly parallel posterior bootstrap algorithm that generates independent and exact samples from the nonparametric posterior distribution. Our method is particularly adept at sampling from multimodal posterior distributions via a random restart mechanism, and we demonstrate this on Gaussian mixture model and sparse logistic regression examples.", "bibtex": "@InProceedings{pmlr-v97-fong19a,\n title = \t {Scalable Nonparametric Sampling from Multimodal Posteriors with the Posterior Bootstrap},\n author = {Fong, Edwin and Lyddon, Simon and Holmes, Chris},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1952--1962},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/fong19a/fong19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/fong19a.html},\n abstract = \t {Increasingly complex datasets pose a number of challenges for Bayesian inference. Conventional posterior sampling based on Markov chain Monte Carlo can be too computationally intensive, is serial in nature and mixes poorly between posterior modes. Furthermore, all models are misspecified, which brings into question the validity of the conventional Bayesian update. We present a scalable Bayesian nonparametric learning routine that enables posterior sampling through the optimization of suitably randomized objective functions. A Dirichlet process prior on the unknown data distribution accounts for model misspecification, and admits an embarrassingly parallel posterior bootstrap algorithm that generates independent and exact samples from the nonparametric posterior distribution. 
Our method is particularly adept at sampling from multimodal posterior distributions via a random restart mechanism, and we demonstrate this on Gaussian mixture model and sparse logistic regression examples.}\n}", "pdf": "http://proceedings.mlr.press/v97/fong19a/fong19a.pdf", "supp": "", "pdf_size": 1824394, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14627195645565170893&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Statistics, University of Oxford, Oxford, United Kingdom+The Alan Turing Institute, London, United Kingdom; Department of Statistics, University of Oxford, Oxford, United Kingdom; Department of Statistics, University of Oxford, Oxford, United Kingdom+The Alan Turing Institute, London, United Kingdom", "aff_domain": "stats.ox.ac.uk; ; ", "email": "stats.ox.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/fong19a.html", "aff_unique_index": "0+1;0;0+1", "aff_unique_norm": "University of Oxford;Alan Turing Institute", "aff_unique_dep": "Department of Statistics;", "aff_unique_url": "https://www.ox.ac.uk;https://www.turing.ac.uk", "aff_unique_abbr": "Oxford;ATI", "aff_campus_unique_index": "0+1;0;0+1", "aff_campus_unique": "Oxford;London", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "United Kingdom" }, { "title": "Scalable Training of Inference Networks for Gaussian-Process Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3746", "id": "3746", "author_site": "Jiaxin Shi, Mohammad Emtiyaz Khan, Jun Zhu", "author": "Jiaxin Shi; Mohammad Emtiyaz Khan; Jun Zhu", "abstract": "Inference in Gaussian process (GP) models is computationally challenging for large data, and often difficult to approximate with a small number of inducing points. We explore an alternative approximation that employs stochastic inference networks for a flexible inference. Unfortunately, for such networks, minibatch training is difficult to be able to learn meaningful correlations over function outputs for a large dataset. We propose an algorithm that enables such training by tracking a stochastic, functional mirror-descent algorithm. At each iteration, this only requires considering a finite number of input locations, resulting in a scalable and easy-to-implement algorithm. Empirical results show comparable and, sometimes, superior performance to existing sparse variational GP methods.", "bibtex": "@InProceedings{pmlr-v97-shi19a,\n title = \t {Scalable Training of Inference Networks for {G}aussian-Process Models},\n author = {Shi, Jiaxin and Khan, Mohammad Emtiyaz and Zhu, Jun},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5758--5768},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/shi19a/shi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/shi19a.html},\n abstract = \t {Inference in Gaussian process (GP) models is computationally challenging for large data, and often difficult to approximate with a small number of inducing points. We explore an alternative approximation that employs stochastic inference networks for a flexible inference. 
Unfortunately, for such networks, minibatch training is difficult to be able to learn meaningful correlations over function outputs for a large dataset. We propose an algorithm that enables such training by tracking a stochastic, functional mirror-descent algorithm. At each iteration, this only requires considering a finite number of input locations, resulting in a scalable and easy-to-implement algorithm. Empirical results show comparable and, sometimes, superior performance to existing sparse variational GP methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/shi19a/shi19a.pdf", "supp": "", "pdf_size": 1378340, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18315311533765480343&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Dept. of Comp. Sci. & Tech., Institute for AI, BNRist Center, THBI Lab, Tsinghua University, Beijing, China; RIKEN Center for Advanced Intelligence project, Tokyo, Japan; Dept. of Comp. Sci. & Tech., Institute for AI, BNRist Center, THBI Lab, Tsinghua University, Beijing, China", "aff_domain": "mails.tsinghua.edu.cn; ;tsinghua.edu.cn", "email": "mails.tsinghua.edu.cn; ;tsinghua.edu.cn", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/shi19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Tsinghua University;RIKEN Center for Advanced Intelligence Project", "aff_unique_dep": "Dept. of Comp. Sci. & Tech.;Center for Advanced Intelligence", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.riken.jp/en/crai/", "aff_unique_abbr": "THU;RIKEN CRAI", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Beijing;Tokyo", "aff_country_unique_index": "0;1;0", "aff_country_unique": "China;Japan" }, { "title": "Scale-free adaptive planning for deterministic dynamics & discounted rewards", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4144", "id": "4144", "author_site": "Peter Bartlett, Victor Gabillon, Jennifer Healey, Michal Valko", "author": "Peter Bartlett; Victor Gabillon; Jennifer Healey; Michal Valko", "abstract": "We address the problem of planning in an environment with deterministic dynamics and stochastic discounted rewards under a limited numerical budget where the ranges of both rewards and noise are unknown. We introduce PlaTypOOS, an adaptive, robust, and efficient alternative to the OLOP (open-loop optimistic planning) algorithm. Whereas OLOP requires a priori knowledge of the ranges of both rewards and noise, PlaTypOOS dynamically adapts its behavior to both. This allows PlaTypOOS to be immune to two vulnerabilities of OLOP: failure when given underestimated ranges of noise and rewards and inefficiency when these are overestimated. PlaTypOOS additionally adapts to the global smoothness of the value function. PlaTypOOS acts in a provably more efficient manner vs. 
OLOP when OLOP is given an overestimated reward and show that in the case of no noise, PlaTypOOS learns exponentially faster.", "bibtex": "@InProceedings{pmlr-v97-bartlett19a,\n title = \t {Scale-free adaptive planning for deterministic dynamics & discounted rewards},\n author = {Bartlett, Peter and Gabillon, Victor and Healey, Jennifer and Valko, Michal},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {495--504},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bartlett19a/bartlett19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bartlett19a.html},\n abstract = \t {We address the problem of planning in an environment with deterministic dynamics and stochastic discounted rewards under a limited numerical budget where the ranges of both rewards and noise are unknown. We introduce PlaTypOOS, an adaptive, robust, and efficient alternative to the OLOP (open-loop optimistic planning) algorithm. Whereas OLOP requires a priori knowledge of the ranges of both rewards and noise, PlaTypOOS dynamically adapts its behavior to both. This allows PlaTypOOS to be immune to two vulnerabilities of OLOP: failure when given underestimated ranges of noise and rewards and inefficiency when these are overestimated. PlaTypOOS additionally adapts to the global smoothness of the value function. PlaTypOOS acts in a provably more efficient manner vs. OLOP when OLOP is given an overestimated reward and show that in the case of no noise, PlaTypOOS learns exponentially faster.}\n}", "pdf": "http://proceedings.mlr.press/v97/bartlett19a/bartlett19a.pdf", "supp": "", "pdf_size": 525879, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=354209837175768345&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of California, Berkeley, USA; Noah\u2019s Ark Lab, Huawei Technologies, London, UK; Adobe Research, San Jose, USA; SequeL team, INRIA Lille - Nord Europe, France", "aff_domain": "berkeley.edu;huawei.com;adobe.com;inria.fr", "email": "berkeley.edu;huawei.com;adobe.com;inria.fr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/bartlett19a.html", "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of California, Berkeley;Huawei;Adobe;INRIA Lille - Nord Europe", "aff_unique_dep": ";Noah\u2019s Ark Lab;Adobe Research;SequeL team", "aff_unique_url": "https://www.berkeley.edu;https://www.huawei.com;https://research.adobe.com;https://www.inria.fr/en/centre/lille-nord-europe", "aff_unique_abbr": "UC Berkeley;Huawei;Adobe;INRIA", "aff_campus_unique_index": "0;1;2;3", "aff_campus_unique": "Berkeley;London;San Jose;Lille", "aff_country_unique_index": "0;1;0;2", "aff_country_unique": "United States;United Kingdom;France" }, { "title": "Scaling Up Ordinal Embedding: A Landmark Approach", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4058", "id": "4058", "author_site": "Jesse Anderton, Javed Aslam", "author": "Jesse Anderton; Javed Aslam", "abstract": "Ordinal Embedding is the problem of placing n objects into R^d to satisfy constraints like \"object a is closer to b than to c.\" It can accommodate data that embeddings from features or distances cannot, but is a more difficult problem. 
We propose a novel landmark-based method as a partial solution. At small to medium scales, we present a novel combination of existing methods with some new theoretical justification. For very large values of n optimizing over an entire embedding breaks down, so we propose a novel method which first embeds a subset of m << n objects and then embeds the remaining objects independently and in parallel. We prove a distance error bound for our method in terms of m and that it has O(dn log m) time complexity, and show empirically that it is able to produce high quality embeddings in a fraction of the time needed for any published method.", "bibtex": "@InProceedings{pmlr-v97-anderton19a,\n title = \t {Scaling Up Ordinal Embedding: A Landmark Approach},\n author = {Anderton, Jesse and Aslam, Javed},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {282--290},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/anderton19a/anderton19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/anderton19a.html},\n abstract = \t {Ordinal Embedding is the problem of placing n objects into R^d to satisfy constraints like \"object a is closer to b than to c.\" It can accommodate data that embeddings from features or distances cannot, but is a more difficult problem. We propose a novel landmark-based method as a partial solution. At small to medium scales, we present a novel combination of existing methods with some new theoretical justification. For very large values of n optimizing over an entire embedding breaks down, so we propose a novel method which first embeds a subset of m << n objects and then embeds the remaining objects independently and in parallel. 
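As a rough illustration of the landmark strategy in the "Scaling Up Ordinal Embedding" abstract above, the sketch below places a single held-out object against already-embedded landmarks by minimizing hinge losses over its triplet comparisons; because each object depends only on the fixed landmarks, objects can be placed independently and in parallel. Function names and toy data are invented for illustration and are not the authors' implementation.

```python
import numpy as np

def embed_object(landmarks, triplets, d=2, lr=0.05, steps=300, margin=0.1, seed=0):
    """Place one object given ordinal constraints (i, j) meaning
    'the object is closer to landmark i than to landmark j'."""
    rng = np.random.default_rng(seed)
    x = rng.normal(scale=0.1, size=d)
    for _ in range(steps):
        grad = np.zeros(d)
        for i, j in triplets:
            di2 = np.sum((x - landmarks[i]) ** 2)
            dj2 = np.sum((x - landmarks[j]) ** 2)
            if di2 - dj2 + margin > 0:                  # hinge active: constraint violated or inside margin
                grad += 2.0 * (landmarks[j] - landmarks[i])   # d/dx (di2 - dj2)
        x -= lr * grad / max(len(triplets), 1)
    return x

# Toy example: 4 landmarks in the plane; the constraints pull the object toward landmark 0.
landmarks = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
triplets = [(0, 1), (0, 2), (0, 3), (1, 3), (2, 3)]
print(np.round(embed_object(landmarks, triplets), 2))
```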
We prove a distance error bound for our method in terms of m and that it has O(dn log m) time complexity, and show empirically that it is able to produce high quality embeddings in a fraction of the time needed for any published method.}\n}", "pdf": "http://proceedings.mlr.press/v97/anderton19a/anderton19a.pdf", "supp": "", "pdf_size": 960569, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8271494065986770449&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff": "College of Computer and Information Science, Northeastern University, Boston, Massachusetts; College of Computer and Information Science, Northeastern University, Boston, Massachusetts", "aff_domain": "ccs.neu.edu; ", "email": "ccs.neu.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/anderton19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "College of Computer and Information Science", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Screening rules for Lasso with non-convex Sparse Regularizers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4181", "id": "4181", "author_site": "alain rakotomamonjy, Gilles Gasso, Joseph Salmon", "author": "Alain Rakotomamonjy; Gilles Gasso; Joseph Salmon", "abstract": "Leveraging on the convexity of the Lasso problem, screening rules help in accelerating solvers by discarding irrelevant variables, during the optimization process. However, because they provide better theoretical guarantees in identifying relevant\tvariables, several non-convex regularizers for the Lasso have been proposed in the literature. This work is the first that introduces a screening rule strategy into a non-convex Lasso solver. The approach we propose is based on a iterative majorization-minimization (MM) strategy that includes a screening rule in the inner solver and a condition for propagating screened variables between iterations of MM. In addition to improve efficiency of solvers, we also provide guarantees that the inner solver is able to identify the zeros components of its critical point in finite time. Our experimental analysis illustrates the significant computational gain brought by the new screening rule compared to classical coordinate-descent or proximal gradient descent methods.", "bibtex": "@InProceedings{pmlr-v97-rakotomamonjy19a,\n title = \t {Screening rules for Lasso with non-convex Sparse Regularizers},\n author = {Rakotomamonjy, Alain and Gasso, Gilles and Salmon, Joseph},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5341--5350},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rakotomamonjy19a/rakotomamonjy19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rakotomamonjy19a.html},\n abstract = \t {Leveraging on the convexity of the Lasso problem, screening rules help in accelerating solvers by discarding irrelevant variables, during the optimization process. 
However, because they provide better theoretical guarantees in identifying relevant\tvariables, several non-convex regularizers for the Lasso have been proposed in the literature. This work is the first that introduces a screening rule strategy into a non-convex Lasso solver. The approach we propose is based on a iterative majorization-minimization (MM) strategy that includes a screening rule in the inner solver and a condition for propagating screened variables between iterations of MM. In addition to improve efficiency of solvers, we also provide guarantees that the inner solver is able to identify the zeros components of its critical point in finite time. Our experimental analysis illustrates the significant computational gain brought by the new screening rule compared to classical coordinate-descent or proximal gradient descent methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/rakotomamonjy19a/rakotomamonjy19a.pdf", "supp": "", "pdf_size": 4082585, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16753066935443894802&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "LITIS, Normandy University, University of Rouen Normandie, INSA Rouen Normadie, France+Criteo AI Lab, Paris, France; LITIS, Normandy University, University of Rouen Normandie, INSA Rouen Normadie, France; IMAG, Univ Montpellier, CNRS, Montpellier, France", "aff_domain": "univ-rouen.fr; ; ", "email": "univ-rouen.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/rakotomamonjy19a.html", "aff_unique_index": "0+1;0;2", "aff_unique_norm": "University of Rouen Normandie;Criteo;University of Montpellier", "aff_unique_dep": "LITIS;Criteo AI Lab;IMAG", "aff_unique_url": "https://www.univ-rouen.fr;https://www.criteo.com;https://www.univ-montp1.fr", "aff_unique_abbr": ";Criteo;Univ Montpellier", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Paris;Montpellier", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "France" }, { "title": "SelectiveNet: A Deep Neural Network with an Integrated Reject Option", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4020", "id": "4020", "author_site": "Yonatan Geifman, Ran El-Yaniv", "author": "Yonatan Geifman; Ran El-Yaniv", "abstract": "We consider the problem of selective prediction (also known as reject option) in deep neural networks, and introduce SelectiveNet, a deep neural architecture with an integrated reject option. Existing rejection mechanisms are based mostly on a threshold over the prediction confidence of a pre-trained network. In contrast, SelectiveNet is trained to optimize both classification (or regression) and rejection simultaneously, end-to-end. The result is a deep neural network that is optimized over the covered domain. 
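For the selective-prediction setting in the SelectiveNet abstract above, a minimal numpy sketch of the simple confidence-threshold baseline (the kind of post-hoc rejection the paper contrasts with, not SelectiveNet's jointly trained selection head) shows how a risk-coverage curve is computed from per-example confidences; the toy data and names are illustrative assumptions.

```python
import numpy as np

def risk_coverage_curve(confidence, correct):
    """Accept the most confident examples first and report (coverage, selective risk)."""
    order = np.argsort(-confidence)                 # most confident first
    errors = (~correct[order]).astype(float)
    n_kept = np.arange(1, len(order) + 1)
    coverage = n_kept / len(order)
    risk = np.cumsum(errors) / n_kept               # error rate among accepted examples
    return coverage, risk

rng = np.random.default_rng(0)
conf = rng.uniform(size=1000)
correct = rng.uniform(size=1000) < conf             # toy data: higher confidence -> more often correct
cov, risk = risk_coverage_curve(conf, correct)
print(f"risk at 50% coverage: {risk[len(risk)//2]:.3f}, at full coverage: {risk[-1]:.3f}")
```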
In our experiments, we show a consistently improved risk-coverage trade-off over several well-known classification and regression datasets, thus reaching new state-of-the-art results for deep selective classification.", "bibtex": "@InProceedings{pmlr-v97-geifman19a,\n title = \t {{S}elective{N}et: A Deep Neural Network with an Integrated Reject Option},\n author = {Geifman, Yonatan and El-Yaniv, Ran},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2151--2159},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/geifman19a/geifman19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/geifman19a.html},\n abstract = \t {We consider the problem of selective prediction (also known as reject option) in deep neural networks, and introduce SelectiveNet, a deep neural architecture with an integrated reject option. Existing rejection mechanisms are based mostly on a threshold over the prediction confidence of a pre-trained network. In contrast, SelectiveNet is trained to optimize both classification (or regression) and rejection simultaneously, end-to-end. The result is a deep neural network that is optimized over the covered domain. In our experiments, we show a consistently improved risk-coverage trade-off over several well-known classification and regression datasets, thus reaching new state-of-the-art results for deep selective classification.}\n}", "pdf": "http://proceedings.mlr.press/v97/geifman19a/geifman19a.pdf", "supp": "", "pdf_size": 677034, "gs_citation": 420, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3455752188101558663&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Technion - Israel Institute of Technology; Technion - Israel Institute of Technology", "aff_domain": "cs.technion.ac.il; ", "email": "cs.technion.ac.il; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/geifman19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Self-Attention Generative Adversarial Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3774", "id": "3774", "author_site": "Han Zhang, Ian Goodfellow, Dimitris Metaxas, Augustus Odena", "author": "Han Zhang; Ian Goodfellow; Dimitris Metaxas; Augustus Odena", "abstract": "In this paper, we propose the Self-Attention Generative Adversarial Network (SAGAN) which allows attention-driven, long-range dependency modeling for image generation tasks. Traditional convolutional GANs generate high-resolution details as a function of only spatially local points in lower-resolution feature maps. In SAGAN, details can be generated using cues from all feature locations. Moreover, the discriminator can check that highly detailed features in distant portions of the image are consistent with each other. Furthermore, recent work has shown that generator conditioning affects GAN performance. Leveraging this insight, we apply spectral normalization to the GAN generator and find that this improves training dynamics. 
The proposed SAGAN performs better than prior work, boosting the best published Inception score from 36.8 to 52.52 and reducing Fr\u00e9chet Inception distance from 27.62 to 18.65 on the challenging ImageNet dataset. Visualization of the attention layers shows that the generator leverages neighborhoods that correspond to object shapes rather than local regions of fixed shape.", "bibtex": "@InProceedings{pmlr-v97-zhang19d,\n title = \t {Self-Attention Generative Adversarial Networks},\n author = {Zhang, Han and Goodfellow, Ian and Metaxas, Dimitris and Odena, Augustus},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7354--7363},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19d/zhang19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19d.html},\n abstract = \t {In this paper, we propose the Self-Attention Generative Adversarial Network (SAGAN) which allows attention-driven, long-range dependency modeling for image generation tasks. Traditional convolutional GANs generate high-resolution details as a function of only spatially local points in lower-resolution feature maps. In SAGAN, details can be generated using cues from all feature locations. Moreover, the discriminator can check that highly detailed features in distant portions of the image are consistent with each other. Furthermore, recent work has shown that generator conditioning affects GAN performance. Leveraging this insight, we apply spectral normalization to the GAN generator and find that this improves training dynamics. The proposed SAGAN performs better than prior work, boosting the best published Inception score from 36.8 to 52.52 and reducing Fr\u00e9chet Inception distance from 27.62 to 18.65 on the challenging ImageNet dataset. 
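The long-range dependency modeling described in the SAGAN abstract above comes from a non-local self-attention block over spatial positions of a feature map. A minimal single-example numpy sketch of that mechanism (random weights stand in for learned projections; real SAGAN layers also use 1x1 convolutions, multiple heads within the generator/discriminator, and spectral normalization) is:

```python
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def self_attention_2d(x, Wq, Wk, Wv, gamma=0.1):
    """Self-attention over the H*W positions of a C x H x W feature map, with a residual."""
    C, H, W = x.shape
    flat = x.reshape(C, H * W)                 # channels x positions
    q, k, v = Wq @ flat, Wk @ flat, Wv @ flat
    attn = softmax(q.T @ k, axis=-1)           # (n, n): each position attends to all positions
    out = v @ attn.T                           # attention-weighted values, (C, n)
    return x + gamma * out.reshape(C, H, W)    # gamma scales how much attention modifies the features

rng = np.random.default_rng(0)
C, Cq, H, W = 8, 2, 5, 5
x = rng.normal(size=(C, H, W))
y = self_attention_2d(x, rng.normal(size=(Cq, C)), rng.normal(size=(Cq, C)), rng.normal(size=(C, C)))
print(y.shape)  # (8, 5, 5)
```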
Visualization of the attention layers shows that the generator leverages neighborhoods that correspond to object shapes rather than local regions of fixed shape.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19d/zhang19d.pdf", "supp": "", "pdf_size": 6213860, "gs_citation": 5274, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7330853420568873733&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Department of Computer Science, Rutgers University+Google Research, Brain Team; Google Research, Brain Team; Department of Computer Science, Rutgers University; Google Research, Brain Team", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/zhang19d.html", "aff_unique_index": "0+1;1;0;1", "aff_unique_norm": "Rutgers University;Google", "aff_unique_dep": "Department of Computer Science;Google Research", "aff_unique_url": "https://www.rutgers.edu;https://research.google", "aff_unique_abbr": "Rutgers;Google", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Self-Attention Graph Pooling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3965", "id": "3965", "author_site": "Junhyun Lee, Inyeop Lee, Jaewoo Kang", "author": "Junhyun Lee; Inyeop Lee; Jaewoo Kang", "abstract": "Advanced methods of applying deep learning to structured data such as graphs have been proposed in recent years. In particular, studies have focused on generalizing convolutional neural networks to graph data, which includes redefining the convolution and the downsampling (pooling) operations for graphs. The method of generalizing the convolution operation to graphs has been proven to improve performance and is widely used. However, the method of applying downsampling to graphs is still difficult to perform and has room for improvement. In this paper, we propose a graph pooling method based on self-attention. Self-attention using graph convolution allows our pooling method to consider both node features and graph topology. To ensure a fair comparison, the same training procedures and model architectures were used for the existing pooling methods and our method. The experimental results demonstrate that our method achieves superior graph classification performance on the benchmark datasets using a reasonable number of parameters.", "bibtex": "@InProceedings{pmlr-v97-lee19c,\n title = \t {Self-Attention Graph Pooling},\n author = {Lee, Junhyun and Lee, Inyeop and Kang, Jaewoo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3734--3743},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lee19c/lee19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/lee19c.html},\n abstract = \t {Advanced methods of applying deep learning to structured data such as graphs have been proposed in recent years. In particular, studies have focused on generalizing convolutional neural networks to graph data, which includes redefining the convolution and the downsampling (pooling) operations for graphs. The method of generalizing the convolution operation to graphs has been proven to improve performance and is widely used. 
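The self-attention pooling idea summarized in the "Self-Attention Graph Pooling" abstract above can be sketched in a few lines: score nodes with one graph convolution, keep the top-ranked nodes, and gate their features by the (squashed) scores. This is a simplified illustration with assumed shapes, not the authors' code.

```python
import numpy as np

def sag_pool(A, X, W, ratio=0.5):
    """Score nodes with a single GCN layer, keep the top ratio of nodes, gate by tanh(score)."""
    n = A.shape[0]
    A_hat = A + np.eye(n)                                        # self-loops
    d_inv_sqrt = 1.0 / np.sqrt(A_hat.sum(axis=1))
    A_norm = A_hat * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]   # D^-1/2 (A+I) D^-1/2
    scores = (A_norm @ X @ W).ravel()                            # one scalar attention score per node
    k = max(1, int(np.ceil(ratio * n)))
    idx = np.argsort(-scores)[:k]                                # retained nodes
    X_pooled = X[idx] * np.tanh(scores[idx])[:, None]            # gate features by attention
    A_pooled = A[np.ix_(idx, idx)]                               # induced subgraph
    return A_pooled, X_pooled, idx

rng = np.random.default_rng(0)
n, f = 6, 4
A = (rng.uniform(size=(n, n)) < 0.4).astype(float)
A = np.triu(A, 1); A = A + A.T                                   # symmetric adjacency, no self-loops
X, W = rng.normal(size=(n, f)), rng.normal(size=(f, 1))
A_p, X_p, kept = sag_pool(A, X, W, ratio=0.5)
print(kept, X_p.shape)
```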
However, the method of applying downsampling to graphs is still difficult to perform and has room for improvement. In this paper, we propose a graph pooling method based on self-attention. Self-attention using graph convolution allows our pooling method to consider both node features and graph topology. To ensure a fair comparison, the same training procedures and model architectures were used for the existing pooling methods and our method. The experimental results demonstrate that our method achieves superior graph classification performance on the benchmark datasets using a reasonable number of parameters.}\n}", "pdf": "http://proceedings.mlr.press/v97/lee19c/lee19c.pdf", "supp": "", "pdf_size": 471691, "gs_citation": 1542, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8950252210828065007&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science and Engineering, Korea University; Department of Computer Science and Engineering, Korea University; Department of Computer Science and Engineering, Korea University", "aff_domain": "korea.ac.kr;korea.ac.kr;korea.ac.kr", "email": "korea.ac.kr;korea.ac.kr;korea.ac.kr", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/lee19c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea University", "aff_unique_dep": "Department of Computer Science and Engineering", "aff_unique_url": "https://www.korea.ac.kr", "aff_unique_abbr": "KU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Self-Supervised Exploration via Disagreement", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4234", "id": "4234", "author_site": "Deepak Pathak, Dhiraj Gandhi, Abhinav Gupta", "author": "Deepak Pathak; Dhiraj Gandhi; Abhinav Gupta", "abstract": "Efficient exploration is a long-standing problem in sensorimotor learning. Major advances have been demonstrated in noise-free, non-stochastic domains such as video games and simulation. However, most of these formulations either get stuck in environments with stochastic dynamics or are too inefficient to be scalable to real robotics setups. In this paper, we propose a formulation for exploration inspired by the work in active learning literature. Specifically, we train an ensemble of dynamics models and incentivize the agent to explore such that the disagreement of those ensembles is maximized. This allows the agent to learn skills by exploring in a self-supervised manner without any external reward. Notably, we further leverage the disagreement objective to optimize the agent\u2019s policy in a differentiable manner, without using reinforcement learning, which results in a sample-efficient exploration. We demonstrate the efficacy of this formulation across a variety of benchmark environments including stochastic-Atari, Mujoco and Unity. Finally, we implement our differentiable exploration on a real robot which learns to interact with objects completely from scratch. 
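The disagreement objective in the "Self-Supervised Exploration via Disagreement" abstract above reduces to a simple quantity: the variance of next-state predictions across an ensemble of dynamics models. A toy numpy sketch (random linear models stand in for the trained ensemble; all names are illustrative) is:

```python
import numpy as np

def disagreement_reward(ensemble, state, action):
    """Intrinsic reward = variance of next-state predictions across the ensemble."""
    inp = np.concatenate([state, action])
    preds = np.stack([W @ inp for W in ensemble])   # (k, state_dim) predicted next states
    return preds.var(axis=0).mean()                 # high disagreement -> high exploration bonus

# Models trained on different data would disagree most in rarely visited regions;
# here random weights merely stand in for trained dynamics models.
rng = np.random.default_rng(0)
state_dim, action_dim, k = 3, 2, 5
ensemble = [rng.normal(scale=0.1, size=(state_dim, state_dim + action_dim)) for _ in range(k)]
s, a = rng.normal(size=state_dim), rng.normal(size=action_dim)
print(disagreement_reward(ensemble, s, a))
```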
Project videos and code are at https://pathak22.github.io/exploration-by-disagreement/", "bibtex": "@InProceedings{pmlr-v97-pathak19a,\n title = \t {Self-Supervised Exploration via Disagreement},\n author = {Pathak, Deepak and Gandhi, Dhiraj and Gupta, Abhinav},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5062--5071},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/pathak19a/pathak19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/pathak19a.html},\n abstract = \t {Efficient exploration is a long-standing problem in sensorimotor learning. Major advances have been demonstrated in noise-free, non-stochastic domains such as video games and simulation. However, most of these formulations either get stuck in environments with stochastic dynamics or are too inefficient to be scalable to real robotics setups. In this paper, we propose a formulation for exploration inspired by the work in active learning literature. Specifically, we train an ensemble of dynamics models and incentivize the agent to explore such that the disagreement of those ensembles is maximized. This allows the agent to learn skills by exploring in a self-supervised manner without any external reward. Notably, we further leverage the disagreement objective to optimize the agent\u2019s policy in a differentiable manner, without using reinforcement learning, which results in a sample-efficient exploration. We demonstrate the efficacy of this formulation across a variety of benchmark environments including stochastic-Atari, Mujoco and Unity. Finally, we implement our differentiable exploration on a real robot which learns to interact with objects completely from scratch. Project videos and code are at https://pathak22.github.io/exploration-by-disagreement/}\n}", "pdf": "http://proceedings.mlr.press/v97/pathak19a/pathak19a.pdf", "supp": "", "pdf_size": 4530583, "gs_citation": 485, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13780996231531586358&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "UC Berkeley; CMU; Facebook AI Research", "aff_domain": "cs.berkeley.edu; ; ", "email": "cs.berkeley.edu; ; ", "github": "", "project": "https://pathak22.github.io/exploration-by-disagreement/", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/pathak19a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Berkeley;Carnegie Mellon University;Meta", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://www.berkeley.edu;https://www.cmu.edu;https://research.facebook.com", "aff_unique_abbr": "UC Berkeley;CMU;FAIR", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Self-similar Epochs: Value in arrangement", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4107", "id": "4107", "author_site": "Eliav Buchnik, Edith Cohen, Avinatan Hasidim, Yossi Matias", "author": "Eliav Buchnik; Edith Cohen; Avinatan Hasidim; Yossi Matias", "abstract": "Optimization of machine learning models is commonly performed through stochastic gradient updates on randomly ordered training examples. 
This practice means that each fraction of an epoch comprises an independent random sample of the training data that may not preserve informative structure present in the full data. We hypothesize that the training can be more effective with", "bibtex": "@InProceedings{pmlr-v97-buchnik19a,\n title = \t {Self-similar Epochs: Value in arrangement},\n author = {Buchnik, Eliav and Cohen, Edith and Hasidim, Avinatan and Matias, Yossi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {841--850},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/buchnik19a/buchnik19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/buchnik19a.html},\n abstract = \t {Optimization of machine learning models is commonly performed through stochastic gradient updates on randomly ordered training examples. This practice means that each fraction of an epoch comprises an independent random sample of the training data that may not preserve informative structure present in the full data. We hypothesize that the training can be more effective with", "pdf": "http://proceedings.mlr.press/v97/buchnik19a/buchnik19a.pdf", "supp": "", "pdf_size": 3496446, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9674609023302044986&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Tel Aviv University, Israel+Google Research; Google Research+Tel Aviv University, Israel; Google Research; Google Research", "aff_domain": "gmail.com;cohenwang.com; ; ", "email": "gmail.com;cohenwang.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/buchnik19a.html", "aff_unique_index": "0+1;1+0;1;1", "aff_unique_norm": "Tel Aviv University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.tau.ac.il;https://research.google", "aff_unique_abbr": "TAU;Google Research", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0+1;1+0;1;1", "aff_country_unique": "Israel;United States" }, { "title": "Semi-Cyclic Stochastic Gradient Descent", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4282", "id": "4282", "author_site": "Hubert Eichner, Tomer Koren, Brendan McMahan, Nati Srebro, Kunal Talwar", "author": "Hubert Eichner; Tomer Koren; Brendan Mcmahan; Nathan Srebro; Kunal Talwar", "abstract": "We consider convex SGD updates with a block-cyclic structure, i.e., where each cycle consists of a small number of blocks, each with many samples from a possibly different, block-specific, distribution. This situation arises, e.g., in Federated Learning where the mobile devices available for updates at different times during the day have different characteristics. 
We show that such block-cyclic structure can significantly deteriorate the performance of SGD, but propose a simple approach that allows prediction with the same guarantees as for i.i.d., non-cyclic, sampling.", "bibtex": "@InProceedings{pmlr-v97-eichner19a,\n title = \t {Semi-Cyclic Stochastic Gradient Descent},\n author = {Eichner, Hubert and Koren, Tomer and Mcmahan, Brendan and Srebro, Nathan and Talwar, Kunal},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1764--1773},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/eichner19a/eichner19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/eichner19a.html},\n abstract = \t {We consider convex SGD updates with a block-cyclic structure, i.e., where each cycle consists of a small number of blocks, each with many samples from a possibly different, block-specific, distribution. This situation arises, e.g., in Federated Learning where the mobile devices available for updates at different times during the day have different characteristics. We show that such block-cyclic structure can significantly deteriorate the performance of SGD, but propose a simple approach that allows prediction with the same guarantees as for i.i.d., non-cyclic, sampling.}\n}", "pdf": "http://proceedings.mlr.press/v97/eichner19a/eichner19a.pdf", "supp": "", "pdf_size": 342093, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13511493615549582385&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Google; Google; Google; Toyota Technological Institute at Chicago; Google", "aff_domain": "google.com;google.com;google.com;ttic.edu;google.com", "email": "google.com;google.com;google.com;ttic.edu;google.com", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/eichner19a.html", "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Google;Toyota Technological Institute at Chicago", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.tti-chicago.org", "aff_unique_abbr": "Google;TTI Chicago", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Mountain View;Chicago", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Sensitivity Analysis of Linear Structural Causal Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4243", "id": "4243", "author_site": "Carlos Cinelli, Daniel Kumor, Bryant Chen, Judea Pearl, Elias Bareinboim", "author": "Carlos Cinelli; Daniel Kumor; Bryant Chen; Judea Pearl; Elias Bareinboim", "abstract": "Causal inference requires assumptions about the data generating process, many of which are unverifiable from the data. Given that some causal assumptions might be uncertain or disputed, formal methods are needed to quantify how sensitive research conclusions are to violations of those assumptions. Although an extensive literature exists on the topic, most results are limited to specific model structures, while a general-purpose algorithmic framework for sensitivity analysis is still lacking. In this paper, we develop a formal, systematic approach to sensitivity analysis for arbitrary linear Structural Causal Models (SCMs). 
We start by formalizing sensitivity analysis as a constrained identification problem. We then develop an efficient, graph-based identification algorithm that exploits non-zero constraints on both directed and bidirected edges. This allows researchers to systematically derive sensitivity curves for a target causal quantity with an arbitrary set of path coefficients and error covariances as sensitivity parameters. These results can be used to display the degree to which violations of causal assumptions affect the target quantity of interest, and to judge, on scientific grounds, whether problematic degrees of violations are plausible.", "bibtex": "@InProceedings{pmlr-v97-cinelli19a,\n title = \t {Sensitivity Analysis of Linear Structural Causal Models},\n author = {Cinelli, Carlos and Kumor, Daniel and Chen, Bryant and Pearl, Judea and Bareinboim, Elias},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1252--1261},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cinelli19a/cinelli19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cinelli19a.html},\n abstract = \t {Causal inference requires assumptions about the data generating process, many of which are unverifiable from the data. Given that some causal assumptions might be uncertain or disputed, formal methods are needed to quantify how sensitive research conclusions are to violations of those assumptions. Although an extensive literature exists on the topic, most results are limited to specific model structures, while a general-purpose algorithmic framework for sensitivity analysis is still lacking. In this paper, we develop a formal, systematic approach to sensitivity analysis for arbitrary linear Structural Causal Models (SCMs). We start by formalizing sensitivity analysis as a constrained identification problem. We then develop an efficient, graph-based identification algorithm that exploits non-zero constraints on both directed and bidirected edges. This allows researchers to systematically derive sensitivity curves for a target causal quantity with an arbitrary set of path coefficients and error covariances as sensitivity parameters. These results can be used to display the degree to which violations of causal assumptions affect the target quantity of interest, and to judge, on scientific grounds, whether problematic degrees of violations are plausible.}\n}", "pdf": "http://proceedings.mlr.press/v97/cinelli19a/cinelli19a.pdf", "supp": "", "pdf_size": 360635, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12897158713362656935&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Depts. of Statistics and Computer Science, University of California, Los Angeles, California, USA; Dept. of Computer Science, Purdue University, West Lafayette, IN, USA; Brex, San Francisco, CA, USA; Depts. of Statistics and Computer Science, University of California, Los Angeles, California, USA; Dept. 
of Computer Science, Purdue University, West Lafayette, IN, USA", "aff_domain": "ucla.edu; ; ; ; ", "email": "ucla.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/cinelli19a.html", "aff_unique_index": "0;1;2;0;1", "aff_unique_norm": "University of California, Los Angeles;Purdue University;Brex", "aff_unique_dep": "Department of Statistics and Computer Science;Department of Computer Science;", "aff_unique_url": "https://www.ucla.edu;https://www.purdue.edu;https://www.brex.com", "aff_unique_abbr": "UCLA;Purdue;", "aff_campus_unique_index": "0;1;2;0;1", "aff_campus_unique": "Los Angeles;West Lafayette;San Francisco", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Separating value functions across time-scales", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4227", "id": "4227", "author_site": "Joshua Romoff, Peter Henderson, Ahmed Touati, Yann Ollivier, Joelle Pineau, Emma Brunskill", "author": "Joshua Romoff; Peter Henderson; Ahmed Touati; Emma Brunskill; Joelle Pineau; Yann Ollivier", "abstract": "In many finite horizon episodic reinforcement learning (RL) settings, it is desirable to optimize for the undiscounted return - in settings like Atari, for instance, the goal is to collect the most points while staying alive in the long run. Yet, it may be difficult (or even intractable) mathematically to learn with this target. As such, temporal discounting is often applied to optimize over a shorter effective planning horizon. This comes at the cost of potentially biasing the optimization target away from the undiscounted goal. In settings where this bias is unacceptable - where the system must optimize for longer horizons at higher discounts - the target of the value function approximator may increase in variance leading to difficulties in learning. We present an extension of temporal difference (TD) learning, which we call TD($\\Delta$), that breaks down a value function into a series of components based on the differences between value functions with smaller discount factors. The separation of a longer horizon value function into these components has useful properties in scalability and performance. We discuss these properties and show theoretic and empirical improvements over standard TD learning in certain settings.", "bibtex": "@InProceedings{pmlr-v97-romoff19a,\n title = \t {Separating value functions across time-scales},\n author = {Romoff, Joshua and Henderson, Peter and Touati, Ahmed and Brunskill, Emma and Pineau, Joelle and Ollivier, Yann},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5468--5477},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/romoff19a/romoff19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/romoff19a.html},\n abstract = \t {In many finite horizon episodic reinforcement learning (RL) settings, it is desirable to optimize for the undiscounted return - in settings like Atari, for instance, the goal is to collect the most points while staying alive in the long run. Yet, it may be difficult (or even intractable) mathematically to learn with this target. As such, temporal discounting is often applied to optimize over a shorter effective planning horizon. 
This comes at the cost of potentially biasing the optimization target away from the undiscounted goal. In settings where this bias is unacceptable - where the system must optimize for longer horizons at higher discounts - the target of the value function approximator may increase in variance leading to difficulties in learning. We present an extension of temporal difference (TD) learning, which we call TD($\\Delta$), that breaks down a value function into a series of components based on the differences between value functions with smaller discount factors. The separation of a longer horizon value function into these components has useful properties in scalability and performance. We discuss these properties and show theoretic and empirical improvements over standard TD learning in certain settings.}\n}", "pdf": "http://proceedings.mlr.press/v97/romoff19a/romoff19a.pdf", "supp": "", "pdf_size": 1151061, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4770640199000017982&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "MILA, McGill University + Facebook AI Research; Facebook AI Research; MILA, Universit \u00b4e de Montr \u00b4eal; Stanford University; MILA, McGill University + Facebook AI Research; Facebook AI Research", "aff_domain": "mail.mcgill.ca;stanford.edu; ; ; ; ", "email": "mail.mcgill.ca;stanford.edu; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/romoff19a.html", "aff_unique_index": "0+1;1;2;3;0+1;1", "aff_unique_norm": "McGill University;Meta;Universit\u00e9 de Montr\u00e9al;Stanford University", "aff_unique_dep": "MILA;Facebook AI Research;MILA;", "aff_unique_url": "https://www.mcgill.ca;https://research.facebook.com;https://www.mila.quebec;https://www.stanford.edu", "aff_unique_abbr": "McGill;FAIR;MILA;Stanford", "aff_campus_unique_index": "0;2;3;0", "aff_campus_unique": "Montreal;;Montr\u00e9al;Stanford", "aff_country_unique_index": "0+1;1;0;1;0+1;1", "aff_country_unique": "Canada;United States" }, { "title": "Sequential Facility Location: Approximate Submodularity and Greedy Algorithm", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3609", "id": "3609", "author": "Ehsan Elhamifar", "abstract": "We develop and analyze a novel utility function and a fast optimization algorithm for subset selection in sequential data that incorporates the dynamic model of data. We propose a cardinality-constrained sequential facility location function that finds a fixed number of representatives, where the sequence of representatives is compatible with the dynamic model and well encodes the data. As maximizing this new objective function is NP-hard, we develop a fast greedy algorithm based on submodular maximization. Unlike the conventional facility location, the computation of the marginal gain in our case cannot be done by operations on each item independently. We exploit the sequential structure of the problem and develop an efficient dynamic programming-based algorithm that computes the marginal gain exactly. We investigate conditions on the dynamic model, under which our utility function is ($\\epsilon$-approximately) submodualr, hence, the greedy algorithm comes with performance guarantees. 
By experiments on synthetic data and the problem of procedure learning from instructional videos, we show that our framework significantly improves the computational time, achieves better objective function values and obtains more coherent summaries.", "bibtex": "@InProceedings{pmlr-v97-elhamifar19a,\n title = \t {Sequential Facility Location: Approximate Submodularity and Greedy Algorithm},\n author = {Elhamifar, Ehsan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1784--1793},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/elhamifar19a/elhamifar19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/elhamifar19a.html},\n abstract = \t {We develop and analyze a novel utility function and a fast optimization algorithm for subset selection in sequential data that incorporates the dynamic model of data. We propose a cardinality-constrained sequential facility location function that finds a fixed number of representatives, where the sequence of representatives is compatible with the dynamic model and well encodes the data. As maximizing this new objective function is NP-hard, we develop a fast greedy algorithm based on submodular maximization. Unlike the conventional facility location, the computation of the marginal gain in our case cannot be done by operations on each item independently. We exploit the sequential structure of the problem and develop an efficient dynamic programming-based algorithm that computes the marginal gain exactly. We investigate conditions on the dynamic model, under which our utility function is ($\\epsilon$-approximately) submodualr, hence, the greedy algorithm comes with performance guarantees. 
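For context on the greedy procedure referenced in the "Sequential Facility Location" abstract above, the sketch below shows only the generic cardinality-constrained greedy maximization of a facility-location objective; the paper's sequential variant additionally enforces compatibility with a dynamic model and computes marginal gains with dynamic programming, which this toy example omits.

```python
import numpy as np

def greedy_facility_location(sim, k):
    """Greedy maximization of F(S) = sum_i max_{j in S} sim[i, j] with |S| <= k (sim >= 0)."""
    selected = []
    best_cover = np.zeros(sim.shape[0])                  # current best similarity per data point
    for _ in range(k):
        gains = np.maximum(sim, best_cover[:, None]).sum(axis=0) - best_cover.sum()
        if selected:
            gains[selected] = -np.inf                    # never reselect a representative
        j = int(np.argmax(gains))
        selected.append(j)
        best_cover = np.maximum(best_cover, sim[:, j])
    return selected

rng = np.random.default_rng(0)
pts = rng.normal(size=(30, 2))
dist = np.linalg.norm(pts[:, None, :] - pts[None, :, :], axis=-1)
sim = np.exp(-dist ** 2)                                 # nonnegative similarity kernel
print(greedy_facility_location(sim, k=3))
```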
By experiments on synthetic data and the problem of procedure learning from instructional videos, we show that our framework significantly improves the computational time, achieves better objective function values and obtains more coherent summaries.}\n}", "pdf": "http://proceedings.mlr.press/v97/elhamifar19a/elhamifar19a.pdf", "supp": "", "pdf_size": 555861, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16994637402224058324&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Assistant Professor, Khoury College of Computer Sciences, Northeastern University, Boston, MA, USA", "aff_domain": "northeastern.edu", "email": "northeastern.edu", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/elhamifar19a.html", "aff_unique_index": "0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "Khoury College of Computer Sciences", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "0", "aff_campus_unique": "Boston", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Set Transformer: A Framework for Attention-based Permutation-Invariant Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3716", "id": "3716", "author_site": "Juho Lee, Yoonho Lee, Jungtaek Kim, Adam Kosiorek, Seungjin Choi, Yee-Whye Teh", "author": "Juho Lee; Yoonho Lee; Jungtaek Kim; Adam Kosiorek; Seungjin Choi; Yee Whye Teh", "abstract": "Many machine learning tasks such as multiple instance learning, 3D shape recognition, and few-shot image classification are defined on sets of instances. Since solutions to such problems do not depend on the order of elements of the set, models used to address them should be permutation invariant. We present an attention-based neural network module, the Set Transformer, specifically designed to model interactions among elements in the input set. The model consists of an encoder and a decoder, both of which rely on attention mechanisms. In an effort to reduce computational complexity, we introduce an attention scheme inspired by inducing point methods from sparse Gaussian process literature. It reduces the computation time of self-attention from quadratic to linear in the number of elements in the set. We show that our model is theoretically attractive and we evaluate it on a range of tasks, demonstrating the state-of-the-art performance compared to recent methods for set-structured data.", "bibtex": "@InProceedings{pmlr-v97-lee19d,\n title = \t {Set Transformer: A Framework for Attention-based Permutation-Invariant Neural Networks},\n author = {Lee, Juho and Lee, Yoonho and Kim, Jungtaek and Kosiorek, Adam and Choi, Seungjin and Teh, Yee Whye},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3744--3753},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lee19d/lee19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/lee19d.html},\n abstract = \t {Many machine learning tasks such as multiple instance learning, 3D shape recognition, and few-shot image classification are defined on sets of instances. 
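The inducing-point attention scheme described in the Set Transformer abstract above routes attention through m learned summary points so that no n-by-n score matrix is ever formed. A stripped-down, single-head numpy sketch of that idea (the real block also uses query/key/value projections, residuals, layer norm, and feed-forward sublayers; the random inducing points here are placeholders for learned parameters) is:

```python
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def attend(Q, K, V):
    """Scaled dot-product attention: each row of Q attends over the rows of K/V."""
    return softmax(Q @ K.T / np.sqrt(Q.shape[1]), axis=-1) @ V

def induced_attention(X, inducing):
    """Two passes through m inducing points: O(n*m) score entries instead of O(n^2)."""
    H = attend(inducing, X, X)     # (m, d): inducing points summarize the whole set
    return attend(X, H, H)         # (n, d): every element attends to the summary

rng = np.random.default_rng(0)
n, m, d = 100, 8, 16
X = rng.normal(size=(n, d))              # a set of n elements
inducing = rng.normal(size=(m, d))       # stands in for learned inducing points
print(induced_attention(X, inducing).shape)   # (100, 16)
```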
Since solutions to such problems do not depend on the order of elements of the set, models used to address them should be permutation invariant. We present an attention-based neural network module, the Set Transformer, specifically designed to model interactions among elements in the input set. The model consists of an encoder and a decoder, both of which rely on attention mechanisms. In an effort to reduce computational complexity, we introduce an attention scheme inspired by inducing point methods from sparse Gaussian process literature. It reduces the computation time of self-attention from quadratic to linear in the number of elements in the set. We show that our model is theoretically attractive and we evaluate it on a range of tasks, demonstrating the state-of-the-art performance compared to recent methods for set-structured data.}\n}", "pdf": "http://proceedings.mlr.press/v97/lee19d/lee19d.pdf", "supp": "", "pdf_size": 4877237, "gs_citation": 1601, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=564620061424738263&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/lee19d.html" }, { "title": "Sever: A Robust Meta-Algorithm for Stochastic Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3572", "id": "3572", "author_site": "Ilias Diakonikolas, Gautam Kamath, Daniel Kane, Jerry Li, Jacob Steinhardt, Alistair Stewart", "author": "Ilias Diakonikolas; Gautam Kamath; Daniel Kane; Jerry Li; Jacob Steinhardt; Alistair Stewart", "abstract": "In high dimensions, most machine learning methods are brittle to even a small fraction of structured outliers. To address this, we introduce a new meta-algorithm that can take in a base learner such as least squares or stochastic gradient descent, and harden the learner to be resistant to outliers. Our method, Sever, possesses strong theoretical guarantees yet is also highly scalable \u2013 beyond running the base learner itself, it only requires computing the top singular vector of a certain n{\\texttimes}d matrix. We apply Sever on a drug design dataset and a spam classification dataset, and find that in both cases it has substantially greater robustness than several baselines. On the spam dataset, with 1% corruptions, we achieved 7.4% test error, compared to 13.4%-20.5% for the baselines, and 3% error on the uncorrupted dataset. 
Similarly, on the drug design dataset, with 10% corruptions, we achieved 1.42 mean-squared error test error, compared to 1.51-2.33 for the baselines, and 1.23 error on the uncorrupted dataset.", "bibtex": "@InProceedings{pmlr-v97-diakonikolas19a,\n title = \t {Sever: A Robust Meta-Algorithm for Stochastic Optimization},\n author = {Diakonikolas, Ilias and Kamath, Gautam and Kane, Daniel and Li, Jerry and Steinhardt, Jacob and Stewart, Alistair},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1596--1606},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/diakonikolas19a/diakonikolas19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/diakonikolas19a.html},\n abstract = \t {In high dimensions, most machine learning methods are brittle to even a small fraction of structured outliers. To address this, we introduce a new meta-algorithm that can take in a base learner such as least squares or stochastic gradient descent, and harden the learner to be resistant to outliers. Our method, Sever, possesses strong theoretical guarantees yet is also highly scalable \u2013 beyond running the base learner itself, it only requires computing the top singular vector of a certain n{\\texttimes}d matrix. We apply Sever on a drug design dataset and a spam classification dataset, and find that in both cases it has substantially greater robustness than several baselines. On the spam dataset, with 1% corruptions, we achieved 7.4% test error, compared to 13.4%-20.5% for the baselines, and 3% error on the uncorrupted dataset. 
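The Sever abstract above says the meta-algorithm only needs the base learner plus the top singular vector of an n-by-d matrix of per-point gradients. A simplified numpy sketch of that filtering loop for least squares (a toy instantiation under assumed hyperparameters, not the paper's full algorithm or guarantees) is:

```python
import numpy as np

def sever_least_squares(X, y, rounds=4, filter_frac=0.05):
    """Fit, compute per-point gradients, and drop points with the largest projection
    onto the top singular direction of the centered gradient matrix; repeat."""
    active = np.arange(len(y))
    theta = None
    for _ in range(rounds):
        Xa, ya = X[active], y[active]
        theta, *_ = np.linalg.lstsq(Xa, ya, rcond=None)
        grads = 2.0 * (Xa @ theta - ya)[:, None] * Xa            # gradients of the squared loss
        centered = grads - grads.mean(axis=0)
        _, _, Vt = np.linalg.svd(centered, full_matrices=False)
        scores = (centered @ Vt[0]) ** 2                          # outlier score along top direction
        keep = scores <= np.quantile(scores, 1.0 - filter_frac)   # remove the most extreme fraction
        active = active[keep]
    return theta

# Toy data: clean linear responses plus 5% points with corrupted labels.
rng = np.random.default_rng(0)
n, d = 500, 5
X = rng.normal(size=(n, d))
theta_true = rng.normal(size=d)
y = X @ theta_true + 0.1 * rng.normal(size=n)
y[:25] += 20.0
print(np.linalg.norm(sever_least_squares(X, y) - theta_true))
```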
Similarly, on the drug design dataset, with 10% corruptions, we achieved 1.42 mean-squared error test error, compared to 1.51-2.33 for the baselines, and 1.23 error on the uncorrupted dataset.}\n}", "pdf": "http://proceedings.mlr.press/v97/diakonikolas19a/diakonikolas19a.pdf", "supp": "", "pdf_size": 382055, "gs_citation": 355, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1735563344640957243&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "Department of Computer Science, University of Southern California, Los Angeles, California, USA; Simons Institute for the Theory of Computing, Berkeley, California, USA; Departments of Mathematics and Computer Science and Engineering, University of California, San Diego, California, USA; Microsoft Research AI, Redmond, Washington, USA; Department of Statistics, University of California, Berkeley, California, USA; Web3 Foundation, Zug, Switzerland", "aff_domain": "usc.edu;csail.mit.edu;cs.ucsd.edu;microsoft.com;berkeley.edu;gmail.com", "email": "usc.edu;csail.mit.edu;cs.ucsd.edu;microsoft.com;berkeley.edu;gmail.com", "github": "https://github.com/hoonose/severberg", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/diakonikolas19a.html", "aff_unique_index": "0;1;2;3;4;5", "aff_unique_norm": "University of Southern California;Simons Institute for the Theory of Computing;University of California, San Diego;Microsoft;University of California, Berkeley;Web3 Foundation", "aff_unique_dep": "Department of Computer Science;;Departments of Mathematics and Computer Science and Engineering;AI;Department of Statistics;", "aff_unique_url": "https://www.usc.edu;https://simons.berkeley.edu;https://www.ucsd.edu;https://www.microsoft.com/en-us/research/group/ai;https://www.berkeley.edu;https://web3.foundation", "aff_unique_abbr": "USC;SITC;UCSD;MSR AI;UC Berkeley;Web3F", "aff_campus_unique_index": "0;1;2;3;1", "aff_campus_unique": "Los Angeles;Berkeley;San Diego;Redmond;", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "United States;Switzerland" }, { "title": "Shallow-Deep Networks: Understanding and Mitigating Network Overthinking", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3840", "id": "3840", "author_site": "Yigitcan Kaya, Sanghyun Hong, Tudor Dumitras", "author": "Yigitcan Kaya; Sanghyun Hong; Tudor Dumitras", "abstract": "We characterize a prevalent weakness of deep neural networks (DNNs), \u2019overthinking\u2019, which occurs when a DNN can reach correct predictions before its final layer. Overthinking is computationally wasteful, and it can also be destructive when, by the final layer, a correct prediction changes into a misclassification. Understanding overthinking requires studying how each prediction evolves during a DNN\u2019s forward pass, which conventionally is opaque. For prediction transparency, we propose the Shallow-Deep Network (SDN), a generic modification to off-the-shelf DNNs that introduces internal classifiers. We apply SDN to four modern architectures, trained on three image classification tasks, to characterize the overthinking problem. We show that SDNs can mitigate the wasteful effect of overthinking with confidence-based early exits, which reduce the average inference cost by more than 50% and preserve the accuracy. We also find that the destructive effect occurs for 50% of misclassifications on natural inputs and that it can be induced, adversarially, with a recent backdooring attack. 
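The confidence-based early exits mentioned in the Shallow-Deep Networks abstract above amount to stopping the forward pass at the first internal classifier that is confident enough. A toy sketch of that exit rule (the softmax vectors below are made-up placeholders for the outputs of internal classifiers) is:

```python
import numpy as np

def early_exit(internal_probs, threshold=0.9):
    """Return (exit depth, predicted class) for the first internal classifier whose
    top softmax probability clears the threshold, else fall through to the final one."""
    for depth, probs in enumerate(internal_probs):
        if probs.max() >= threshold:
            return depth, int(probs.argmax())
    return len(internal_probs) - 1, int(internal_probs[-1].argmax())

# Toy forward pass: three internal classifiers plus the final classifier, 5 classes.
outputs = [
    np.array([0.30, 0.25, 0.20, 0.15, 0.10]),   # early layer: unsure
    np.array([0.55, 0.20, 0.10, 0.10, 0.05]),   # more confident, still below threshold
    np.array([0.93, 0.03, 0.02, 0.01, 0.01]),   # confident -> exit here, skip deeper layers
    np.array([0.90, 0.04, 0.03, 0.02, 0.01]),
]
print(early_exit(outputs, threshold=0.9))        # (2, 0): exits at the third classifier
```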
To mitigate this effect, we propose a new confusion metric to quantify the internal disagreements that will likely to lead to misclassifications.", "bibtex": "@InProceedings{pmlr-v97-kaya19a,\n title = \t {Shallow-Deep Networks: Understanding and Mitigating Network Overthinking},\n author = {Kaya, Yigitcan and Hong, Sanghyun and Dumitras, Tudor},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3301--3310},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kaya19a/kaya19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kaya19a.html},\n abstract = \t {We characterize a prevalent weakness of deep neural networks (DNNs), \u2019overthinking\u2019, which occurs when a DNN can reach correct predictions before its final layer. Overthinking is computationally wasteful, and it can also be destructive when, by the final layer, a correct prediction changes into a misclassification. Understanding overthinking requires studying how each prediction evolves during a DNN\u2019s forward pass, which conventionally is opaque. For prediction transparency, we propose the Shallow-Deep Network (SDN), a generic modification to off-the-shelf DNNs that introduces internal classifiers. We apply SDN to four modern architectures, trained on three image classification tasks, to characterize the overthinking problem. We show that SDNs can mitigate the wasteful effect of overthinking with confidence-based early exits, which reduce the average inference cost by more than 50% and preserve the accuracy. We also find that the destructive effect occurs for 50% of misclassifications on natural inputs and that it can be induced, adversarially, with a recent backdooring attack. To mitigate this effect, we propose a new confusion metric to quantify the internal disagreements that will likely to lead to misclassifications.}\n}", "pdf": "http://proceedings.mlr.press/v97/kaya19a/kaya19a.pdf", "supp": "", "pdf_size": 327462, "gs_citation": 368, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6970216830123198900&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff": "University of Maryland, Maryland, USA; University of Maryland, Maryland, USA; University of Maryland, Maryland, USA", "aff_domain": "umiacs.umd.edu; ; ", "email": "umiacs.umd.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/kaya19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "", "aff_unique_url": "https://www/umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Maryland", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Shape Constraints for Set Functions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4160", "id": "4160", "author_site": "Andrew Cotter, Maya Gupta, Heinrich Jiang, Erez Louidor, James Muller, Taman Narayan, Serena Wang, Tao Zhu", "author": "Andrew Cotter; Maya Gupta; Heinrich Jiang; Erez Louidor; James Muller; Tamann Narayan; Serena Wang; Tao Zhu", "abstract": "Set functions predict a label from a permutation-invariant variable-size collection of feature vectors. 
We propose making set functions more understandable and regularized by capturing domain knowledge through shape constraints. We show how prior work in monotonic constraints can be adapted to set functions, and then propose two new shape constraints designed to generalize the conditioning role of weights in a weighted mean. We show how one can train standard functions and set functions that satisfy these shape constraints with a deep lattice network. We propose a nonlinear estimation strategy we call the semantic feature engine that uses set functions with the proposed shape constraints to estimate labels for compound sparse categorical features. Experiments on real-world data show the achieved accuracy is similar to deep sets or deep neural networks, but provides guarantees on the model behavior, which makes it easier to explain and debug.", "bibtex": "@InProceedings{pmlr-v97-cotter19a,\n title = \t {Shape Constraints for Set Functions},\n author = {Cotter, Andrew and Gupta, Maya and Jiang, Heinrich and Louidor, Erez and Muller, James and Narayan, Tamann and Wang, Serena and Zhu, Tao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1388--1396},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cotter19a/cotter19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/cotter19a.html},\n abstract = \t {Set functions predict a label from a permutation-invariant variable-size collection of feature vectors. We propose making set functions more understandable and regularized by capturing domain knowledge through shape constraints. We show how prior work in monotonic constraints can be adapted to set functions, and then propose two new shape constraints designed to generalize the conditioning role of weights in a weighted mean. We show how one can train standard functions and set functions that satisfy these shape constraints with a deep lattice network. We propose a nonlinear estimation strategy we call the semantic feature engine that uses set functions with the proposed shape constraints to estimate labels for compound sparse categorical features. 
Experiments on real-world data show the achieved accuracy is similar to deep sets or deep neural networks, but provides guarantees on the model behavior, which makes it easier to explain and debug.}\n}", "pdf": "http://proceedings.mlr.press/v97/cotter19a/cotter19a.pdf", "supp": "", "pdf_size": 982249, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6179408537802481998&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Google AI, Mountain View, CA; Google AI, Mountain View, CA; Google AI, Mountain View, CA; Google AI, Mountain View, CA; Google AI, Mountain View, CA; Google AI, Mountain View, CA; Google AI, Mountain View, CA; Google AI, Mountain View, CA", "aff_domain": "google.com; ; ; ; ; ; ;google.com", "email": "google.com; ; ; ; ; ; ;google.com", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/cotter19a.html", "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google AI", "aff_unique_url": "https://ai.google", "aff_unique_abbr": "Google AI", "aff_campus_unique_index": "0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Similarity of Neural Network Representations Revisited", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4185", "id": "4185", "author_site": "Simon Kornblith, Mohammad Norouzi, Honglak Lee, Geoffrey Hinton", "author": "Simon Kornblith; Mohammad Norouzi; Honglak Lee; Geoffrey Hinton", "abstract": "Recent work has sought to understand the behavior of neural networks by comparing representations between layers and between different trained models. We examine methods for comparing neural network representations based on canonical correlation analysis (CCA). We show that CCA belongs to a family of statistics for measuring multivariate similarity, but that neither CCA nor any other statistic that is invariant to invertible linear transformation can measure meaningful similarities between representations of higher dimension than the number of data points. We introduce a similarity index that measures the relationship between representational similarity matrices and does not suffer from this limitation. This similarity index is equivalent to centered kernel alignment (CKA) and is also closely connected to CCA. Unlike CCA, CKA can reliably identify correspondences between representations in networks trained from different initializations.", "bibtex": "@InProceedings{pmlr-v97-kornblith19a,\n title = \t {Similarity of Neural Network Representations Revisited},\n author = {Kornblith, Simon and Norouzi, Mohammad and Lee, Honglak and Hinton, Geoffrey},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3519--3529},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kornblith19a/kornblith19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kornblith19a.html},\n abstract = \t {Recent work has sought to understand the behavior of neural networks by comparing representations between layers and between different trained models. We examine methods for comparing neural network representations based on canonical correlation analysis (CCA). 
We show that CCA belongs to a family of statistics for measuring multivariate similarity, but that neither CCA nor any other statistic that is invariant to invertible linear transformation can measure meaningful similarities between representations of higher dimension than the number of data points. We introduce a similarity index that measures the relationship between representational similarity matrices and does not suffer from this limitation. This similarity index is equivalent to centered kernel alignment (CKA) and is also closely connected to CCA. Unlike CCA, CKA can reliably identify correspondences between representations in networks trained from different initializations.}\n}", "pdf": "http://proceedings.mlr.press/v97/kornblith19a/kornblith19a.pdf", "supp": "", "pdf_size": 1479027, "gs_citation": 1704, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18357214027033370010&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "aff": "Google Brain; Google Brain; Google Brain; Google Brain", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/kornblith19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Brain", "aff_unique_url": "https://brain.google.com", "aff_unique_abbr": "Google Brain", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Simple Black-box Adversarial Attacks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3754", "id": "3754", "author_site": "Chuan Guo, Jacob Gardner, Yurong You, Andrew Wilson, Kilian Weinberger", "author": "Chuan Guo; Jacob Gardner; Yurong You; Andrew Gordon Wilson; Kilian Weinberger", "abstract": "We propose an intriguingly simple method for the construction of adversarial images in the black-box setting. In contrast to the white-box scenario, constructing black-box adversarial images has the additional constraint on query budget, and efficient attacks remain an open problem to date. With only the mild assumption of requiring continuous-valued confidence scores, our highly query-efficient algorithm utilizes the following simple iterative principle: we randomly sample a vector from a predefined orthonormal basis and either add or subtract it to the target image. Despite its simplicity, the proposed method can be used for both untargeted and targeted attacks \u2013 resulting in previously unprecedented query efficiency in both settings. We demonstrate the efficacy and efficiency of our algorithm on several real world settings including the Google Cloud Vision API.
We argue that our proposed algorithm should serve as a strong baseline for future black-box attacks, in particular because it is extremely fast and its implementation requires less than 20 lines of PyTorch code.", "bibtex": "@InProceedings{pmlr-v97-guo19a,\n title = \t {Simple Black-box Adversarial Attacks},\n author = {Guo, Chuan and Gardner, Jacob and You, Yurong and Wilson, Andrew Gordon and Weinberger, Kilian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2484--2493},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/guo19a/guo19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/guo19a.html},\n abstract = \t {We propose an intriguingly simple method for the construction of adversarial images in the black-box setting. In contrast to the white-box scenario, constructing black-box adversarial images has the additional constraint on query budget, and efficient attacks remain an open problem to date. With only the mild assumption of requiring continuous-valued confidence scores, our highly query-efficient algorithm utilizes the following simple iterative principle: we randomly sample a vector from a predefined orthonormal basis and either add or subtract it to the target image. Despite its simplicity, the proposed method can be used for both untargeted and targeted attacks \u2013 resulting in previously unprecedented query efficiency in both settings. We demonstrate the efficacy and efficiency of our algorithm on several real world settings including the Google Cloud Vision API. We argue that our proposed algorithm should serve as a strong baseline for future black-box attacks, in particular because it is extremely fast and its implementation requires less than 20 lines of PyTorch code.}\n}", "pdf": "http://proceedings.mlr.press/v97/guo19a/guo19a.pdf", "supp": "", "pdf_size": 4922600, "gs_citation": 740, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14524309362525785070&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Cornell University, Ithaca, New York, USA; Uber AI Labs, San Francisco, California, USA; Department of Computer Science, Cornell University, Ithaca, New York, USA; Department of Computer Science, Cornell University, Ithaca, New York, USA; Department of Computer Science, Cornell University, Ithaca, New York, USA", "aff_domain": "cornell.edu; ; ; ; ", "email": "cornell.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/guo19a.html", "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Cornell University;Uber", "aff_unique_dep": "Department of Computer Science;Uber AI Labs", "aff_unique_url": "https://www.cornell.edu;https://www.uber.com", "aff_unique_abbr": "Cornell;Uber", "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "Ithaca;San Francisco", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Simple Stochastic Gradient Methods for Non-Smooth Non-Convex Regularized Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3711", "id": "3711", "author_site": "Michael Metel, Akiko Takeda", "author": "Michael Metel; Akiko Takeda", "abstract": "Our work focuses on stochastic gradient methods for optimizing a smooth
non-convex loss function with a non-smooth non-convex regularizer. Research on this class of problem is quite limited, and until recently no non-asymptotic convergence results have been reported. We present two simple stochastic gradient algorithms, for finite-sum and general stochastic optimization problems, which have superior convergence complexities compared to the current state-of-the-art. We also compare our algorithms\u2019 performance in practice for empirical risk minimization.", "bibtex": "@InProceedings{pmlr-v97-metel19a,\n title = \t {Simple Stochastic Gradient Methods for Non-Smooth Non-Convex Regularized Optimization},\n author = {Metel, Michael and Takeda, Akiko},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4537--4545},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/metel19a/metel19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/metel19a.html},\n abstract = \t {Our work focuses on stochastic gradient methods for optimizing a smooth non-convex loss function with a non-smooth non-convex regularizer. Research on this class of problem is quite limited, and until recently no non-asymptotic convergence results have been reported. We present two simple stochastic gradient algorithms, for finite-sum and general stochastic optimization problems, which have superior convergence complexities compared to the current state-of-the-art. We also compare our algorithms\u2019 performance in practice for empirical risk minimization.}\n}", "pdf": "http://proceedings.mlr.press/v97/metel19a/metel19a.pdf", "supp": "", "pdf_size": 345731, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12195563721345585267&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "aff": "RIKEN Center for Advanced Intelligence Project, Tokyo, Japan + Department of Creative Informatics, Graduate School of Information Science and Technology, the University of Tokyo, Tokyo, Japan; RIKEN Center for Advanced Intelligence Project, Tokyo, Japan + Department of Creative Informatics, Graduate School of Information Science and Technology, the University of Tokyo, Tokyo, Japan", "aff_domain": "riken.jp; ", "email": "riken.jp; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/metel19a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "RIKEN Center for Advanced Intelligence Project;University of Tokyo", "aff_unique_dep": "Center for Advanced Intelligence Project;Graduate School of Information Science and Technology, Department of Creative Informatics", "aff_unique_url": "https://www.riken.jp/en/c-aip/;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "RIKEN C-AIP;UTokyo", "aff_campus_unique_index": "0+0;0+0", "aff_campus_unique": "Tokyo", "aff_country_unique_index": "0+0;0+0", "aff_country_unique": "Japan" }, { "title": "Simplifying Graph Convolutional Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3683", "id": "3683", "author_site": "Felix Wu, Amauri Souza, Tianyi Zhang, Christopher Fifty, Tao Yu, Kilian Weinberger", "author": "Felix Wu; Amauri Souza; Tianyi Zhang; Christopher Fifty; Tao Yu; Kilian Weinberger", "abstract": "Graph Convolutional Networks (GCNs) and their variants have experienced significant attention and have become the de 
facto methods for learning graph representations. GCNs derive inspiration primarily from recent deep learning approaches, and as a result, may inherit unnecessary complexity and redundant computation. In this paper, we reduce this excess complexity through successively removing nonlinearities and collapsing weight matrices between consecutive layers. We theoretically analyze the resulting linear model and show that it corresponds to a fixed low-pass filter followed by a linear classifier. Notably, our experimental evaluation demonstrates that these simplifications do not negatively impact accuracy in many downstream applications. Moreover, the resulting model scales to larger datasets, is naturally interpretable, and yields up to two orders of magnitude speedup over FastGCN.", "bibtex": "@InProceedings{pmlr-v97-wu19e,\n title = \t {Simplifying Graph Convolutional Networks},\n author = {Wu, Felix and Souza, Amauri and Zhang, Tianyi and Fifty, Christopher and Yu, Tao and Weinberger, Kilian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6861--6871},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wu19e/wu19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/wu19e.html},\n abstract = \t {Graph Convolutional Networks (GCNs) and their variants have experienced significant attention and have become the de facto methods for learning graph representations. GCNs derive inspiration primarily from recent deep learning approaches, and as a result, may inherit unnecessary complexity and redundant computation. In this paper, we reduce this excess complexity through successively removing nonlinearities and collapsing weight matrices between consecutive layers. We theoretically analyze the resulting linear model and show that it corresponds to a fixed low-pass filter followed by a linear classifier. Notably, our experimental evaluation demonstrates that these simplifications do not negatively impact accuracy in many downstream applications. 
Moreover, the resulting model scales to larger datasets, is naturally interpretable, and yields up to two orders of magnitude speedup over FastGCN.}\n}", "pdf": "http://proceedings.mlr.press/v97/wu19e/wu19e.pdf", "supp": "", "pdf_size": 1121554, "gs_citation": 4182, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17348071344751182786&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Cornell University; Cornell University; Cornell University + Federal Institute of Ceara (Brazil); Cornell University; Cornell University; Cornell University", "aff_domain": "cornell.edu;cornell.edu; ; ; ; ", "email": "cornell.edu;cornell.edu; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/wu19e.html", "aff_unique_index": "0;0;0+1;0;0;0", "aff_unique_norm": "Cornell University;Federal Institute of Ceara", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;", "aff_unique_abbr": "Cornell;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+1;0;0;0", "aff_country_unique": "United States;Brazil" }, { "title": "Sliced-Wasserstein Flows: Nonparametric Generative Modeling via Optimal Transport and Diffusions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3561", "id": "3561", "author_site": "Antoine Liutkus, Umut Simsekli, Szymon Majewski, Alain Durmus, Fabian-Robert St\u00f6ter", "author": "Antoine Liutkus; Umut Simsekli; Szymon Majewski; Alain Durmus; Fabian-Robert St\u00f6ter", "abstract": "By building upon the recent theory that established the connection between implicit generative modeling (IGM) and optimal transport, in this study, we propose a novel parameter-free algorithm for learning the underlying distributions of complicated datasets and sampling from them. The proposed algorithm is based on a functional optimization problem, which aims at finding a measure that is close to the data distribution as much as possible and also expressive enough for generative modeling purposes. We formulate the problem as a gradient flow in the space of probability measures. The connections between gradient flows and stochastic differential equations let us develop a computationally efficient algorithm for solving the optimization problem. We provide formal theoretical analysis where we prove finite-time error guarantees for the proposed algorithm. To the best of our knowledge, the proposed algorithm is the first nonparametric IGM algorithm with explicit theoretical guarantees. 
Our experimental results support our theory and show that our algorithm is able to successfully capture the structure of different types of data distributions.", "bibtex": "@InProceedings{pmlr-v97-liutkus19a,\n title = \t {Sliced-{W}asserstein Flows: Nonparametric Generative Modeling via Optimal Transport and Diffusions},\n author = {Liutkus, Antoine and Simsekli, Umut and Majewski, Szymon and Durmus, Alain and St{\\\"o}ter, Fabian-Robert},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4104--4113},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liutkus19a/liutkus19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/liutkus19a.html},\n abstract = \t {By building upon the recent theory that established the connection between implicit generative modeling (IGM) and optimal transport, in this study, we propose a novel parameter-free algorithm for learning the underlying distributions of complicated datasets and sampling from them. The proposed algorithm is based on a functional optimization problem, which aims at finding a measure that is close to the data distribution as much as possible and also expressive enough for generative modeling purposes. We formulate the problem as a gradient flow in the space of probability measures. The connections between gradient flows and stochastic differential equations let us develop a computationally efficient algorithm for solving the optimization problem. We provide formal theoretical analysis where we prove finite-time error guarantees for the proposed algorithm. To the best of our knowledge, the proposed algorithm is the first nonparametric IGM algorithm with explicit theoretical guarantees. Our experimental results support our theory and show that our algorithm is able to successfully capture the structure of different types of data distributions.}\n}", "pdf": "http://proceedings.mlr.press/v97/liutkus19a/liutkus19a.pdf", "supp": "", "pdf_size": 5472754, "gs_citation": 159, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7685202431169756099&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Inria and LIRMM, Univ. of Montpellier, France; LTCI, T\u00e9l\u00e9com Paristech, Universit\u00e9 Paris-Saclay, Paris, France; Institute of Mathematics, Polish Academy of Sciences, Warsaw, Poland; CNRS, ENS Paris-Saclay, Universit\u00e9 Paris-Saclay, Cachan, France; Inria and LIRMM, Univ.
of Montpellier, France", "aff_domain": "inria.fr;telecom-paristech.fr; ; ;inria.fr", "email": "inria.fr;telecom-paristech.fr; ; ;inria.fr", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/liutkus19a.html", "aff_unique_index": "0;1;2;3;0", "aff_unique_norm": "INRIA;T\u00e9l\u00e9com Paris;Polish Academy of Sciences;CNRS", "aff_unique_dep": ";LTCI;Institute of Mathematics;", "aff_unique_url": "https://www.inria.fr;https://www.telecom-paris.fr;https://www.impan.pl/;https://www.cnrs.fr", "aff_unique_abbr": "Inria;T\u00e9l\u00e9com Paris;PAS;CNRS", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Paris;Warsaw", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "France;Poland" }, { "title": "Social Influence as Intrinsic Motivation for Multi-Agent Deep Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3608", "id": "3608", "author_site": "Natasha Jaques, Angeliki Lazaridou, Edward Hughes, Caglar Gulcehre, Pedro Ortega, DJ Strouse, Joel Z Leibo, Nando de Freitas", "author": "Natasha Jaques; Angeliki Lazaridou; Edward Hughes; Caglar Gulcehre; Pedro Ortega; Dj Strouse; Joel Z. Leibo; Nando De Freitas", "abstract": "We propose a unified mechanism for achieving coordination and communication in Multi-Agent Reinforcement Learning (MARL), through rewarding agents for having causal influence over other agents\u2019 actions. Causal influence is assessed using counterfactual reasoning. At each timestep, an agent simulates alternate actions that it could have taken, and computes their effect on the behavior of other agents. Actions that lead to bigger changes in other agents\u2019 behavior are considered influential and are rewarded. We show that this is equivalent to rewarding agents for having high mutual information between their actions. Empirical results demonstrate that influence leads to enhanced coordination and communication in challenging social dilemma environments, dramatically increasing the learning curves of the deep RL agents, and leading to more meaningful learned communication protocols. The influence rewards for all agents can be computed in a decentralized way by enabling agents to learn a model of other agents using deep neural networks. In contrast, key previous works on emergent communication in the MARL setting were unable to learn diverse policies in a decentralized manner and had to resort to centralized training. Consequently, the influence reward opens up a window of new opportunities for research in this area.", "bibtex": "@InProceedings{pmlr-v97-jaques19a,\n title = \t {Social Influence as Intrinsic Motivation for Multi-Agent Deep Reinforcement Learning},\n author = {Jaques, Natasha and Lazaridou, Angeliki and Hughes, Edward and Gulcehre, Caglar and Ortega, Pedro and Strouse, Dj and Leibo, Joel Z. 
and De Freitas, Nando},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3040--3049},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jaques19a/jaques19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jaques19a.html},\n abstract = \t {We propose a unified mechanism for achieving coordination and communication in Multi-Agent Reinforcement Learning (MARL), through rewarding agents for having causal influence over other agents\u2019 actions. Causal influence is assessed using counterfactual reasoning. At each timestep, an agent simulates alternate actions that it could have taken, and computes their effect on the behavior of other agents. Actions that lead to bigger changes in other agents\u2019 behavior are considered influential and are rewarded. We show that this is equivalent to rewarding agents for having high mutual information between their actions. Empirical results demonstrate that influence leads to enhanced coordination and communication in challenging social dilemma environments, dramatically increasing the learning curves of the deep RL agents, and leading to more meaningful learned communication protocols. The influence rewards for all agents can be computed in a decentralized way by enabling agents to learn a model of other agents using deep neural networks. In contrast, key previous works on emergent communication in the MARL setting were unable to learn diverse policies in a decentralized manner and had to resort to centralized training. Consequently, the influence reward opens up a window of new opportunities for research in this area.}\n}", "pdf": "http://proceedings.mlr.press/v97/jaques19a/jaques19a.pdf", "supp": "", "pdf_size": 669524, "gs_citation": 603, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13693459800833279358&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Media Lab, Massachusetts Institute of Technology, Cambridge, USA+Google DeepMind, London, UK; Google DeepMind, London, UK; Google DeepMind, London, UK; Google DeepMind, London, UK; Google DeepMind, London, UK; Institute for Advanced Study, Princeton University, Princeton, USA; Google DeepMind, London, UK; Google DeepMind, London, UK", "aff_domain": "mit.edu;google.com; ; ; ; ; ; ", "email": "mit.edu;google.com; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/jaques19a.html", "aff_unique_index": "0+1;1;1;1;1;2;1;1", "aff_unique_norm": "Massachusetts Institute of Technology;Google;Princeton University", "aff_unique_dep": "Media Lab;Google DeepMind;Institute for Advanced Study", "aff_unique_url": "https://www.mit.edu;https://deepmind.com;https://www.princeton.edu", "aff_unique_abbr": "MIT;DeepMind;Princeton", "aff_campus_unique_index": "0+1;1;1;1;1;2;1;1", "aff_campus_unique": "Cambridge;London;Princeton", "aff_country_unique_index": "0+1;1;1;1;1;0;1;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Sorting Out Lipschitz Function Approximation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3940", "id": "3940", "author_site": "Cem Anil, James Lucas, Roger Grosse", "author": "Cem Anil; James Lucas; Roger Grosse", "abstract": "Training neural networks under a strict Lipschitz constraint is useful for provable 
adversarial robustness, generalization bounds, interpretable gradients, and Wasserstein distance estimation. By the composition property of Lipschitz functions, it suffices to ensure that each individual affine transformation or nonlinear activation is 1-Lipschitz. The challenge is to do this while maintaining the expressive power. We identify a necessary property for such an architecture: each of the layers must preserve the gradient norm during backpropagation. Based on this, we propose to combine a gradient norm preserving activation function, GroupSort, with norm-constrained weight matrices. We show that norm-constrained GroupSort architectures are universal Lipschitz function approximators. Empirically, we show that norm-constrained GroupSort networks achieve tighter estimates of Wasserstein distance than their ReLU counterparts and can achieve provable adversarial robustness guarantees with little cost to accuracy.", "bibtex": "@InProceedings{pmlr-v97-anil19a,\n title = \t {Sorting Out {L}ipschitz Function Approximation},\n author = {Anil, Cem and Lucas, James and Grosse, Roger},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {291--301},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/anil19a/anil19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/anil19a.html},\n abstract = \t {Training neural networks under a strict Lipschitz constraint is useful for provable adversarial robustness, generalization bounds, interpretable gradients, and Wasserstein distance estimation. By the composition property of Lipschitz functions, it suffices to ensure that each individual affine transformation or nonlinear activation is 1-Lipschitz. The challenge is to do this while maintaining the expressive power. We identify a necessary property for such an architecture: each of the layers must preserve the gradient norm during backpropagation. Based on this, we propose to combine a gradient norm preserving activation function, GroupSort, with norm-constrained weight matrices. We show that norm-constrained GroupSort architectures are universal Lipschitz function approximators. 
Empirically, we show that norm-constrained GroupSort networks achieve tighter estimates of Wasserstein distance than their ReLU counterparts and can achieve provable adversarial robustness guarantees with little cost to accuracy.}\n}", "pdf": "http://proceedings.mlr.press/v97/anil19a/anil19a.pdf", "supp": "", "pdf_size": 1912386, "gs_citation": 407, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6756522330495028804&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, University of Toronto, Toronto, Canada+Vector Institute, Toronto, Canada; Department of Computer Science, University of Toronto, Toronto, Canada+Vector Institute, Toronto, Canada; Department of Computer Science, University of Toronto, Toronto, Canada+Vector Institute, Toronto, Canada", "aff_domain": "mail.utoronto.ca;cs.toronto.edu; ", "email": "mail.utoronto.ca;cs.toronto.edu; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/anil19a.html", "aff_unique_index": "0+1;0+1;0+1", "aff_unique_norm": "University of Toronto;Vector Institute", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai", "aff_unique_abbr": "U of T;Vector Institute", "aff_campus_unique_index": "0+0;0+0;0+0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0+0;0+0;0+0", "aff_country_unique": "Canada" }, { "title": "Sparse Extreme Multi-label Learning with Oracle Property", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3806", "id": "3806", "author_site": "Weiwei Liu, Xiaobo Shen", "author": "Weiwei Liu; Xiaobo Shen", "abstract": "The pioneering work of sparse local embeddings for extreme classification (SLEEC) (Bhatia et al., 2015) has shown great promise in multi-label learning. Unfortunately, the statistical rate of convergence and oracle property of SLEEC are still not well understood. To fill this gap, we present a unified framework for SLEEC with nonconvex penalty. Theoretically, we rigorously prove that our proposed estimator enjoys oracle property (i.e., performs as well as if the underlying model were known beforehand), and obtains a desirable statistical convergence rate. Moreover, we show that under a mild condition on the magnitude of the entries in the underlying model, we are able to obtain an improved convergence rate. Extensive numerical experiments verify our theoretical findings and the superiority of our proposed estimator.", "bibtex": "@InProceedings{pmlr-v97-liu19d,\n title = \t {Sparse Extreme Multi-label Learning with Oracle Property},\n author = {Liu, Weiwei and Shen, Xiaobo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4032--4041},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liu19d/liu19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/liu19d.html},\n abstract = \t {The pioneering work of sparse local embeddings for extreme classification (SLEEC) (Bhatia et al., 2015) has shown great promise in multi-label learning. Unfortunately, the statistical rate of convergence and oracle property of SLEEC are still not well understood. To fill this gap, we present a unified framework for SLEEC with nonconvex penalty. 
Theoretically, we rigorously prove that our proposed estimator enjoys oracle property (i.e., performs as well as if the underlying model were known beforehand), and obtains a desirable statistical convergence rate. Moreover, we show that under a mild condition on the magnitude of the entries in the underlying model, we are able to obtain an improved convergence rate. Extensive numerical experiments verify our theoretical findings and the superiority of our proposed estimator.}\n}", "pdf": "http://proceedings.mlr.press/v97/liu19d/liu19d.pdf", "supp": "", "pdf_size": 333949, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1569585035381333098&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": "School of Computer Science, Wuhan University, Wuhan, China; School of Computer Science and Engineering, Nanjing University of Science and Technology, Nanjing, China", "aff_domain": "gmail.com; ", "email": "gmail.com; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/liu19d.html", "aff_unique_index": "0;1", "aff_unique_norm": "Wuhan University;Nanjing University of Science and Technology", "aff_unique_dep": "School of Computer Science;School of Computer Science and Engineering", "aff_unique_url": "http://www.whu.edu.cn;http://www.nust.edu.cn", "aff_unique_abbr": "WHU;NUST", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Wuhan;Nanjing", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "title": "Sparse Multi-Channel Variational Autoencoder for the Joint Analysis of Heterogeneous Data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4163", "id": "4163", "author_site": "Luigi Antelmi, Nicholas Ayache, Philippe Robert, Marco Lorenzi", "author": "Luigi Antelmi; Nicholas Ayache; Philippe Robert; Marco Lorenzi", "abstract": "Interpretable modeling of heterogeneous data channels is essential in medical applications, for example when jointly analyzing clinical scores and medical images. Variational Autoencoders (VAE) are powerful generative models that learn representations of complex data. The flexibility of VAE may come at the expense of lack of interpretability in describing the joint relationship between heterogeneous data. To tackle this problem, in this work we extend the variational framework of VAE to bring parsimony and interpretability when jointly account for latent relationships across multiple channels. In the latent space, this is achieved by constraining the variational distribution of each channel to a common target prior. Parsimonious latent representations are enforced by variational dropout. Experiments on synthetic data show that our model correctly identifies the prescribed latent dimensions and data relationships across multiple testing scenarios. 
When applied to imaging and clinical data, our method allows to identify the joint effect of age and pathology in describing clinical condition in a large scale clinical cohort.", "bibtex": "@InProceedings{pmlr-v97-antelmi19a,\n title = \t {Sparse Multi-Channel Variational Autoencoder for the Joint Analysis of Heterogeneous Data},\n author = {Antelmi, Luigi and Ayache, Nicholas and Robert, Philippe and Lorenzi, Marco},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {302--311},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/antelmi19a/antelmi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/antelmi19a.html},\n abstract = \t {Interpretable modeling of heterogeneous data channels is essential in medical applications, for example when jointly analyzing clinical scores and medical images. Variational Autoencoders (VAE) are powerful generative models that learn representations of complex data. The flexibility of VAE may come at the expense of lack of interpretability in describing the joint relationship between heterogeneous data. To tackle this problem, in this work we extend the variational framework of VAE to bring parsimony and interpretability when jointly account for latent relationships across multiple channels. In the latent space, this is achieved by constraining the variational distribution of each channel to a common target prior. Parsimonious latent representations are enforced by variational dropout. Experiments on synthetic data show that our model correctly identifies the prescribed latent dimensions and data relationships across multiple testing scenarios. When applied to imaging and clinical data, our method allows to identify the joint effect of age and pathology in describing clinical condition in a large scale clinical cohort.}\n}", "pdf": "http://proceedings.mlr.press/v97/antelmi19a/antelmi19a.pdf", "supp": "", "pdf_size": 0, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14190186910164609470&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/antelmi19a.html" }, { "title": "Spectral Approximate Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3958", "id": "3958", "author_site": "Sejun Park, Eunho Yang, Se-Young Yun, Jinwoo Shin", "author": "Sejun Park; Eunho Yang; Se-Young Yun; Jinwoo Shin", "abstract": "Given a graphical model (GM), computing its partition function is the most essential inference task, but it is computationally intractable in general. To address the issue, iterative approximation algorithms exploring certain local structure/consistency of GM have been investigated as popular choices in practice. However, due to their local/iterative nature, they often output poor approximations or even do not converge, e.g., in low-temperature regimes (hard instances of large parameters). To overcome the limitation, we propose a novel approach utilizing the global spectral feature of GM. 
Our contribution is two-fold: (a) we first propose a fully polynomial-time approximation scheme (FPTAS) for approximating the partition function of GM associating with a low-rank coupling matrix; (b) for general high-rank GMs, we design a spectral mean-field scheme utilizing (a) as a subroutine, where it approximates a high-rank GM into a product of rank-1 GMs for an efficient approximation of the partition function. The proposed algorithm is more robust in its running time and accuracy than prior methods, i.e., neither suffers from the convergence issue nor depends on hard local structures, as demonstrated in our experiments.", "bibtex": "@InProceedings{pmlr-v97-park19c,\n title = \t {Spectral Approximate Inference},\n author = {Park, Sejun and Yang, Eunho and Yun, Se-Young and Shin, Jinwoo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5052--5061},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/park19c/park19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/park19c.html},\n abstract = \t {Given a graphical model (GM), computing its partition function is the most essential inference task, but it is computationally intractable in general. To address the issue, iterative approximation algorithms exploring certain local structure/consistency of GM have been investigated as popular choices in practice. However, due to their local/iterative nature, they often output poor approximations or even do not converge, e.g., in low-temperature regimes (hard instances of large parameters). To overcome the limitation, we propose a novel approach utilizing the global spectral feature of GM. Our contribution is two-fold: (a) we first propose a fully polynomial-time approximation scheme (FPTAS) for approximating the partition function of GM associating with a low-rank coupling matrix; (b) for general high-rank GMs, we design a spectral mean-field scheme utilizing (a) as a subroutine, where it approximates a high-rank GM into a product of rank-1 GMs for an efficient approximation of the partition function. 
The proposed algorithm is more robust in its running time and accuracy than prior methods, i.e., neither suffers from the convergence issue nor depends on hard local structures, as demonstrated in our experiments.}\n}", "pdf": "http://proceedings.mlr.press/v97/park19c/park19c.pdf", "supp": "", "pdf_size": 628886, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10068726506372979214&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "School of Electrical Engineering, KAIST, Daejeon, Korea; School of Computing, KAIST, Daejeon, Korea + Graduate School of AI, KAIST, Daejeon, Korea + AITRICS, Seoul, Korea; Graduate School of AI, KAIST, Daejeon, Korea + Department of Industrial & System Engineering, KAIST, Daejeon, Korea; School of Electrical Engineering, KAIST, Daejeon, Korea + Graduate School of AI, KAIST, Daejeon, Korea + AITRICS, Seoul, Korea", "aff_domain": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "email": "kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/park19c.html", "aff_unique_index": "0;0+0+1;0+0;0+0+1", "aff_unique_norm": "KAIST;AITRICS", "aff_unique_dep": "School of Electrical Engineering;", "aff_unique_url": "https://www.kaist.ac.kr;", "aff_unique_abbr": "KAIST;", "aff_campus_unique_index": "0;0+0+1;0+0;0+0+1", "aff_campus_unique": "Daejeon;Seoul", "aff_country_unique_index": "0;0+0+0;0+0;0+0+0", "aff_country_unique": "South Korea" }, { "title": "Spectral Clustering of Signed Graphs via Matrix Power Means", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3699", "id": "3699", "author_site": "Pedro Mercado, Francesco Tudisco, Matthias Hein", "author": "Pedro Mercado; Francesco Tudisco; Matthias Hein", "abstract": "Signed graphs encode positive (attractive) and negative (repulsive) relations between nodes. We extend spectral clustering to signed graphs via the one-parameter family of Signed Power Mean Laplacians, defined as the matrix power mean of normalized standard and signless Laplacians of positive and negative edges. We provide a thorough analysis of the proposed approach in the setting of a general Stochastic Block Model that includes models such as the Labeled Stochastic Block Model and the Censored Block Model. We show that in expectation the signed power mean Laplacian captures the ground truth clusters under reasonable settings where state-of-the-art approaches fail. Moreover, we prove that the eigenvalues and eigenvector of the signed power mean Laplacian concentrate around their expectation under reasonable conditions in the general Stochastic Block Model. 
Extensive experiments on random graphs and real world datasets confirm the theoretically predicted behaviour of the signed power mean Laplacian and show that it compares favourably with state-of-the-art methods.", "bibtex": "@InProceedings{pmlr-v97-mercado19a,\n title = \t {Spectral Clustering of Signed Graphs via Matrix Power Means},\n author = {Mercado, Pedro and Tudisco, Francesco and Hein, Matthias},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4526--4536},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mercado19a/mercado19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mercado19a.html},\n abstract = \t {Signed graphs encode positive (attractive) and negative (repulsive) relations between nodes. We extend spectral clustering to signed graphs via the one-parameter family of Signed Power Mean Laplacians, defined as the matrix power mean of normalized standard and signless Laplacians of positive and negative edges. We provide a thorough analysis of the proposed approach in the setting of a general Stochastic Block Model that includes models such as the Labeled Stochastic Block Model and the Censored Block Model. We show that in expectation the signed power mean Laplacian captures the ground truth clusters under reasonable settings where state-of-the-art approaches fail. Moreover, we prove that the eigenvalues and eigenvector of the signed power mean Laplacian concentrate around their expectation under reasonable conditions in the general Stochastic Block Model. Extensive experiments on random graphs and real world datasets confirm the theoretically predicted behaviour of the signed power mean Laplacian and show that it compares favourably with state-of-the-art methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/mercado19a/mercado19a.pdf", "supp": "", "pdf_size": 5442645, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13424655042076258465&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Saarland University+University of T\u00fcbingen; University of Strathclyde; University of T\u00fcbingen", "aff_domain": "cs.uni-saarland.de; ; ", "email": "cs.uni-saarland.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mercado19a.html", "aff_unique_index": "0+1;2;1", "aff_unique_norm": "Saarland University;University of T\u00fcbingen;University of Strathclyde", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-saarland.de;https://www.uni-tuebingen.de/;https://www.strath.ac.uk", "aff_unique_abbr": "UdS;Uni T\u00fcbingen;Strathclyde", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0+0;1;0", "aff_country_unique": "Germany;United Kingdom" }, { "title": "Stable and Fair Classification", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4314", "id": "4314", "author_site": "Lingxiao Huang, Nisheeth Vishnoi", "author": "Lingxiao Huang; Nisheeth Vishnoi", "abstract": "In a recent study, Friedler et al. pointed out that several fair classification algorithms are not stable with respect to variations in the training set \u2013 a crucial consideration in several applications. 
Motivated by their work, we study the problem of designing classification algorithms that are both fair and stable. We propose an extended framework based on fair classification algorithms that are formulated as optimization problems, by introducing a stability-focused regularization term. Theoretically, we prove an additional stability guarantee, that was lacking in fair classification algorithms, and also provide an accuracy guarantee for our extended framework. Our accuracy guarantee can be used to inform the selection of the regularization parameter in our framework. We assess the benefits of our approach empirically by extending several fair classification algorithms that are shown to achieve the best balance between fairness and accuracy over the \\textbf{Adult} dataset. Our empirical results show that our extended framework indeed improves the stability at only a slight sacrifice in accuracy.", "bibtex": "@InProceedings{pmlr-v97-huang19e,\n title = \t {Stable and Fair Classification},\n author = {Huang, Lingxiao and Vishnoi, Nisheeth},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2879--2890},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/huang19e/huang19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/huang19e.html},\n abstract = \t {In a recent study, Friedler et al. pointed out that several fair classification algorithms are not stable with respect to variations in the training set \u2013 a crucial consideration in several applications. Motivated by their work, we study the problem of designing classification algorithms that are both fair and stable. We propose an extended framework based on fair classification algorithms that are formulated as optimization problems, by introducing a stability-focused regularization term. Theoretically, we prove an additional stability guarantee, that was lacking in fair classification algorithms, and also provide an accuracy guarantee for our extended framework. Our accuracy guarantee can be used to inform the selection of the regularization parameter in our framework. We assess the benefits of our approach empirically by extending several fair classification algorithms that are shown to achieve the best balance between fairness and accuracy over the \\textbf{Adult} dataset. 
Our empirical results show that our extended framework indeed improves the stability at only a slight sacrifice in accuracy.}\n}", "pdf": "http://proceedings.mlr.press/v97/huang19e/huang19e.pdf", "supp": "", "pdf_size": 440678, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6209492851752994222&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "EPFL, Switzerland; Yale University, USA", "aff_domain": ";yale.edu", "email": ";yale.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/huang19e.html", "aff_unique_index": "0;1", "aff_unique_norm": "EPFL;Yale University", "aff_unique_dep": ";", "aff_unique_url": "https://www.epfl.ch;https://www.yale.edu", "aff_unique_abbr": "EPFL;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Switzerland;United States" }, { "title": "Stable-Predictive Optimistic Counterfactual Regret Minimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3839", "id": "3839", "author_site": "Gabriele Farina, Christian Kroer, Noam Brown, Tuomas Sandholm", "author": "Gabriele Farina; Christian Kroer; Noam Brown; Tuomas Sandholm", "abstract": "The CFR framework has been a powerful tool for solving large-scale extensive-form games in practice. However, the theoretical rate at which past CFR-based algorithms converge to the Nash equilibrium is on the order of $O(T^{-1/2})$, where $T$ is the number of iterations. In contrast, first-order methods can be used to achieve a $O(T^{-1})$ dependence on iterations, yet these methods have been less successful in practice. In this work we present the first CFR variant that breaks the square-root dependence on iterations. By combining and extending recent advances on predictive and stable regret minimizers for the matrix-game setting we show that it is possible to leverage \u201coptimistic\u201d regret minimizers to achieve a $O(T^{-3/4})$ convergence rate within CFR. This is achieved by introducing a new notion of stable-predictivity, and by setting the stability of each counterfactual regret minimizer relative to its location in the decision tree. Experiments show that this method is faster than the original CFR algorithm, although not as fast as newer variants, in spite of their worst-case $O(T^{-1/2})$ dependence on iterations.", "bibtex": "@InProceedings{pmlr-v97-farina19a,\n title = \t {Stable-Predictive Optimistic Counterfactual Regret Minimization},\n author = {Farina, Gabriele and Kroer, Christian and Brown, Noam and Sandholm, Tuomas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1853--1862},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/farina19a/farina19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/farina19a.html},\n abstract = \t {The CFR framework has been a powerful tool for solving large-scale extensive-form games in practice. However, the theoretical rate at which past CFR-based algorithms converge to the Nash equilibrium is on the order of $O(T^{-1/2})$, where $T$ is the number of iterations. In contrast, first-order methods can be used to achieve a $O(T^{-1})$ dependence on iterations, yet these methods have been less successful in practice. 
In this work we present the first CFR variant that breaks the square-root dependence on iterations. By combining and extending recent advances on predictive and stable regret minimizers for the matrix-game setting we show that it is possible to leverage \u201coptimistic\u201d regret minimizers to achieve a $O(T^{-3/4})$ convergence rate within CFR. This is achieved by introducing a new notion of stable-predictivity, and by setting the stability of each counterfactual regret minimizer relative to its location in the decision tree. Experiments show that this method is faster than the original CFR algorithm, although not as fast as newer variants, in spite of their worst-case $O(T^{-1/2})$ dependence on iterations.}\n}", "pdf": "http://proceedings.mlr.press/v97/farina19a/farina19a.pdf", "supp": "", "pdf_size": 392753, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7711798214402478607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Computer Science Department, Carnegie Mellon University, Pittsburgh PA 15213 + Strategic Machine, Inc. + Strategy Robot, Inc. + Optimized Markets, Inc.; IEOR Department, Columbia University, New York NY 10027; Computer Science Department, Carnegie Mellon University, Pittsburgh PA 15213 + Strategic Machine, Inc. + Strategy Robot, Inc. + Optimized Markets, Inc.; Computer Science Department, Carnegie Mellon University, Pittsburgh PA 15213 + Strategic Machine, Inc. + Strategy Robot, Inc. + Optimized Markets, Inc.", "aff_domain": "cs.cmu.edu;columbia.edu;cs.cmu.edu;cs.cmu.edu", "email": "cs.cmu.edu;columbia.edu;cs.cmu.edu;cs.cmu.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/farina19a.html", "aff_unique_index": "0+1+2+3;4;0+1+2+3;0+1+2+3", "aff_unique_norm": "Carnegie Mellon University;Strategic Machine, Inc.;Strategy Robot, Inc.;Optimized Markets, Inc.;Columbia University", "aff_unique_dep": "Computer Science Department;;;;IEOR Department", "aff_unique_url": "https://www.cmu.edu;;;;https://www.columbia.edu", "aff_unique_abbr": "CMU;;;;Columbia", "aff_campus_unique_index": "0;2;0;0", "aff_campus_unique": "Pittsburgh;;New York", "aff_country_unique_index": "0+0+0+0;0;0+0+0+0;0+0+0+0", "aff_country_unique": "United States" }, { "title": "State-Regularized Recurrent Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3704", "id": "3704", "author_site": "Cheng Wang, Mathias Niepert", "author": "Cheng Wang; Mathias Niepert", "abstract": "Recurrent neural networks are a widely used class of neural architectures with two shortcomings. First, it is difficult to understand what exactly they learn. Second, they tend to work poorly on sequences requiring long-term memorization, despite having this capacity in principle. We aim to address both shortcomings with a class of recurrent networks that use a stochastic state transition mechanism between cell applications. This mechanism, which we term state-regularization, makes RNNs transition between a finite set of learnable states. We evaluate state-regularized RNNs on (1) regular languages for the purpose of automata extraction; (2) nonregular languages such as balanced parentheses, palindromes, and the copy task where external memory is required; and (3) real-world sequence learning tasks for sentiment analysis, visual object recognition, and language modeling.
We show that state-regularization simplifies the extraction of finite state automata from the RNN\u2019s state transition dynamics; forces RNNs to operate more like automata with external memory and less like finite state machines; and makes RNNs more interpretable.", "bibtex": "@InProceedings{pmlr-v97-wang19j,\n title = \t {State-Regularized Recurrent Neural Networks},\n author = {Wang, Cheng and Niepert, Mathias},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6596--6606},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wang19j/wang19j.pdf},\n url = \t {https://proceedings.mlr.press/v97/wang19j.html},\n abstract = \t {Recurrent neural networks are a widely used class of neural architectures with two shortcomings. First, it is difficult to understand what exactly they learn. Second, they tend to work poorly on sequences requiring long-term memorization, despite having this capacity in principle. We aim to address both shortcomings with a class of recurrent networks that use a stochastic state transition mechanism between cell applications. This mechanism, which we term state-regularization, makes RNNs transition between a finite set of learnable states. We evaluate state-regularized RNNs on (1) regular languages for the purpose of automata extraction; (2) nonregular languages such as balanced parentheses, palindromes, and the copy task where external memory is required; and (3) real-word sequence learning tasks for sentiment analysis, visual object recognition, and language modeling. We show that state-regularization simplifies the extraction of finite state automata from the RNN\u2019s state transition dynamics; forces RNNs to operate more like automata with external memory and less like finite state machines; and makes RNNs more interpretable.}\n}", "pdf": "http://proceedings.mlr.press/v97/wang19j/wang19j.pdf", "supp": "", "pdf_size": 925531, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6298964603727251024&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "NEC Laboratories Europe, Heidelberg, Germany; NEC Laboratories Europe, Heidelberg, Germany", "aff_domain": "neclab.eu; ", "email": "neclab.eu; ", "github": "https://github.com/deepsemantic/sr-rnns", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/wang19j.html", "aff_unique_index": "0;0", "aff_unique_norm": "NEC Laboratories Europe", "aff_unique_dep": "", "aff_unique_url": "https://www.nec-labs.eu", "aff_unique_abbr": "NEC Europe", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Heidelberg", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "title": "State-Reification Networks: Improving Generalization by Modeling the Distribution of Hidden Representations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3949", "id": "3949", "author_site": "Alex Lamb, Jonathan Binas, Anirudh Goyal, Sandeep Subramanian, Ioannis Mitliagkas, Yoshua Bengio, Michael Mozer", "author": "Alex Lamb; Jonathan Binas; Anirudh Goyal; Sandeep Subramanian; Ioannis Mitliagkas; Yoshua Bengio; Michael Mozer", "abstract": "Machine learning promises methods that generalize well from finite labeled data. 
However, the brittleness of existing neural net approaches is revealed by notable failures, such as the existence of adversarial examples that are misclassified despite being nearly identical to a training example, or the inability of recurrent sequence-processing nets to stay on track without teacher forcing. We introduce a method, which we refer to as _state reification_, that involves modeling the distribution of hidden states over the training data and then projecting hidden states observed during testing toward this distribution. Our intuition is that if the network can remain in a familiar manifold of hidden space, subsequent layers of the net should be well trained to respond appropriately. We show that this state-reification method helps neural nets to generalize better, especially when labeled data are sparse, and also helps overcome the challenge of achieving robust generalization with adversarial training.", "bibtex": "@InProceedings{pmlr-v97-lamb19a,\n title = \t {State-Reification Networks: Improving Generalization by Modeling the Distribution of Hidden Representations},\n author = {Lamb, Alex and Binas, Jonathan and Goyal, Anirudh and Subramanian, Sandeep and Mitliagkas, Ioannis and Bengio, Yoshua and Mozer, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3622--3631},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lamb19a/lamb19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lamb19a.html},\n abstract = \t {Machine learning promises methods that generalize well from finite labeled data. However, the brittleness of existing neural net approaches is revealed by notable failures, such as the existence of adversarial examples that are misclassified despite being nearly identical to a training example, or the inability of recurrent sequence-processing nets to stay on track without teacher forcing. We introduce a method, which we refer to as _state reification_, that involves modeling the distribution of hidden states over the training data and then projecting hidden states observed during testing toward this distribution. Our intuition is that if the network can remain in a familiar manifold of hidden space, subsequent layers of the net should be well trained to respond appropriately. We show that this state-reification method helps neural nets to generalize better, especially when labeled data are sparse, and also helps overcome the challenge of achieving robust generalization with adversarial training.}\n}", "pdf": "http://proceedings.mlr.press/v97/lamb19a/lamb19a.pdf", "supp": "", "pdf_size": 553268, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10781148575630398498&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/lamb19a.html" }, { "title": "Static Automatic Batching In TensorFlow", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3703", "id": "3703", "author": "Ashish Agarwal", "abstract": "Dynamic neural networks are becoming increasingly common, and yet it is hard to implement them efficiently. 
On-the-fly operation batching for such models is sub-optimal and suffers from run time overheads, while writing manually batched versions can be hard and error-prone. To address this we extend TensorFlow with pfor, a parallel-for loop optimized using static loop vectorization. With pfor, users can express computation using nested loops and conditional constructs, but get performance resembling that of a manually batched version. Benchmarks demonstrate speedups of one to two orders of magnitude on range of tasks, from jacobian computation, to Graph Neural Networks.", "bibtex": "@InProceedings{pmlr-v97-agarwal19a,\n title = \t {Static Automatic Batching In {T}ensor{F}low},\n author = {Agarwal, Ashish},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {92--101},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/agarwal19a/agarwal19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/agarwal19a.html},\n abstract = \t {Dynamic neural networks are becoming increasingly common, and yet it is hard to implement them efficiently. On-the-fly operation batching for such models is sub-optimal and suffers from run time overheads, while writing manually batched versions can be hard and error-prone. To address this we extend TensorFlow with pfor, a parallel-for loop optimized using static loop vectorization. With pfor, users can express computation using nested loops and conditional constructs, but get performance resembling that of a manually batched version. Benchmarks demonstrate speedups of one to two orders of magnitude on range of tasks, from jacobian computation, to Graph Neural Networks.}\n}", "pdf": "http://proceedings.mlr.press/v97/agarwal19a/agarwal19a.pdf", "supp": "", "pdf_size": 1456268, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6406830406097252195&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Google Inc.", "aff_domain": "google.com", "email": "google.com", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/agarwal19a.html", "aff_unique_index": "0", "aff_unique_norm": "Google", "aff_unique_dep": "Google", "aff_unique_url": "https://www.google.com", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Statistical Foundations of Virtual Democracy", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3565", "id": "3565", "author_site": "Anson Kahng, Min Kyung Lee, Ritesh Noothigattu, Ariel Procaccia, Christos-Alexandros Psomas", "author": "Anson Kahng; Min Kyung Lee; Ritesh Noothigattu; Ariel Procaccia; Christos-Alexandros Psomas", "abstract": "Virtual democracy is an approach to automating decisions, by learning models of the preferences of individual people, and, at runtime, aggregating the predicted preferences of those people on the dilemma at hand. One of the key questions is which aggregation method \u2013 or voting rule \u2013 to use; we offer a novel statistical viewpoint that provides guidance. 
Specifically, we seek voting rules that are robust to prediction errors, in that their output on people\u2019s true preferences is likely to coincide with their output on noisy estimates thereof. We prove that the classic Borda count rule is robust in this sense, whereas any voting rule belonging to the wide family of pairwise-majority consistent rules is not. Our empirical results further support, and more precisely measure, the robustness of Borda count.", "bibtex": "@InProceedings{pmlr-v97-kahng19a,\n title = \t {Statistical Foundations of Virtual Democracy},\n author = {Kahng, Anson and Lee, Min Kyung and Noothigattu, Ritesh and Procaccia, Ariel and Psomas, Christos-Alexandros},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3173--3182},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kahng19a/kahng19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kahng19a.html},\n abstract = \t {Virtual democracy is an approach to automating decisions, by learning models of the preferences of individual people, and, at runtime, aggregating the predicted preferences of those people on the dilemma at hand. One of the key questions is which aggregation method \u2013 or voting rule \u2013 to use; we offer a novel statistical viewpoint that provides guidance. Specifically, we seek voting rules that are robust to prediction errors, in that their output on people\u2019s true preferences is likely to coincide with their output on noisy estimates thereof. We prove that the classic Borda count rule is robust in this sense, whereas any voting rule belonging to the wide family of pairwise-majority consistent rules is not. Our empirical results further support, and more precisely measure, the robustness of Borda count.}\n}", "pdf": "http://proceedings.mlr.press/v97/kahng19a/kahng19a.pdf", "supp": "", "pdf_size": 2615088, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5255896234615689205&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14, "aff": "School of Computer Science, Carnegie Mellon University, Pittsburgh, USA; School of Computer Science, Carnegie Mellon University, Pittsburgh, USA; School of Computer Science, Carnegie Mellon University, Pittsburgh, USA; School of Computer Science, Carnegie Mellon University, Pittsburgh, USA; School of Computer Science, Carnegie Mellon University, Pittsburgh, USA", "aff_domain": "cs.cmu.edu; ; ; ; ", "email": "cs.cmu.edu; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/kahng19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "School of Computer Science", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Statistics and Samples in Distributional Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3583", "id": "3583", "author_site": "Mark Rowland, Robert Dadashi, Saurabh Kumar, Remi Munos, Marc Bellemare, Will Dabney", "author": "Mark Rowland; Robert Dadashi; Saurabh Kumar; Remi Munos; Marc G. 
Bellemare; Will Dabney", "abstract": "We present a unifying framework for designing and analysing distributional reinforcement learning (DRL) algorithms in terms of recursively estimating statistics of the return distribution. Our key insight is that DRL algorithms can be decomposed as the combination of some statistical estimator and a method for imputing a return distribution consistent with that set of statistics. With this new understanding, we are able to provide improved analyses of existing DRL algorithms as well as construct a new algorithm (EDRL) based upon estimation of the expectiles of the return distribution. We compare EDRL with existing methods on a variety of MDPs to illustrate concrete aspects of our analysis, and develop a deep RL variant of the algorithm, ER-DQN, which we evaluate on the Atari-57 suite of games.", "bibtex": "@InProceedings{pmlr-v97-rowland19a,\n title = \t {Statistics and Samples in Distributional Reinforcement Learning},\n author = {Rowland, Mark and Dadashi, Robert and Kumar, Saurabh and Munos, Remi and Bellemare, Marc G. and Dabney, Will},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5528--5536},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/rowland19a/rowland19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/rowland19a.html},\n abstract = \t {We present a unifying framework for designing and analysing distributional reinforcement learning (DRL) algorithms in terms of recursively estimating statistics of the return distribution. Our key insight is that DRL algorithms can be decomposed as the combination of some statistical estimator and a method for imputing a return distribution consistent with that set of statistics. With this new understanding, we are able to provide improved analyses of existing DRL algorithms as well as construct a new algorithm (EDRL) based upon estimation of the expectiles of the return distribution. 
We compare EDRL with existing methods on a variety of MDPs to illustrate concrete aspects of our analysis, and develop a deep RL variant of the algorithm, ER-DQN, which we evaluate on the Atari-57 suite of games.}\n}", "pdf": "http://proceedings.mlr.press/v97/rowland19a/rowland19a.pdf", "supp": "", "pdf_size": 2413856, "gs_citation": 119, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3458241581369243667&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 11, "aff": "DeepMind; Google Brain; Google Brain; DeepMind; Google Brain; DeepMind", "aff_domain": "google.com; ; ; ; ; ", "email": "google.com; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/rowland19a.html", "aff_unique_index": "0;1;1;0;1;0", "aff_unique_norm": "DeepMind;Google", "aff_unique_dep": ";Google Brain", "aff_unique_url": "https://deepmind.com;https://brain.google.com", "aff_unique_abbr": "DeepMind;Google Brain", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;0;1;0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Stay With Me: Lifetime Maximization Through Heteroscedastic Linear Bandits With Reneging", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3928", "id": "3928", "author_site": "Ping-Chun Hsieh, Xi Liu, Anirban Bhattacharya, P R Kumar", "author": "Ping-Chun Hsieh; Xi Liu; Anirban Bhattacharya; P R Kumar", "abstract": "Sequential decision making for lifetime maximization is a critical problem in many real-world applications, such as medical treatment and portfolio selection. In these applications, a \u201creneging\u201d phenomenon, where participants may disengage from future interactions after observing an unsatisfiable outcome, is rather prevalent. To address the above issue, this paper proposes a model of heteroscedastic linear bandits with reneging, which allows each participant to have a distinct \u201csatisfaction level,\" with any interaction outcome falling short of that level resulting in that participant reneging. Moreover, it allows the variance of the outcome to be context-dependent. Based on this model, we develop a UCB-type policy, namely HR-UCB, and prove that it achieves $\\mathcal{O}\\big(\\sqrt{{T}(\\log({T}))^{3}}\\big)$ regret. Finally, we validate the performance of HR-UCB via simulations.", "bibtex": "@InProceedings{pmlr-v97-hsieh19a,\n title = \t {Stay With Me: Lifetime Maximization Through Heteroscedastic Linear Bandits With Reneging},\n author = {Hsieh, Ping-Chun and Liu, Xi and Bhattacharya, Anirban and Kumar, P R},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2800--2809},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hsieh19a/hsieh19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hsieh19a.html},\n abstract = \t {Sequential decision making for lifetime maximization is a critical problem in many real-world applications, such as medical treatment and portfolio selection. In these applications, a \u201creneging\u201d phenomenon, where participants may disengage from future interactions after observing an unsatisfiable outcome, is rather prevalent. 
To address the above issue, this paper proposes a model of heteroscedastic linear bandits with reneging, which allows each participant to have a distinct \u201csatisfaction level,\" with any interaction outcome falling short of that level resulting in that participant reneging. Moreover, it allows the variance of the outcome to be context-dependent. Based on this model, we develop a UCB-type policy, namely HR-UCB, and prove that it achieves $\\mathcal{O}\\big(\\sqrt{{T}(\\log({T}))^{3}}\\big)$ regret. Finally, we validate the performance of HR-UCB via simulations.}\n}", "pdf": "http://proceedings.mlr.press/v97/hsieh19a/hsieh19a.pdf", "supp": "", "pdf_size": 1597195, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2506228425562142864&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Electrical and Computer Engineering, Texas A&M University; Department of Electrical and Computer Engineering, Texas A&M University; Department of Statistics, Texas A&M University; Department of Electrical and Computer Engineering, Texas A&M University", "aff_domain": "tamu.edu;gmail.com; ; ", "email": "tamu.edu;gmail.com; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/hsieh19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Texas A&M University", "aff_unique_dep": "Department of Electrical and Computer Engineering", "aff_unique_url": "https://www.tamu.edu", "aff_unique_abbr": "TAMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Stein Point Markov Chain Monte Carlo", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3730", "id": "3730", "author_site": "Wilson Ye Chen, Alessandro Barp, Francois-Xavier Briol, Jackson Gorham, Mark Girolami, Lester Mackey, Chris Oates", "author": "Wilson Ye Chen; Alessandro Barp; Francois-Xavier Briol; Jackson Gorham; Mark Girolami; Lester Mackey; Chris Oates", "abstract": "An important task in machine learning and statistics is the approximation of a probability measure by an empirical measure supported on a discrete point set. Stein Points are a class of algorithms for this task, which proceed by sequentially minimising a Stein discrepancy between the empirical measure and the target and, hence, require the solution of a non-convex optimisation problem to obtain each new point. This paper removes the need to solve this optimisation problem by, instead, selecting each new point based on a Markov chain sample path. This significantly reduces the computational cost of Stein Points and leads to a suite of algorithms that are straightforward to implement. 
The new algorithms are illustrated on a set of challenging Bayesian inference problems, and rigorous theoretical guarantees of consistency are established.", "bibtex": "@InProceedings{pmlr-v97-chen19b,\n title = \t {Stein Point {M}arkov Chain {M}onte {C}arlo},\n author = {Chen, Wilson Ye and Barp, Alessandro and Briol, Francois-Xavier and Gorham, Jackson and Girolami, Mark and Mackey, Lester and Oates, Chris},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1011--1021},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19b/chen19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19b.html},\n abstract = \t {An important task in machine learning and statistics is the approximation of a probability measure by an empirical measure supported on a discrete point set. Stein Points are a class of algorithms for this task, which proceed by sequentially minimising a Stein discrepancy between the empirical measure and the target and, hence, require the solution of a non-convex optimisation problem to obtain each new point. This paper removes the need to solve this optimisation problem by, instead, selecting each new point based on a Markov chain sample path. This significantly reduces the computational cost of Stein Points and leads to a suite of algorithms that are straightforward to implement. The new algorithms are illustrated on a set of challenging Bayesian inference problems, and rigorous theoretical guarantees of consistency are established.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19b/chen19b.pdf", "supp": "", "pdf_size": 1063727, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6889028915730960186&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": ";;;;;;", "aff_domain": ";;;;;;", "email": ";;;;;;", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/chen19b.html" }, { "title": "Stochastic Beams and Where To Find Them: The Gumbel-Top-k Trick for Sampling Sequences Without Replacement", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3878", "id": "3878", "author_site": "Wouter Kool, Herke van Hoof, Max Welling", "author": "Wouter Kool; Herke Van Hoof; Max Welling", "abstract": "The well-known Gumbel-Max trick for sampling from a categorical distribution can be extended to sample $k$ elements without replacement. We show how to implicitly apply this \u2019Gumbel-Top-$k$\u2019 trick on a factorized distribution over sequences, allowing to draw exact samples without replacement using a Stochastic Beam Search. Even for exponentially large domains, the number of model evaluations grows only linear in $k$ and the maximum sampled sequence length. The algorithm creates a theoretical connection between sampling and (deterministic) beam search and can be used as a principled intermediate alternative. In a translation task, the proposed method compares favourably against alternatives to obtain diverse yet good quality translations. 
We show that sequences sampled without replacement can be used to construct low-variance estimators for expected sentence-level BLEU score and model entropy.", "bibtex": "@InProceedings{pmlr-v97-kool19a,\n title = \t {Stochastic Beams and Where To Find Them: The {G}umbel-Top-k Trick for Sampling Sequences Without Replacement},\n author = {Kool, Wouter and Van Hoof, Herke and Welling, Max},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3499--3508},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kool19a/kool19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kool19a.html},\n abstract = \t {The well-known Gumbel-Max trick for sampling from a categorical distribution can be extended to sample $k$ elements without replacement. We show how to implicitly apply this \u2019Gumbel-Top-$k$\u2019 trick on a factorized distribution over sequences, allowing to draw exact samples without replacement using a Stochastic Beam Search. Even for exponentially large domains, the number of model evaluations grows only linear in $k$ and the maximum sampled sequence length. The algorithm creates a theoretical connection between sampling and (deterministic) beam search and can be used as a principled intermediate alternative. In a translation task, the proposed method compares favourably against alternatives to obtain diverse yet good quality translations. We show that sequences sampled without replacement can be used to construct low-variance estimators for expected sentence-level BLEU score and model entropy.}\n}", "pdf": "http://proceedings.mlr.press/v97/kool19a/kool19a.pdf", "supp": "", "pdf_size": 893586, "gs_citation": 265, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13121847178128779153&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "University of Amsterdam, The Netherlands+ORTEC, The Netherlands; University of Amsterdam, The Netherlands; University of Amsterdam, The Netherlands+CIFAR, Canada", "aff_domain": "uva.nl; ; ", "email": "uva.nl; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/kool19a.html", "aff_unique_index": "0+1;0;0+2", "aff_unique_norm": "University of Amsterdam;ORTEC;Canadian Institute for Advanced Research", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uva.nl;;https://www.cifar.ca", "aff_unique_abbr": "UvA;;CIFAR", "aff_campus_unique_index": ";", "aff_campus_unique": "", "aff_country_unique_index": "0+0;0;0+1", "aff_country_unique": "Netherlands;Canada" }, { "title": "Stochastic Blockmodels meet Graph Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3686", "id": "3686", "author_site": "Nikhil Mehta, Lawrence Carin, Piyush Rai", "author": "Nikhil Mehta; Lawrence Carin Duke; Piyush Rai", "abstract": "Stochastic blockmodels (SBM) and their variants, $e.g.$, mixed-membership and overlapping stochastic blockmodels, are latent variable based generative models for graphs. They have proven to be successful for various tasks, such as discovering the community structure and link prediction on graph-structured data. 
Recently, graph neural networks, $e.g.$, graph convolutional networks, have also emerged as a promising approach to learn powerful representations (embeddings) for the nodes in the graph, by exploiting graph properties such as locality and invariance. In this work, we unify these two directions by developing a", "bibtex": "@InProceedings{pmlr-v97-mehta19a,\n title = \t {Stochastic Blockmodels meet Graph Neural Networks},\n author = {Mehta, Nikhil and Duke, Lawrence Carin and Rai, Piyush},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4466--4474},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mehta19a/mehta19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mehta19a.html},\n abstract = \t {Stochastic blockmodels (SBM) and their variants, $e.g.$, mixed-membership and overlapping stochastic blockmodels, are latent variable based generative models for graphs. They have proven to be successful for various tasks, such as discovering the community structure and link prediction on graph-structured data. Recently, graph neural networks, $e.g.$, graph convolutional networks, have also emerged as a promising approach to learn powerful representations (embeddings) for the nodes in the graph, by exploiting graph properties such as locality and invariance. In this work, we unify these two directions by developing a", "pdf": "http://proceedings.mlr.press/v97/mehta19a/mehta19a.pdf", "supp": "", "pdf_size": 500773, "gs_citation": 111, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=852649190864167140&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Electrical and Computer Engineering, Duke University + IIT Kanpur; Department of Electrical and Computer Engineering, Duke University; Department of Computer Science, IIT Kanpur", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mehta19a.html", "aff_unique_index": "0+1;0;1", "aff_unique_norm": "Duke University;Indian Institute of Technology Kanpur", "aff_unique_dep": "Department of Electrical and Computer Engineering;", "aff_unique_url": "https://www.duke.edu;https://www.iitk.ac.in", "aff_unique_abbr": "Duke;IITK", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Kanpur", "aff_country_unique_index": "0+1;0;1", "aff_country_unique": "United States;India" }, { "title": "Stochastic Deep Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3748", "id": "3748", "author_site": "Gwendoline De Bie, Gabriel Peyr\u00e9, Marco Cuturi", "author": "Gwendoline De Bie; Gabriel Peyr\u00e9; Marco Cuturi", "abstract": "Machine learning is increasingly targeting areas where input data cannot be accurately described by a single vector, but can be modeled instead using the more flexible concept of random vectors, namely probability measures or more simply point clouds of varying cardinality. Using deep architectures on measures poses, however, many challenging issues. Indeed, deep architectures are originally designed to handle fixed-length vectors, or, using recursive mechanisms, ordered sequences thereof. In sharp contrast, measures describe a varying number of weighted observations with no particular order. 
We propose in this work a deep framework designed to handle crucial aspects of measures, namely permutation invariances, variations in weights and cardinality. Architectures derived from this pipeline can (i) map measures to measures - using the concept of push-forward operators; (ii) bridge the gap between measures and Euclidean spaces - through integration steps. This allows to design discriminative networks (to classify or reduce the dimensionality of input measures), generative architectures (to synthesize measures) and recurrent pipelines (to predict measure dynamics). We provide a theoretical analysis of these building blocks, review our architectures\u2019 approximation abilities and robustness w.r.t. perturbation, and try them on various discriminative and generative tasks.", "bibtex": "@InProceedings{pmlr-v97-de-bie19a,\n title = \t {Stochastic Deep Networks},\n author = {De Bie, Gwendoline and Peyr{\\'e}, Gabriel and Cuturi, Marco},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1556--1565},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/de-bie19a/de-bie19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/de-bie19a.html},\n abstract = \t {Machine learning is increasingly targeting areas where input data cannot be accurately described by a single vector, but can be modeled instead using the more flexible concept of random vectors, namely probability measures or more simply point clouds of varying cardinality. Using deep architectures on measures poses, however, many challenging issues. Indeed, deep architectures are originally designed to handle fixed-length vectors, or, using recursive mechanisms, ordered sequences thereof. In sharp contrast, measures describe a varying number of weighted observations with no particular order. We propose in this work a deep framework designed to handle crucial aspects of measures, namely permutation invariances, variations in weights and cardinality. Architectures derived from this pipeline can (i) map measures to measures - using the concept of push-forward operators; (ii) bridge the gap between measures and Euclidean spaces - through integration steps. This allows to design discriminative networks (to classify or reduce the dimensionality of input measures), generative architectures (to synthesize measures) and recurrent pipelines (to predict measure dynamics). We provide a theoretical analysis of these building blocks, review our architectures\u2019 approximation abilities and robustness w.r.t. 
perturbation, and try them on various discriminative and generative tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/de-bie19a/de-bie19a.pdf", "supp": "", "pdf_size": 1824781, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6722999351730373191&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "\u00c9cole Normale Sup\u00e9rieure, DMA, Paris, France; CNRS; CREST/ENSAE Paristech + Google Brain, Paris, France", "aff_domain": "ens.fr; ; ", "email": "ens.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/de-bie19a.html", "aff_unique_index": "0;1;2+3", "aff_unique_norm": "Ecole Normale Sup\u00e9rieure;Centre National de la Recherche Scientifique;CREST;Google", "aff_unique_dep": "DMA;;ENSAE Paristech;Google Brain", "aff_unique_url": "https://www.ens.fr;https://www.cnrs.fr;https://www.crest.fr;https://brain.google.com", "aff_unique_abbr": "ENS;CNRS;CREST;Google Brain", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Paris;", "aff_country_unique_index": "0;0;0+0", "aff_country_unique": "France" }, { "title": "Stochastic Gradient Push for Distributed Deep Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3899", "id": "3899", "author_site": "Mahmoud Assran, Nicolas Loizou, Nicolas Ballas, Michael Rabbat", "author": "Mahmoud Assran; Nicolas Loizou; Nicolas Ballas; Mike Rabbat", "abstract": "Distributed data-parallel algorithms aim to accelerate the training of deep neural networks by parallelizing the computation of large mini-batch gradient updates across multiple nodes. Approaches that synchronize nodes using exact distributed averaging (e.g., via AllReduce) are sensitive to stragglers and communication delays. The PushSum gossip algorithm is robust to these issues, but only performs approximate distributed averaging. This paper studies Stochastic Gradient Push (SGP), which combines PushSum with stochastic gradient updates.
We prove that SGP converges to a stationary point of smooth, non-convex objectives at the same sub-linear rate as SGD, and that all nodes achieve consensus. We empirically validate the performance of SGP on image classification (ResNet-50, ImageNet) and machine translation (Transformer, WMT\u201916 En-De) workloads.}\n}", "pdf": "http://proceedings.mlr.press/v97/assran19a/assran19a.pdf", "supp": "", "pdf_size": 521623, "gs_citation": 437, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4514037379069260169&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Facebook AI Research, Montr\u00e9al, QC, Canada+Department of Electrical and Computer Engineering, McGill University, Montr\u00e9al, QC, Canada; Facebook AI Research, Montr\u00e9al, QC, Canada; Facebook AI Research, Montr\u00e9al, QC, Canada; Facebook AI Research, Montr\u00e9al, QC, Canada", "aff_domain": "mail.mcgill.ca; ; ; ", "email": "mail.mcgill.ca; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/assran19a.html", "aff_unique_index": "0+1;0;0;0", "aff_unique_norm": "Meta;McGill University", "aff_unique_dep": "AI Research;Department of Electrical and Computer Engineering", "aff_unique_url": "https://research.facebook.com;https://www.mcgill.ca", "aff_unique_abbr": "FAIR;McGill", "aff_campus_unique_index": "0+0;0;0;0", "aff_campus_unique": "Montr\u00e9al", "aff_country_unique_index": "0+0;0;0;0", "aff_country_unique": "Canada" }, { "title": "Stochastic Iterative Hard Thresholding for Graph-structured Sparsity Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3889", "id": "3889", "author_site": "Baojian Zhou, Feng Chen, Yiming Ying", "author": "Baojian Zhou; Feng Chen; Yiming Ying", "abstract": "Stochastic optimization algorithms update models with cheap per-iteration costs sequentially, which makes them amenable for large-scale data analysis. Such algorithms have been widely studied for structured sparse models where the sparsity information is very specific, e.g., convex sparsity-inducing norms or $\\ell^0$-norm. However, these norms cannot be directly applied to the problem of complex (non-convex) graph-structured sparsity models, which have important application in disease outbreak and social networks, etc. In this paper, we propose a stochastic gradient-based method for solving graph-structured sparsity constraint problems, not restricted to the least square loss. We prove that our algorithm enjoys a linear convergence up to a constant error, which is competitive with the counterparts in the batch learning setting.
We conduct extensive experiments to show the efficiency and effectiveness of the proposed algorithms.", "bibtex": "@InProceedings{pmlr-v97-zhou19a,\n title = \t {Stochastic Iterative Hard Thresholding for Graph-structured Sparsity Optimization},\n author = {Zhou, Baojian and Chen, Feng and Ying, Yiming},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7563--7573},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhou19a/zhou19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhou19a.html},\n abstract = \t {Stochastic optimization algorithms update models with cheap per-iteration costs sequentially, which makes them amenable for large-scale data analysis. Such algorithms have been widely studied for structured sparse models where the sparsity information is very specific, e.g., convex sparsity-inducing norms or $\\ell^0$-norm. However, these norms cannot be directly applied to the problem of complex (non-convex) graph-structured sparsity models, which have important application in disease outbreak and social networks, etc. In this paper, we propose a stochastic gradient-based method for solving graph-structured sparsity constraint problems, not restricted to the least square loss. We prove that our algorithm enjoys a linear convergence up to a constant error, which is competitive with the counterparts in the batch learning setting. We conduct extensive experiments to show the efficiency and effectiveness of the proposed algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhou19a/zhou19a.pdf", "supp": "", "pdf_size": 646808, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4121937272467164287&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 17, "aff": "Department of Computer Science, SUNY at Albany, Albany, NY, USA; Department of Computer Science, SUNY at Albany, Albany, NY, USA; Department of Mathematics and Statistics, SUNY at Albany, Albany, NY, USA", "aff_domain": "albany.edu; ; ", "email": "albany.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zhou19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "State University of New York at Albany", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.albany.edu", "aff_unique_abbr": "SUNY Albany", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Albany", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Stochastic Optimization for DC Functions and Non-smooth Non-convex Regularizers with Non-asymptotic Convergence", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3951", "id": "3951", "author_site": "Yi Xu, Qi Qi, Qihang Lin, rong jin, Tianbao Yang", "author": "Yi Xu; Qi Qi; Qihang Lin; Rong Jin; Tianbao Yang", "abstract": "Difference of convex (DC) functions cover a broad family of non-convex and possibly non-smooth and non-differentiable functions, and have wide applications in machine learning and statistics. Although deterministic algorithms for DC functions have been extensively studied, stochastic optimization that is more suitable for learning with big data remains under-explored. 
In this paper, we propose new stochastic optimization algorithms and study their first-order convergence theories for solving a broad family of DC functions. We improve the existing algorithms and theories of stochastic optimization for DC functions from both practical and theoretical perspectives. Moreover, we extend the proposed stochastic algorithms for DC functions to solve problems with a general non-convex non-differentiable regularizer, which does not necessarily have a DC decomposition but enjoys an efficient proximal mapping. To the best of our knowledge, this is the first work that gives the first non-asymptotic convergence for solving non-convex optimization whose objective has a general non-convex non-differentiable regularizer.", "bibtex": "@InProceedings{pmlr-v97-xu19c,\n title = \t {Stochastic Optimization for {DC} Functions and Non-smooth Non-convex Regularizers with Non-asymptotic Convergence},\n author = {Xu, Yi and Qi, Qi and Lin, Qihang and Jin, Rong and Yang, Tianbao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6942--6951},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/xu19c/xu19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/xu19c.html},\n abstract = \t {Difference of convex (DC) functions cover a broad family of non-convex and possibly non-smooth and non-differentiable functions, and have wide applications in machine learning and statistics. Although deterministic algorithms for DC functions have been extensively studied, stochastic optimization that is more suitable for learning with big data remains under-explored. In this paper, we propose new stochastic optimization algorithms and study their first-order convergence theories for solving a broad family of DC functions. We improve the existing algorithms and theories of stochastic optimization for DC functions from both practical and theoretical perspectives. Moreover, we extend the proposed stochastic algorithms for DC functions to solve problems with a general non-convex non-differentiable regularizer, which does not necessarily have a DC decomposition but enjoys an efficient proximal mapping. 
To the best of our knowledge, this is the first work that gives the first non-asymptotic convergence for solving non-convex optimization whose objective has a general non-convex non-differentiable regularizer.}\n}", "pdf": "http://proceedings.mlr.press/v97/xu19c/xu19c.pdf", "supp": "", "pdf_size": 557424, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2085483635099791799&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, University of Iowa, Iowa City, IA 52242, USA+Department of Management Sciences, University of Iowa, Iowa City, IA 52242, USA; Department of Computer Science, University of Iowa, Iowa City, IA 52242, USA+Department of Management Sciences, University of Iowa, Iowa City, IA 52242, USA; Department of Management Sciences, University of Iowa, Iowa City, IA 52242, USA; Machine Intelligence Technology, Alibaba Group, Bellevue, WA 98004, USA; Department of Computer Science, University of Iowa, Iowa City, IA 52242, USA", "aff_domain": "uiowa.edu; ; ; ;uiowa.edu", "email": "uiowa.edu; ; ; ;uiowa.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/xu19c.html", "aff_unique_index": "0+0;0+0;0;1;0", "aff_unique_norm": "University of Iowa;Alibaba Group", "aff_unique_dep": "Department of Computer Science;Machine Intelligence Technology", "aff_unique_url": "https://www.uiowa.edu;https://www.alibaba.com", "aff_unique_abbr": "UIowa;Alibaba", "aff_campus_unique_index": "0+0;0+0;0;1;0", "aff_campus_unique": "Iowa City;Bellevue", "aff_country_unique_index": "0+0;0+0;0;0;0", "aff_country_unique": "United States" }, { "title": "Structured agents for physical construction", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4079", "id": "4079", "author_site": "Victor Bapst, Alvaro Sanchez-Gonzalez, Carl Doersch, Kimberly Stachenfeld, Pushmeet Kohli, Peter Battaglia, Jessica Hamrick", "author": "Victor Bapst; Alvaro Sanchez-Gonzalez; Carl Doersch; Kimberly Stachenfeld; Pushmeet Kohli; Peter Battaglia; Jessica Hamrick", "abstract": "Physical construction\u2014the ability to compose objects, subject to physical dynamics, to serve some function\u2014is fundamental to human intelligence. We introduce a suite of challenging physical construction tasks inspired by how children play with blocks, such as matching a target configuration, stacking blocks to connect objects together, and creating shelter-like structures over target objects. We examine how a range of deep reinforcement learning agents fare on these challenges, and introduce several new approaches which provide superior performance. Our results show that agents which use structured representations (e.g., objects and scene graphs) and structured policies (e.g., object-centric actions) outperform those which use less structured representations, and generalize better beyond their training when asked to reason about larger scenes. Model-based agents which use Monte-Carlo Tree Search also outperform strictly model-free agents in our most challenging construction problems. 
We conclude that approaches which combine structured representations and reasoning with powerful learning are a key path toward agents that possess rich intuitive physics, scene understanding, and planning.", "bibtex": "@InProceedings{pmlr-v97-bapst19a,\n title = \t {Structured agents for physical construction},\n author = {Bapst, Victor and Sanchez-Gonzalez, Alvaro and Doersch, Carl and Stachenfeld, Kimberly and Kohli, Pushmeet and Battaglia, Peter and Hamrick, Jessica},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {464--474},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bapst19a/bapst19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bapst19a.html},\n abstract = \t {Physical construction\u2014the ability to compose objects, subject to physical dynamics, to serve some function\u2014is fundamental to human intelligence. We introduce a suite of challenging physical construction tasks inspired by how children play with blocks, such as matching a target configuration, stacking blocks to connect objects together, and creating shelter-like structures over target objects. We examine how a range of deep reinforcement learning agents fare on these challenges, and introduce several new approaches which provide superior performance. Our results show that agents which use structured representations (e.g., objects and scene graphs) and structured policies (e.g., object-centric actions) outperform those which use less structured representations, and generalize better beyond their training when asked to reason about larger scenes. Model-based agents which use Monte-Carlo Tree Search also outperform strictly model-free agents in our most challenging construction problems. 
We conclude that approaches which combine structured representations and reasoning with powerful learning are a key path toward agents that possess rich intuitive physics, scene understanding, and planning.}\n}", "pdf": "http://proceedings.mlr.press/v97/bapst19a/bapst19a.pdf", "supp": "", "pdf_size": 1120179, "gs_citation": 126, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14298375533611799017&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind; DeepMind", "aff_domain": "google.com; ; ; ; ; ;google.com", "email": "google.com; ; ; ; ; ;google.com", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/bapst19a.html", "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "DeepMind", "aff_unique_dep": "", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Sublinear Space Private Algorithms Under the Sliding Window Model", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3867", "id": "3867", "author": "Jalaj Upadhyay", "abstract": "The Differential privacy overview of Apple states, \u201cApple retains the collected data for a maximum of three months.\" Analysis of recent data is formalized by the", "bibtex": "@InProceedings{pmlr-v97-upadhyay19a,\n title = \t {Sublinear Space Private Algorithms Under the Sliding Window Model},\n author = {Upadhyay, Jalaj},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6363--6372},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/upadhyay19a/upadhyay19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/upadhyay19a.html},\n abstract = \t {The Differential privacy overview of Apple states, \u201cApple retains the collected data for a maximum of three months.\" Analysis of recent data is formalized by the", "pdf": "http://proceedings.mlr.press/v97/upadhyay19a/upadhyay19a.pdf", "supp": "", "pdf_size": 613270, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7491602672515539749&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Johns Hopkins University", "aff_domain": "jhu.edu", "email": "jhu.edu", "github": "", "project": "", "author_num": 1, "oa": "https://proceedings.mlr.press/v97/upadhyay19a.html", "aff_unique_index": "0", "aff_unique_norm": "Johns Hopkins University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.jhu.edu", "aff_unique_abbr": "JHU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "title": "Sublinear Time Nearest Neighbor Search over Generalized Weighted Space", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3631", "id": "3631", "author_site": "Yifan Lei, Qiang Huang, Mohan Kankanhalli, Anthony Tung", "author": "Yifan Lei; Qiang Huang; Mohan Kankanhalli; Anthony Tung", "abstract": "Nearest Neighbor Search (NNS) over generalized weighted space is a fundamental problem which has many applications in various fields. 
However, to the best of our knowledge, there is no sublinear time solution to this problem. Based on the idea of Asymmetric Locality-Sensitive Hashing (ALSH), we introduce a novel spherical asymmetric transformation and propose the first two novel weight-oblivious hashing schemes SL-ALSH and S2-ALSH accordingly. We further show that both schemes enjoy a quality guarantee and can answer the NNS queries in sublinear time. Evaluations over three real datasets demonstrate the superior performance of the two proposed schemes.", "bibtex": "@InProceedings{pmlr-v97-lei19a,\n title = \t {Sublinear Time Nearest Neighbor Search over Generalized Weighted Space},\n author = {Lei, Yifan and Huang, Qiang and Kankanhalli, Mohan and Tung, Anthony},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3773--3781},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lei19a/lei19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lei19a.html},\n abstract = \t {Nearest Neighbor Search (NNS) over generalized weighted space is a fundamental problem which has many applications in various fields. However, to the best of our knowledge, there is no sublinear time solution to this problem. Based on the idea of Asymmetric Locality-Sensitive Hashing (ALSH), we introduce a novel spherical asymmetric transformation and propose the first two novel weight-oblivious hashing schemes SL-ALSH and S2-ALSH accordingly. We further show that both schemes enjoy a quality guarantee and can answer the NNS queries in sublinear time. Evaluations over three real datasets demonstrate the superior performance of the two proposed schemes.}\n}", "pdf": "http://proceedings.mlr.press/v97/lei19a/lei19a.pdf", "supp": "", "pdf_size": 536149, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7950064557003467824&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "School of Computing, National University of Singapore, Singapore; School of Computing, National University of Singapore, Singapore; School of Computing, National University of Singapore, Singapore; School of Computing, National University of Singapore, Singapore", "aff_domain": "comp.nus.edu.sg;comp.nus.edu.sg; ; ", "email": "comp.nus.edu.sg;comp.nus.edu.sg; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/lei19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "School of Computing", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Singapore" }, { "title": "Sublinear quantum algorithms for training linear and kernel-based classifiers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3977", "id": "3977", "author_site": "Tongyang Li, Shouvanik Chakrabarti, Xiaodi Wu", "author": "Tongyang Li; Shouvanik Chakrabarti; Xiaodi Wu", "abstract": "We investigate quantum algorithms for classification, a fundamental problem in machine learning, with provable guarantees. Given $n$ $d$-dimensional data points, the state-of-the-art (and optimal) classical algorithm for training classifiers with constant margin by Clarkson et al. 
runs in $\\tilde{O}(n +d)$, which is also optimal in its input/output model. We design sublinear quantum algorithms for the same task running in $\\tilde{O}(\\sqrt{n} +\\sqrt{d})$, a quadratic improvement in both $n$ and $d$. Moreover, our algorithms use the standard quantization of the classical input and generate the same classical output, suggesting minimal overheads when used as subroutines for end-to-end applications. We also demonstrate a tight lower bound (up to poly-log factors) and discuss the possibility of implementation on near-term quantum machines.", "bibtex": "@InProceedings{pmlr-v97-li19b,\n title = \t {Sublinear quantum algorithms for training linear and kernel-based classifiers},\n author = {Li, Tongyang and Chakrabarti, Shouvanik and Wu, Xiaodi},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3815--3824},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19b/li19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19b.html},\n abstract = \t {We investigate quantum algorithms for classification, a fundamental problem in machine learning, with provable guarantees. Given $n$ $d$-dimensional data points, the state-of-the-art (and optimal) classical algorithm for training classifiers with constant margin by Clarkson et al. runs in $\\tilde{O}(n +d)$, which is also optimal in its input/output model. We design sublinear quantum algorithms for the same task running in $\\tilde{O}(\\sqrt{n} +\\sqrt{d})$, a quadratic improvement in both $n$ and $d$. Moreover, our algorithms use the standard quantization of the classical input and generate the same classical output, suggesting minimal overheads when used as subroutines for end-to-end applications. 
We also demonstrate a tight lower bound (up to poly-log factors) and discuss the possibility of implementation on near-term quantum machines.}\n}", "pdf": "http://proceedings.mlr.press/v97/li19b/li19b.pdf", "supp": "", "pdf_size": 777698, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5166016811338778772&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, UMIACS, and Joint Center for Quantum Information and Computer Science, University of Maryland; Department of Computer Science, UMIACS, and Joint Center for Quantum Information and Computer Science, University of Maryland; Department of Computer Science, UMIACS, and Joint Center for Quantum Information and Computer Science, University of Maryland", "aff_domain": "cs.umd.edu; ;cs.umd.edu", "email": "cs.umd.edu; ;cs.umd.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/li19b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Submodular Cost Submodular Cover with an Approximate Oracle", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3591", "id": "3591", "author_site": "Victoria Crawford, Alan Kuhnle, My T Thai", "author": "Victoria Crawford; Alan Kuhnle; My Thai", "abstract": "In this work, we study the Submodular Cost Submodular Cover problem, which is to minimize the submodular cost required to ensure that the submodular benefit function exceeds a given threshold. Existing approximation ratios for the greedy algorithm assume a value oracle to the benefit function. However, access to a value oracle is not a realistic assumption for many applications of this problem, where the benefit function is difficult to compute. We present two incomparable approximation ratios for this problem with an approximate value oracle and demonstrate that the ratios take on empirically relevant values through a case study with the Influence Threshold problem in online social networks.", "bibtex": "@InProceedings{pmlr-v97-crawford19a,\n title = \t {Submodular Cost Submodular Cover with an Approximate Oracle},\n author = {Crawford, Victoria and Kuhnle, Alan and Thai, My},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1426--1435},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/crawford19a/crawford19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/crawford19a.html},\n abstract = \t {In this work, we study the Submodular Cost Submodular Cover problem, which is to minimize the submodular cost required to ensure that the submodular benefit function exceeds a given threshold. Existing approximation ratios for the greedy algorithm assume a value oracle to the benefit function. However, access to a value oracle is not a realistic assumption for many applications of this problem, where the benefit function is difficult to compute. 
We present two incomparable approximation ratios for this problem with an approximate value oracle and demonstrate that the ratios take on empirically relevant values through a case study with the Influence Threshold problem in online social networks.}\n}", "pdf": "http://proceedings.mlr.press/v97/crawford19a/crawford19a.pdf", "supp": "", "pdf_size": 587896, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16215282094690849890&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer and Information Science and Engineering, University of Florida, Gainesville, Florida, United States; Department of Computer Science, Florida State University, Tallahassee, Florida, United States; Department of Computer and Information Science and Engineering, University of Florida, Gainesville, Florida, United States", "aff_domain": "ufl.edu;ufl.edu;cise.ufl.edu", "email": "ufl.edu;ufl.edu;cise.ufl.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/crawford19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Florida;Florida State University", "aff_unique_dep": "Department of Computer and Information Science and Engineering;Department of Computer Science", "aff_unique_url": "https://www.ufl.edu;https://www.fsu.edu", "aff_unique_abbr": "UF;FSU", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Gainesville;Tallahassee", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Submodular Maximization beyond Non-negativity: Guarantees, Fast Algorithms, and Applications", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4159", "id": "4159", "author_site": "Christopher Harshaw, Moran Feldman, Justin Ward, Amin Karbasi", "author": "Chris Harshaw; Moran Feldman; Justin Ward; Amin Karbasi", "abstract": "It is generally believed that submodular functions\u2013and the more general class of $\\gamma$-weakly submodular functions\u2013may only be optimized under the non-negativity assumption $f(S) \\geq 0$. In this paper, we show that once the function is expressed as the difference $f = g - c$, where $g$ is monotone, non-negative, and $\\gamma$-weakly submodular and $c$ is non-negative modular, then strong approximation guarantees may be obtained. We present an algorithm for maximizing $g - c$ under a $k$-cardinality constraint which produces a random feasible set $S$ such that $\\mathbb{E}[g(S) -c(S)] \\geq (1 - e^{-\\gamma} - \\epsilon) g(\\opt) - c(\\opt)$, whose running time is $O (\\frac{n}{\\epsilon} \\log^2 \\frac{1}{\\epsilon})$, independent of $k$. We extend these results to the unconstrained setting by describing an algorithm with the same approximation guarantees and faster $O(n \\frac{1}{\\epsilon} \\log\\frac{1}{\\epsilon})$ runtime. The main techniques underlying our algorithms are two-fold: the use of a surrogate objective which varies the relative importance between $g$ and $c$ throughout the algorithm, and a geometric sweep over possible $\\gamma$ values. Our algorithmic guarantees are complemented by a hardness result showing that no polynomial-time algorithm which accesses $g$ through a value oracle can do better. 
We empirically demonstrate the success of our algorithms by applying them to experimental design on the Boston Housing dataset and directed vertex cover on the Email EU dataset.", "bibtex": "@InProceedings{pmlr-v97-harshaw19a,\n title = \t {Submodular Maximization beyond Non-negativity: Guarantees, Fast Algorithms, and Applications},\n author = {Harshaw, Chris and Feldman, Moran and Ward, Justin and Karbasi, Amin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2634--2643},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/harshaw19a/harshaw19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/harshaw19a.html},\n abstract = \t {It is generally believed that submodular functions\u2013and the more general class of $\\gamma$-weakly submodular functions\u2013may only be optimized under the non-negativity assumption $f(S) \\geq 0$. In this paper, we show that once the function is expressed as the difference $f = g - c$, where $g$ is monotone, non-negative, and $\\gamma$-weakly submodular and $c$ is non-negative modular, then strong approximation guarantees may be obtained. We present an algorithm for maximizing $g - c$ under a $k$-cardinality constraint which produces a random feasible set $S$ such that $\\mathbb{E}[g(S) -c(S)] \\geq (1 - e^{-\\gamma} - \\epsilon) g(\\opt) - c(\\opt)$, whose running time is $O (\\frac{n}{\\epsilon} \\log^2 \\frac{1}{\\epsilon})$, independent of $k$. We extend these results to the unconstrained setting by describing an algorithm with the same approximation guarantees and faster $O(n \\frac{1}{\\epsilon} \\log\\frac{1}{\\epsilon})$ runtime. The main techniques underlying our algorithms are two-fold: the use of a surrogate objective which varies the relative importance between $g$ and $c$ throughout the algorithm, and a geometric sweep over possible $\\gamma$ values. Our algorithmic guarantees are complemented by a hardness result showing that no polynomial-time algorithm which accesses $g$ through a value oracle can do better. 
We empirically demonstrate the success of our algorithms by applying them to experimental design on the Boston Housing dataset and directed vertex cover on the Email EU dataset.}\n}", "pdf": "http://proceedings.mlr.press/v97/harshaw19a/harshaw19a.pdf", "supp": "", "pdf_size": 669605, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4032047436455480189&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Department of Computer Science, Yale University, New Haven, USA; Department of Mathematics and Computer Science, Open University of Israel, Raanana, Israel; School of Mathematical Sciences, Queen Mary University of London, London, UK; Department of Electrical Engineering, Yale University, New Haven, USA", "aff_domain": "yale.edu; ; ; ", "email": "yale.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/harshaw19a.html", "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Yale University;Open University of Israel;Queen Mary University of London", "aff_unique_dep": "Department of Computer Science;Department of Mathematics and Computer Science;School of Mathematical Sciences", "aff_unique_url": "https://www.yale.edu;https://www.openu.ac.il;https://www.qmul.ac.uk", "aff_unique_abbr": "Yale;OUI;QMUL", "aff_campus_unique_index": "0;1;2;0", "aff_campus_unique": "New Haven;Raanana;London", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "United States;Israel;United Kingdom" }, { "title": "Submodular Observation Selection and Information Gathering for Quadratic Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4291", "id": "4291", "author_site": "Abolfazl Hashemi, Mahsa Ghasemi, Haris Vikalo, Ufuk Topcu", "author": "Abolfazl Hashemi; Mahsa Ghasemi; Haris Vikalo; Ufuk Topcu", "abstract": "We study the problem of selecting most informative subset of a large observation set to enable accurate estimation of unknown parameters. This problem arises in a variety of settings in machine learning and signal processing including feature selection, phase retrieval, and target localization. Since for quadratic measurement models the moment matrix of the optimal estimator is generally unknown, majority of prior work resorts to approximation techniques such as linearization of the observation model to optimize the alphabetical optimality criteria of an approximate moment matrix. Conversely, by exploiting a connection to the classical Van Trees\u2019 inequality, we derive new alphabetical optimality criteria without distorting the relational structure of the observation model. We further show that under certain conditions on parameters of the problem these optimality criteria are monotone and (weak) submodular set functions. 
These results enable us to develop an efficient greedy observation selection algorithm uniquely tailored for quadratic models, and provide theoretical bounds on its achievable utility.", "bibtex": "@InProceedings{pmlr-v97-hashemi19a,\n title = \t {Submodular Observation Selection and Information Gathering for Quadratic Models},\n author = {Hashemi, Abolfazl and Ghasemi, Mahsa and Vikalo, Haris and Topcu, Ufuk},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2653--2662},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hashemi19a/hashemi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hashemi19a.html},\n abstract = \t {We study the problem of selecting most informative subset of a large observation set to enable accurate estimation of unknown parameters. This problem arises in a variety of settings in machine learning and signal processing including feature selection, phase retrieval, and target localization. Since for quadratic measurement models the moment matrix of the optimal estimator is generally unknown, majority of prior work resorts to approximation techniques such as linearization of the observation model to optimize the alphabetical optimality criteria of an approximate moment matrix. Conversely, by exploiting a connection to the classical Van Trees\u2019 inequality, we derive new alphabetical optimality criteria without distorting the relational structure of the observation model. We further show that under certain conditions on parameters of the problem these optimality criteria are monotone and (weak) submodular set functions. These results enable us to develop an efficient greedy observation selection algorithm uniquely tailored for quadratic models, and provide theoretical bounds on its achievable utility.}\n}", "pdf": "http://proceedings.mlr.press/v97/hashemi19a/hashemi19a.pdf", "supp": "", "pdf_size": 758930, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4620874328769572568&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "University of Texas at Austin; University of Texas at Austin; University of Texas at Austin; University of Texas at Austin", "aff_domain": "utexas.edu;utexas.edu; ; ", "email": "utexas.edu;utexas.edu; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/hashemi19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Submodular Streaming in All Its Glory: Tight Approximation, Minimum Memory and Low Adaptive Complexity", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3925", "id": "3925", "author_site": "Ehsan Kazemi, Marko Mitrovic, Morteza Zadimoghaddam, Silvio Lattanzi, Amin Karbasi", "author": "Ehsan Kazemi; Marko Mitrovic; Morteza Zadimoghaddam; Silvio Lattanzi; Amin Karbasi", "abstract": "Streaming algorithms are generally judged by the quality of their solution, memory footprint, and computational complexity. 
In this paper, we study the problem of maximizing a monotone submodular function in the streaming setting with a cardinality constraint $k$. We first propose SIEVE-STREAMING++, which requires just one pass over the data, keeps only $O(k)$ elements and achieves the tight $\\frac{1}{2}$-approximation guarantee. The best previously known streaming algorithms either achieve a suboptimal $\\frac{1}{4}$-approximation with $\\Theta(k)$ memory or the optimal $\\frac{1}{2}$-approximation with $O(k\\log k)$ memory. Next, we show that by buffering a small fraction of the stream and applying a careful filtering procedure, one can heavily reduce the number of adaptive computational rounds, thus substantially lowering the computational complexity of SIEVE-STREAMING++. We then generalize our results to the more challenging multi-source streaming setting. We show how one can achieve the tight $\\frac{1}{2}$-approximation guarantee with $O(k)$ shared memory, while minimizing not only the rounds of computations but also the total number of communicated bits. Finally, we demonstrate the efficiency of our algorithms on real-world data summarization tasks for multi-source streams of tweets and of YouTube videos.", "bibtex": "@InProceedings{pmlr-v97-kazemi19a,\n title = \t {Submodular Streaming in All Its Glory: Tight Approximation, Minimum Memory and Low Adaptive Complexity},\n author = {Kazemi, Ehsan and Mitrovic, Marko and Zadimoghaddam, Morteza and Lattanzi, Silvio and Karbasi, Amin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3311--3320},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kazemi19a/kazemi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kazemi19a.html},\n abstract = \t {Streaming algorithms are generally judged by the quality of their solution, memory footprint, and computational complexity. In this paper, we study the problem of maximizing a monotone submodular function in the streaming setting with a cardinality constraint $k$. We first propose SIEVE-STREAMING++, which requires just one pass over the data, keeps only $O(k)$ elements and achieves the tight $\\frac{1}{2}$-approximation guarantee. The best previously known streaming algorithms either achieve a suboptimal $\\frac{1}{4}$-approximation with $\\Theta(k)$ memory or the optimal $\\frac{1}{2}$-approximation with $O(k\\log k)$ memory. Next, we show that by buffering a small fraction of the stream and applying a careful filtering procedure, one can heavily reduce the number of adaptive computational rounds, thus substantially lowering the computational complexity of SIEVE-STREAMING++. We then generalize our results to the more challenging multi-source streaming setting. We show how one can achieve the tight $\\frac{1}{2}$-approximation guarantee with $O(k)$ shared memory, while minimizing not only the rounds of computations but also the total number of communicated bits. 
Finally, we demonstrate the efficiency of our algorithms on real-world data summarization tasks for multi-source streams of tweets and of YouTube videos.}\n}", "pdf": "http://proceedings.mlr.press/v97/kazemi19a/kazemi19a.pdf", "supp": "", "pdf_size": 2089729, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13074444339016629116&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Yale University; Yale University; Google Research; Google Research; Yale University", "aff_domain": "yale.edu; ; ; ;yale.edu", "email": "yale.edu; ; ; ;yale.edu", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/kazemi19a.html", "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Yale University;Google", "aff_unique_dep": ";Google Research", "aff_unique_url": "https://www.yale.edu;https://research.google", "aff_unique_abbr": "Yale;Google Research", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Subspace Robust Wasserstein Distances", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4320", "id": "4320", "author_site": "Fran\u00e7ois-Pierre Paty, Marco Cuturi", "author": "Fran\u00e7ois-Pierre Paty; Marco Cuturi", "abstract": "Making sense of Wasserstein distances between discrete measures in high-dimensional settings remains a challenge. Recent work has advocated a two-step approach to improve robustness and facilitate the computation of optimal transport, using for instance projections on random real lines, or a preliminary quantization of the measures to reduce the size of their support. We propose in this work a \u201cmax-min\u201d robust variant of the Wasserstein distance by considering the maximal possible distance that can be realized between two measures, assuming they can be projected orthogonally on a lower k-dimensional subspace. Alternatively, we show that the corresponding \u201cmin-max\u201d OT problem has a tight convex relaxation which can be cast as that of finding an optimal transport plan with a low transportation cost, where the cost is alternatively defined as the sum of the k largest eigenvalues of the second order moment matrix of the displacements (or matchings) corresponding to that plan (the usual OT definition only considers the trace of that matrix). We show that both quantities inherit several favorable properties from the OT geometry. We propose two algorithms to compute the latter formulation using entropic regularization, and illustrate the interest of this approach empirically.", "bibtex": "@InProceedings{pmlr-v97-paty19a,\n title = \t {Subspace Robust {W}asserstein Distances},\n author = {Paty, Fran{\\c{c}}ois-Pierre and Cuturi, Marco},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5072--5081},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/paty19a/paty19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/paty19a.html},\n abstract = \t {Making sense of Wasserstein distances between discrete measures in high-dimensional settings remains a challenge. 
Recent work has advocated a two-step approach to improve robustness and facilitate the computation of optimal transport, using for instance projections on random real lines, or a preliminary quantization of the measures to reduce the size of their support. We propose in this work a \u201cmax-min\u201d robust variant of the Wasserstein distance by considering the maximal possible distance that can be realized between two measures, assuming they can be projected orthogonally on a lower k-dimensional subspace. Alternatively, we show that the corresponding \u201cmin-max\u201d OT problem has a tight convex relaxation which can be cast as that of finding an optimal transport plan with a low transportation cost, where the cost is alternatively defined as the sum of the k largest eigenvalues of the second order moment matrix of the displacements (or matchings) corresponding to that plan (the usual OT definition only considers the trace of that matrix). We show that both quantities inherit several favorable properties from the OT geometry. We propose two algorithms to compute the latter formulation using entropic regularization, and illustrate the interest of this approach empirically.}\n}", "pdf": "http://proceedings.mlr.press/v97/paty19a/paty19a.pdf", "supp": "", "pdf_size": 1587237, "gs_citation": 185, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8524549909755400152&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "CREST-ENSAE, Palaiseau, France; Google Brain, Paris, France", "aff_domain": "ensae.fr; ", "email": "ensae.fr; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/paty19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "CREST-ENSAE;Google", "aff_unique_dep": ";Google Brain", "aff_unique_url": ";https://brain.google.com", "aff_unique_abbr": ";Google Brain", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Palaiseau;Paris", "aff_country_unique_index": "0;0", "aff_country_unique": "France" }, { "title": "Sum-of-Squares Polynomial Flow", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3638", "id": "3638", "author_site": "Priyank Jaini, Kira A. Selby, Yaoliang Yu", "author": "Priyank Jaini; Kira A. Selby; Yaoliang Yu", "abstract": "Triangular map is a recent construct in probability theory that allows one to transform any source probability density function to any target density function. Based on triangular maps, we propose a general framework for high-dimensional density estimation, by specifying one-dimensional transformations (equivalently conditional densities) and appropriate conditioner networks. This framework (a) reveals the commonalities and differences of existing autoregressive and flow based methods, (b) allows a unified understanding of the limitations and representation power of these recent approaches and, (c) motivates us to uncover a new Sum-of-Squares (SOS) flow that is interpretable, universal, and easy to train. We perform several synthetic experiments on various density geometries to demonstrate the benefits (and short-comings) of such transformations. SOS flows achieve competitive results in simulations and several real-world datasets.", "bibtex": "@InProceedings{pmlr-v97-jaini19a,\n title = \t {Sum-of-Squares Polynomial Flow},\n author = {Jaini, Priyank and Selby, Kira A. 
and Yu, Yaoliang},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3009--3018},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jaini19a/jaini19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/jaini19a.html},\n abstract = \t {Triangular map is a recent construct in probability theory that allows one to transform any source probability density function to any target density function. Based on triangular maps, we propose a general framework for high-dimensional density estimation, by specifying one-dimensional transformations (equivalently conditional densities) and appropriate conditioner networks. This framework (a) reveals the commonalities and differences of existing autoregressive and flow based methods, (b) allows a unified understanding of the limitations and representation power of these recent approaches and, (c) motivates us to uncover a new Sum-of-Squares (SOS) flow that is interpretable, universal, and easy to train. We perform several synthetic experiments on various density geometries to demonstrate the benefits (and short-comings) of such transformations. SOS flows achieve competitive results in simulations and several real-world datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/jaini19a/jaini19a.pdf", "supp": "", "pdf_size": 513676, "gs_citation": 167, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10951504254019207523&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";;", "aff_domain": ";;", "email": ";;", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/jaini19a.html" }, { "title": "Supervised Hierarchical Clustering with Exponential Linkage", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4257", "id": "4257", "author_site": "Nishant Yadav, Ari Kobren, Nicholas Monath, Andrew McCallum", "author": "Nishant Yadav; Ari Kobren; Nicholas Monath; Andrew Mccallum", "abstract": "In supervised clustering, standard techniques for learning a pairwise dissimilarity function often suffer from a discrepancy between the training and clustering objectives, leading to poor cluster quality. Rectifying this discrepancy necessitates matching the procedure for training the dissimilarity function to the clustering algorithm. In this paper, we introduce a method for training the dissimilarity function in a way that is tightly coupled with hierarchical clustering, in particular single linkage. However, the appropriate clustering algorithm for a given dataset is often unknown. Thus we introduce an approach to supervised hierarchical clustering that smoothly interpolates between single, average, and complete linkage, and we give a training procedure that simultaneously learns a linkage function and a dissimilarity function. We accomplish this with a novel Exponential Linkage function that has a learnable parameter that controls the interpolation. 
In experiments on four datasets, our joint training procedure consistently matches or outperforms the next best training procedure/linkage function pair and gives up to 8 points improvement in dendrogram purity over discrepant pairs.", "bibtex": "@InProceedings{pmlr-v97-yadav19a,\n title = \t {Supervised Hierarchical Clustering with Exponential Linkage},\n author = {Yadav, Nishant and Kobren, Ari and Monath, Nicholas and Mccallum, Andrew},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6973--6983},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yadav19a/yadav19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yadav19a.html},\n abstract = \t {In supervised clustering, standard techniques for learning a pairwise dissimilarity function often suffer from a discrepancy between the training and clustering objectives, leading to poor cluster quality. Rectifying this discrepancy necessitates matching the procedure for training the dissimilarity function to the clustering algorithm. In this paper, we introduce a method for training the dissimilarity function in a way that is tightly coupled with hierarchical clustering, in particular single linkage. However, the appropriate clustering algorithm for a given dataset is often unknown. Thus we introduce an approach to supervised hierarchical clustering that smoothly interpolates between single, average, and complete linkage, and we give a training procedure that simultaneously learns a linkage function and a dissimilarity function. We accomplish this with a novel Exponential Linkage function that has a learnable parameter that controls the interpolation. 
In experiments on four datasets, our joint training procedure consistently matches or outperforms the next best training procedure/linkage function pair and gives up to 8 points improvement in dendrogram purity over discrepant pairs.}\n}", "pdf": "http://proceedings.mlr.press/v97/yadav19a/yadav19a.pdf", "supp": "", "pdf_size": 409476, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14591272843062718088&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "College of Information and Computer Sciences, University of Massachusetts Amherst, USA; College of Information and Computer Sciences, University of Massachusetts Amherst, USA; College of Information and Computer Sciences, University of Massachusetts Amherst, USA; College of Information and Computer Sciences, University of Massachusetts Amherst, USA", "aff_domain": "cs.umass.edu;cs.umass.edu;cs.umass.edu;cs.umass.edu", "email": "cs.umass.edu;cs.umass.edu;cs.umass.edu;cs.umass.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/yadav19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Massachusetts Amherst", "aff_unique_dep": "College of Information and Computer Sciences", "aff_unique_url": "https://www.umass.edu", "aff_unique_abbr": "UMass Amherst", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Amherst", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Surrogate Losses for Online Learning of Stepsizes in Stochastic Non-Convex Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3935", "id": "3935", "author_site": "zhenxun zhuang, Ashok Cutkosky, Francesco Orabona", "author": "Zhenxun Zhuang; Ashok Cutkosky; Francesco Orabona", "abstract": "Stochastic Gradient Descent (SGD) has played a central role in machine learning. However, it requires a carefully hand-picked stepsize for fast convergence, which is notoriously tedious and time-consuming to tune. Over the last several years, a plethora of adaptive gradient-based algorithms have emerged to ameliorate this problem. In this paper, we propose new surrogate losses to cast the problem of learning the optimal stepsizes for the stochastic optimization of a non-convex smooth objective function onto an online convex optimization problem. This allows the use of no-regret online algorithms to compute optimal stepsizes on the fly. In turn, this results in a SGD algorithm with self-tuned stepsizes that guarantees convergence rates that are automatically adaptive to the level of noise.", "bibtex": "@InProceedings{pmlr-v97-zhuang19a,\n title = \t {Surrogate Losses for Online Learning of Stepsizes in Stochastic Non-Convex Optimization},\n author = {Zhuang, Zhenxun and Cutkosky, Ashok and Orabona, Francesco},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7664--7672},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhuang19a/zhuang19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhuang19a.html},\n abstract = \t {Stochastic Gradient Descent (SGD) has played a central role in machine learning. However, it requires a carefully hand-picked stepsize for fast convergence, which is notoriously tedious and time-consuming to tune. 
Over the last several years, a plethora of adaptive gradient-based algorithms have emerged to ameliorate this problem. In this paper, we propose new surrogate losses to cast the problem of learning the optimal stepsizes for the stochastic optimization of a non-convex smooth objective function onto an online convex optimization problem. This allows the use of no-regret online algorithms to compute optimal stepsizes on the fly. In turn, this results in a SGD algorithm with self-tuned stepsizes that guarantees convergence rates that are automatically adaptive to the level of noise.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhuang19a/zhuang19a.pdf", "supp": "", "pdf_size": 380178, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15826908669665145589&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computer Science, Boston University, Boston, MA, US; Google, Mountain View, CA, US; Department of Electrical & Computer Engineering, Boston University, Boston, MA, US", "aff_domain": "bu.edu; ; ", "email": "bu.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zhuang19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Boston University;Google", "aff_unique_dep": "Department of Computer Science;Google", "aff_unique_url": "https://www.bu.edu;https://www.google.com", "aff_unique_abbr": "BU;Google", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Boston;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Switching Linear Dynamics for Variational Bayes Filtering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3860", "id": "3860", "author_site": "Philip Becker-Ehmck, Jan Peters, Patrick van der Smagt", "author": "Philip Becker-Ehmck; Jan Peters; Patrick Van Der Smagt", "abstract": "System identification of complex and nonlinear systems is a central problem for model predictive control and model-based reinforcement learning. Despite their complexity, such systems can often be approximated well by a set of linear dynamical systems if broken into appropriate subsequences. This mechanism not only helps us find good approximations of dynamics, but also gives us deeper insight into the underlying system. Leveraging Bayesian inference, Variational Autoencoders and Concrete relaxations, we show how to learn a richer and more meaningful state space, e.g. encoding joint constraints and collisions with walls in a maze, from partial and high-dimensional observations. This representation translates into a gain of accuracy of learned dynamics showcased on various simulated tasks.", "bibtex": "@InProceedings{pmlr-v97-becker-ehmck19a,\n title = \t {Switching Linear Dynamics for Variational {B}ayes Filtering},\n author = {Becker-Ehmck, Philip and Peters, Jan and Van Der Smagt, Patrick},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {553--562},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/becker-ehmck19a/becker-ehmck19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/becker-ehmck19a.html},\n abstract = \t {System identification of complex and nonlinear systems is a central problem for model predictive control and model-based reinforcement learning. 
Despite their complexity, such systems can often be approximated well by a set of linear dynamical systems if broken into appropriate subsequences. This mechanism not only helps us find good approximations of dynamics, but also gives us deeper insight into the underlying system. Leveraging Bayesian inference, Variational Autoencoders and Concrete relaxations, we show how to learn a richer and more meaningful state space, e.g. encoding joint constraints and collisions with walls in a maze, from partial and high-dimensional observations. This representation translates into a gain of accuracy of learned dynamics showcased on various simulated tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/becker-ehmck19a/becker-ehmck19a.pdf", "supp": "", "pdf_size": 972593, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8387734584820293660&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Machine Learning Research Lab, Volkswagen Group, Munich, Germany+Department of Computer Science, Technische Universit\u00e4t Darmstadt, Darmstadt, Germany; Department of Computer Science, Technische Universit\u00e4t Darmstadt, Darmstadt, Germany; Machine Learning Research Lab, Volkswagen Group, Munich, Germany", "aff_domain": "volkswagen.de; ; ", "email": "volkswagen.de; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/becker-ehmck19a.html", "aff_unique_index": "0+1;1;0", "aff_unique_norm": "Volkswagen Group;Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "Machine Learning Research Lab;Department of Computer Science", "aff_unique_url": "https://www.volkswagenag.com;https://www.tu-darmstadt.de", "aff_unique_abbr": "VW Group;TUD", "aff_campus_unique_index": "0+1;1;0", "aff_campus_unique": "Munich;Darmstadt", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "Germany" }, { "title": "Taming MAML: Efficient unbiased meta-reinforcement learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4182", "id": "4182", "author_site": "Hao Liu, Richard Socher, Caiming Xiong", "author": "Hao Liu; Richard Socher; Caiming Xiong", "abstract": "While meta reinforcement learning (Meta-RL) methods have achieved remarkable success, obtaining correct and low variance estimates for policy gradients remains a significant challenge. In particular, estimating a large Hessian, poor sample efficiency and unstable training continue to make Meta-RL difficult. We propose a surrogate objective function named, Taming MAML (TMAML), that adds control variates into gradient estimation via automatic differentiation. TMAML improves the quality of gradient estimation by reducing variance without introducing bias. We further propose a version of our method that extends the meta-learning framework to learning the control variates themselves, enabling efficient and scalable learning from a distribution of MDPs. We empirically compare our approach with MAML and other variance-bias trade-off methods including DICE, LVC, and action-dependent control variates. 
Our approach is easy to implement and outperforms existing methods in terms of the variance and accuracy of gradient estimation, ultimately yielding higher performance across a variety of challenging Meta-RL environments.", "bibtex": "@InProceedings{pmlr-v97-liu19g,\n title = \t {Taming {MAML}: Efficient unbiased meta-reinforcement learning},\n author = {Liu, Hao and Socher, Richard and Xiong, Caiming},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4061--4071},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liu19g/liu19g.pdf},\n url = \t {https://proceedings.mlr.press/v97/liu19g.html},\n abstract = \t {While meta reinforcement learning (Meta-RL) methods have achieved remarkable success, obtaining correct and low variance estimates for policy gradients remains a significant challenge. In particular, estimating a large Hessian, poor sample efficiency and unstable training continue to make Meta-RL difficult. We propose a surrogate objective function named, Taming MAML (TMAML), that adds control variates into gradient estimation via automatic differentiation. TMAML improves the quality of gradient estimation by reducing variance without introducing bias. We further propose a version of our method that extends the meta-learning framework to learning the control variates themselves, enabling efficient and scalable learning from a distribution of MDPs. We empirically compare our approach with MAML and other variance-bias trade-off methods including DICE, LVC, and action-dependent control variates. Our approach is easy to implement and outperforms existing methods in terms of the variance and accuracy of gradient estimation, ultimately yielding higher performance across a variety of challenging Meta-RL environments.}\n}", "pdf": "http://proceedings.mlr.press/v97/liu19g/liu19g.pdf", "supp": "", "pdf_size": 2686812, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13296853276515035880&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff": "Salesforce Research, Palo Alto, USA; Salesforce Research, Palo Alto, USA; Salesforce Research, Palo Alto, USA", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/liu19g.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Salesforce Research", "aff_unique_dep": "Research", "aff_unique_url": "https://research.salesforce.com", "aff_unique_abbr": "Salesforce", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Palo Alto", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "TapNet: Neural Network Augmented with Task-Adaptive Projection for Few-Shot Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4024", "id": "4024", "author_site": "Sung Whan Yoon, Jun Seo, Jaekyun Moon", "author": "Sung Whan Yoon; Jun Seo; Jaekyun Moon", "abstract": "Handling previously unseen tasks after given only a few training examples continues to be a tough challenge in machine learning. We propose TapNets, neural networks augmented with task-adaptive projection for improved few-shot learning. 
Here, employing a meta-learning strategy with episode-based training, a network and a set of per-class reference vectors are learned across widely varying tasks. At the same time, for every episode, features in the embedding space are linearly projected into a new space as a form of quick task-specific conditioning. The training loss is obtained based on a distance metric between the query and the reference vectors in the projection space. Excellent generalization results in this way. When tested on the Omniglot, miniImageNet and tieredImageNet datasets, we obtain state of the art classification accuracies under various few-shot scenarios.", "bibtex": "@InProceedings{pmlr-v97-yoon19a,\n title = \t {{T}ap{N}et: Neural Network Augmented with Task-Adaptive Projection for Few-Shot Learning},\n author = {Yoon, Sung Whan and Seo, Jun and Moon, Jaekyun},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7115--7123},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yoon19a/yoon19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yoon19a.html},\n abstract = \t {Handling previously unseen tasks after given only a few training examples continues to be a tough challenge in machine learning. We propose TapNets, neural networks augmented with task-adaptive projection for improved few-shot learning. Here, employing a meta-learning strategy with episode-based training, a network and a set of per-class reference vectors are learned across widely varying tasks. At the same time, for every episode, features in the embedding space are linearly projected into a new space as a form of quick task-specific conditioning. The training loss is obtained based on a distance metric between the query and the reference vectors in the projection space. Excellent generalization results in this way. 
When tested on the Omniglot, miniImageNet and tieredImageNet datasets, we obtain state of the art classification accuracies under various few-shot scenarios.}\n}", "pdf": "http://proceedings.mlr.press/v97/yoon19a/yoon19a.pdf", "supp": "", "pdf_size": 645711, "gs_citation": 331, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12575801957058912486&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Korea; School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Korea; School of Electrical Engineering, Korea Advanced Institute of Science and Technology (KAIST), Daejeon, Korea", "aff_domain": "kaist.ac.kr; ; ", "email": "kaist.ac.kr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/yoon19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "School of Electrical Engineering", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Daejeon", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "TarMAC: Targeted Multi-Agent Communication", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3610", "id": "3610", "author_site": "Abhishek Das, Theophile Gervet, Joshua Romoff, Dhruv Batra, Devi Parikh, Michael Rabbat, Joelle Pineau", "author": "Abhishek Das; Th\u00e9ophile Gervet; Joshua Romoff; Dhruv Batra; Devi Parikh; Mike Rabbat; Joelle Pineau", "abstract": "We propose a targeted communication architecture for multi-agent reinforcement learning, where agents learn both", "bibtex": "@InProceedings{pmlr-v97-das19a,\n title = \t {{T}ar{MAC}: Targeted Multi-Agent Communication},\n author = {Das, Abhishek and Gervet, Th{\\'e}ophile and Romoff, Joshua and Batra, Dhruv and Parikh, Devi and Rabbat, Mike and Pineau, Joelle},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1538--1546},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/das19a/das19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/das19a.html},\n abstract = \t {We propose a targeted communication architecture for multi-agent reinforcement learning, where agents learn both", "pdf": "http://proceedings.mlr.press/v97/das19a/das19a.pdf", "supp": "", "pdf_size": 3322465, "gs_citation": 536, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12185105573950195413&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff": "Georgia Tech; McGill University; Facebook AI Research; Georgia Tech; Facebook AI Research; McGill University; Facebook AI Research", "aff_domain": "gatech.edu; ; ; ; ; ; ", "email": "gatech.edu; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/das19a.html", "aff_unique_index": "0;1;2;0;2;1;2", "aff_unique_norm": "Georgia Institute of Technology;McGill University;Meta", "aff_unique_dep": ";;Facebook AI Research", "aff_unique_url": "https://www.gatech.edu;https://www.mcgill.ca;https://research.facebook.com", "aff_unique_abbr": "Georgia Tech;McGill;FAIR", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;1;0", "aff_country_unique": "United States;Canada" }, { "title": "Target Tracking for Contextual Bandits: Application to Demand Side Management", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4050", "id": "4050", "author_site": "Margaux Br\u00e9g\u00e8re, Pierre Gaillard, Yannig Goude, Gilles Stoltz", "author": "Margaux Br\u00e9g\u00e8re; Pierre Gaillard; Yannig Goude; Gilles Stoltz", "abstract": "We propose a contextual-bandit approach for demand side management by offering price incentives. More precisely, a target mean consumption is set at each round and the mean consumption is modeled as a complex function of the distribution of prices sent and of some contextual variables such as the temperature, weather, and so on. The performance of our strategies is measured in quadratic losses through a regret criterion. We offer $T^{2/3}$ upper bounds on this regret (up to poly-logarithmic terms)\u2014and even faster rates under stronger assumptions\u2014for strategies inspired by standard strategies for contextual bandits (like LinUCB, see Li et al., 2010). Simulations on a real data set gathered by UK Power Networks, in which price incentives were offered, show that our strategies are effective and may indeed manage demand response by suitably picking the price levels.", "bibtex": "@InProceedings{pmlr-v97-bregere19a,\n title = \t {Target Tracking for Contextual Bandits: Application to Demand Side Management},\n author = {Br{\\'e}g{\\`e}re, Margaux and Gaillard, Pierre and Goude, Yannig and Stoltz, Gilles},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {754--763},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bregere19a/bregere19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bregere19a.html},\n abstract = \t {We propose a contextual-bandit approach for demand side management by offering price incentives. More precisely, a target mean consumption is set at each round and the mean consumption is modeled as a complex function of the distribution of prices sent and of some contextual variables such as the temperature, weather, and so on. The performance of our strategies is measured in quadratic losses through a regret criterion. We offer $T^{2/3}$ upper bounds on this regret (up to poly-logarithmic terms)\u2014and even faster rates under stronger assumptions\u2014for strategies inspired by standard strategies for contextual bandits (like LinUCB, see Li et al., 2010). 
Simulations on a real data set gathered by UK Power Networks, in which price incentives were offered, show that our strategies are effective and may indeed manage demand response by suitably picking the price levels.}\n}", "pdf": "http://proceedings.mlr.press/v97/bregere19a/bregere19a.pdf", "supp": "", "pdf_size": 4138032, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=329211335115743460&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "EDF R&D, Palaiseau, France+Laboratoire de math\u00e9matiques d\u2019Orsay, Universit\u00e9 Paris-Sud, CNRS, Universit\u00e9 Paris-Saclay, Orsay, France; INRIA - D\u00e9partement d\u2019Informatique de l\u2019\u00c9cole Normale Sup\u00e9rieure, PSL Research University, Paris, France; EDF R&D, Palaiseau, France+Laboratoire de math\u00e9matiques d\u2019Orsay, Universit\u00e9 Paris-Sud, CNRS, Universit\u00e9 Paris-Saclay, Orsay, France; Laboratoire de math\u00e9matiques d\u2019Orsay, Universit\u00e9 Paris-Sud, CNRS, Universit\u00e9 Paris-Saclay, Orsay, France", "aff_domain": "edf.fr; ; ; ", "email": "edf.fr; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/bregere19a.html", "aff_unique_index": "0+1;2;0+1;1", "aff_unique_norm": "EDF R&D;Universit\u00e9 Paris-Sud;INRIA", "aff_unique_dep": ";Laboratoire de math\u00e9matiques d\u2019Orsay;D\u00e9partement d\u2019Informatique de l\u2019\u00c9cole Normale Sup\u00e9rieure", "aff_unique_url": "https://www.edf.com;https://www.universite-paris-sud.fr;https://www.inria.fr", "aff_unique_abbr": "EDF;UPS;INRIA", "aff_campus_unique_index": "0+1;2;0+1;1", "aff_campus_unique": "Palaiseau;Orsay;Paris", "aff_country_unique_index": "0+0;0;0+0;0", "aff_country_unique": "France" }, { "title": "Target-Based Temporal-Difference Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4092", "id": "4092", "author_site": "Donghwan Lee, Niao He", "author": "Donghwan Lee; Niao He", "abstract": "The use of target networks has been a popular and key component of recent deep Q-learning algorithms for reinforcement learning, yet little is known from the theory side. In this work, we introduce a new family of target-based temporal difference (TD) learning algorithms that maintain two separate learning parameters {\u2013} the target variable and online variable. We propose three members in the family, the averaging TD, double TD, and periodic TD, where the target variable is updated through an averaging, symmetric, or periodic fashion, respectively, mirroring those techniques used in deep Q-learning practice. We establish asymptotic convergence analyses for both averaging TD and double TD and a finite sample analysis for periodic TD. In addition, we provide some simulation results showing potentially superior convergence of these target-based TD algorithms compared to the standard TD-learning. 
While this work focuses on linear function approximation and policy evaluation setting, we consider this as a meaningful step towards the theoretical understanding of deep Q-learning variants with target networks.", "bibtex": "@InProceedings{pmlr-v97-lee19a,\n title = \t {Target-Based Temporal-Difference Learning},\n author = {Lee, Donghwan and He, Niao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3713--3722},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/lee19a/lee19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/lee19a.html},\n abstract = \t {The use of target networks has been a popular and key component of recent deep Q-learning algorithms for reinforcement learning, yet little is known from the theory side. In this work, we introduce a new family of target-based temporal difference (TD) learning algorithms that maintain two separate learning parameters {\u2013} the target variable and online variable. We propose three members in the family, the averaging TD, double TD, and periodic TD, where the target variable is updated through an averaging, symmetric, or periodic fashion, respectively, mirroring those techniques used in deep Q-learning practice. We establish asymptotic convergence analyses for both averaging TD and double TD and a finite sample analysis for periodic TD. In addition, we provide some simulation results showing potentially superior convergence of these target-based TD algorithms compared to the standard TD-learning. While this work focuses on linear function approximation and policy evaluation setting, we consider this as a meaningful step towards the theoretical understanding of deep Q-learning variants with target networks.}\n}", "pdf": "http://proceedings.mlr.press/v97/lee19a/lee19a.pdf", "supp": "", "pdf_size": 385271, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7698668899801526138&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Coordinated Science Laboratory, University of Illinois at Urbana-Champaign, USA+Department of Industrial and Enterprise Systems Engineering, University of Illinois at Urbana-Champaign, USA; Department of Industrial and Enterprise Systems Engineering, University of Illinois at Urbana-Champaign, USA", "aff_domain": "illinois.edu;illinois.edu", "email": "illinois.edu;illinois.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/lee19a.html", "aff_unique_index": "0+0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "Coordinated Science Laboratory", "aff_unique_url": "https://www illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0+0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United States" }, { "title": "Task-Agnostic Dynamics Priors for Deep Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4136", "id": "4136", "author_site": "Yilun Du, Karthik Narasimhan", "author": "Yilun Du; Karthic Narasimhan", "abstract": "While model-based deep reinforcement learning (RL) holds great promise for sample efficiency and generalization, learning an accurate dynamics model is often challenging and requires substantial interaction 
with the environment. A wide variety of domains have dynamics that share common foundations like the laws of classical mechanics, which are rarely exploited by existing algorithms. In fact, humans continuously acquire and use such dynamics priors to easily adapt to operating in new environments. In this work, we propose an approach to learn task-agnostic dynamics priors from videos and incorporate them into an RL agent. Our method involves pre-training a frame predictor on task-agnostic physics videos to initialize dynamics models (and fine-tune them) for unseen target environments. Our frame prediction architecture, SpatialNet, is designed specifically to capture localized physical phenomena and interactions. Our approach allows for both faster policy learning and convergence to better policies, outperforming competitive approaches on several different environments. We also demonstrate that incorporating this prior allows for more effective transfer between environments.", "bibtex": "@InProceedings{pmlr-v97-du19e,\n title = \t {Task-Agnostic Dynamics Priors for Deep Reinforcement Learning},\n author = {Du, Yilun and Narasimhan, Karthic},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1696--1705},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/du19e/du19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/du19e.html},\n abstract = \t {While model-based deep reinforcement learning (RL) holds great promise for sample efficiency and generalization, learning an accurate dynamics model is often challenging and requires substantial interaction with the environment. A wide variety of domains have dynamics that share common foundations like the laws of classical mechanics, which are rarely exploited by existing algorithms. In fact, humans continuously acquire and use such dynamics priors to easily adapt to operating in new environments. In this work, we propose an approach to learn task-agnostic dynamics priors from videos and incorporate them into an RL agent. Our method involves pre-training a frame predictor on task-agnostic physics videos to initialize dynamics models (and fine-tune them) for unseen target environments. Our frame prediction architecture, SpatialNet, is designed specifically to capture localized physical phenomena and interactions. Our approach allows for both faster policy learning and convergence to better policies, outperforming competitive approaches on several different environments. 
We also demonstrate that incorporating this prior allows for more effective transfer between environments.}\n}", "pdf": "http://proceedings.mlr.press/v97/du19e/du19e.pdf", "supp": "", "pdf_size": 775379, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2869858217562916387&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Massachusetts Institute of Technology (Work partially done at OpenAI); Princeton University", "aff_domain": "mit.com;cs.princeton.edu", "email": "mit.com;cs.princeton.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/du19e.html", "aff_unique_index": "0;1", "aff_unique_norm": "Massachusetts Institute of Technology;Princeton University", "aff_unique_dep": ";", "aff_unique_url": "https://www.mit.edu;https://www.princeton.edu", "aff_unique_abbr": "MIT;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Teaching a black-box learner", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3964", "id": "3964", "author_site": "Sanjoy Dasgupta, Daniel Hsu, Stefanos Poulis, Jerry Zhu", "author": "Sanjoy Dasgupta; Daniel Hsu; Stefanos Poulis; Xiaojin Zhu", "abstract": "One widely-studied model of", "bibtex": "@InProceedings{pmlr-v97-dasgupta19a,\n title = \t {Teaching a black-box learner},\n author = {Dasgupta, Sanjoy and Hsu, Daniel and Poulis, Stefanos and Zhu, Xiaojin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1547--1555},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/dasgupta19a/dasgupta19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/dasgupta19a.html},\n abstract = \t {One widely-studied model of", "pdf": "http://proceedings.mlr.press/v97/dasgupta19a/dasgupta19a.pdf", "supp": "", "pdf_size": 469423, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2232293952350454430&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14, "aff": "University of California, San Diego; Columbia University; NTENT; University of Wisconsin\u2013Madison", "aff_domain": "eng.ucsd.edu; ; ; ", "email": "eng.ucsd.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/dasgupta19a.html", "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of California, San Diego;Columbia University;NTENT;University of Wisconsin\u2013Madison", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.ucsd.edu;https://www.columbia.edu;;https://www.wisc.edu", "aff_unique_abbr": "UCSD;Columbia;;UW\u2013Madison", "aff_campus_unique_index": "0;2", "aff_campus_unique": "San Diego;;Madison", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "title": "Temporal Gaussian Mixture Layer for Videos", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3578", "id": "3578", "author_site": "AJ Piergiovanni, Michael Ryoo", "author": "Aj Piergiovanni; Michael Ryoo", "abstract": "We introduce a new convolutional layer named the Temporal Gaussian Mixture (TGM) layer and present how it can be used to efficiently capture longer-term temporal information in continuous activity videos. 
The TGM layer is a temporal convolutional layer governed by a much smaller set of parameters (e.g., location/variance of Gaussians) that are fully differentiable. We present our fully convolutional video models with multiple TGM layers for activity detection. The extensive experiments on multiple datasets, including Charades and MultiTHUMOS, confirm the effectiveness of TGM layers, significantly outperforming the state-of-the-arts.", "bibtex": "@InProceedings{pmlr-v97-piergiovanni19a,\n title = \t {Temporal {G}aussian Mixture Layer for Videos},\n author = {Piergiovanni, Aj and Ryoo, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5152--5161},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/piergiovanni19a/piergiovanni19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/piergiovanni19a.html},\n abstract = \t {We introduce a new convolutional layer named the Temporal Gaussian Mixture (TGM) layer and present how it can be used to efficiently capture longer-term temporal information in continuous activity videos. The TGM layer is a temporal convolutional layer governed by a much smaller set of parameters (e.g., location/variance of Gaussians) that are fully differentiable. We present our fully convolutional video models with multiple TGM layers for activity detection. The extensive experiments on multiple datasets, including Charades and MultiTHUMOS, confirm the effectiveness of TGM layers, significantly outperforming the state-of-the-arts.}\n}", "pdf": "http://proceedings.mlr.press/v97/piergiovanni19a/piergiovanni19a.pdf", "supp": "", "pdf_size": 684171, "gs_citation": 116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7515216755463628280&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, Indiana University; Department of Computer Science, Indiana University", "aff_domain": "indiana.edu;indiana.edu", "email": "indiana.edu;indiana.edu", "github": "https://github.com/piergiaj/tgm-icml19", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/piergiovanni19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Indiana University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.indiana.edu", "aff_unique_abbr": "IU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Tensor Variable Elimination for Plated Factor Graphs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4278", "id": "4278", "author_site": "Fritz Obermeyer, Elias Bingham, Martin Jankowiak, Neeraj Pradhan, Justin Chiu, Alexander Rush, Noah Goodman", "author": "Fritz Obermeyer; Eli Bingham; Martin Jankowiak; Neeraj Pradhan; Justin Chiu; Alexander Rush; Noah Goodman", "abstract": "A wide class of machine learning algorithms can be reduced to variable elimination on factor graphs. While factor graphs provide a unifying notation for these algorithms, they do not provide a compact way to express repeated structure when compared to plate diagrams for directed graphical models. 
To exploit efficient tensor algebra in graphs with plates of variables, we generalize undirected factor graphs to plated factor graphs and variable elimination to a tensor variable elimination algorithm that operates directly on plated factor graphs. Moreover, we generalize complexity bounds based on treewidth and characterize the class of plated factor graphs for which inference is tractable. As an application, we integrate tensor variable elimination into the Pyro probabilistic programming language to enable exact inference in discrete latent variable models with repeated structure. We validate our methods with experiments on both directed and undirected graphical models, including applications to polyphonic music modeling, animal movement modeling, and latent sentiment analysis.", "bibtex": "@InProceedings{pmlr-v97-obermeyer19a,\n title = \t {Tensor Variable Elimination for Plated Factor Graphs},\n author = {Obermeyer, Fritz and Bingham, Eli and Jankowiak, Martin and Pradhan, Neeraj and Chiu, Justin and Rush, Alexander and Goodman, Noah},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4871--4880},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/obermeyer19a/obermeyer19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/obermeyer19a.html},\n abstract = \t {A wide class of machine learning algorithms can be reduced to variable elimination on factor graphs. While factor graphs provide a unifying notation for these algorithms, they do not provide a compact way to express repeated structure when compared to plate diagrams for directed graphical models. To exploit efficient tensor algebra in graphs with plates of variables, we generalize undirected factor graphs to plated factor graphs and variable elimination to a tensor variable elimination algorithm that operates directly on plated factor graphs. Moreover, we generalize complexity bounds based on treewidth and characterize the class of plated factor graphs for which inference is tractable. As an application, we integrate tensor variable elimination into the Pyro probabilistic programming language to enable exact inference in discrete latent variable models with repeated structure. 
We validate our methods with experiments on both directed and undirected graphical models, including applications to polyphonic music modeling, animal movement modeling, and latent sentiment analysis.}\n}", "pdf": "http://proceedings.mlr.press/v97/obermeyer19a/obermeyer19a.pdf", "supp": "", "pdf_size": 464749, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14246613930429251611&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Uber AI Labs; Uber AI Labs; Uber AI Labs; Harvard University; Uber AI Labs; Harvard University; Stanford University", "aff_domain": "uber.com; ; ; ; ; ; ", "email": "uber.com; ; ; ; ; ; ", "github": "", "project": "", "author_num": 7, "oa": "https://proceedings.mlr.press/v97/obermeyer19a.html", "aff_unique_index": "0;0;0;1;0;1;2", "aff_unique_norm": "Uber;Harvard University;Stanford University", "aff_unique_dep": "Uber AI Labs;;", "aff_unique_url": "https://www.uber.com;https://www.harvard.edu;https://www.stanford.edu", "aff_unique_abbr": "Uber AI Labs;Harvard;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "TensorFuzz: Debugging Neural Networks with Coverage-Guided Fuzzing", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4139", "id": "4139", "author_site": "Augustus Odena, Catherine Olsson, David Andersen, Ian Goodfellow", "author": "Augustus Odena; Catherine Olsson; David Andersen; Ian Goodfellow", "abstract": "Neural networks are difficult to interpret and debug. We introduce testing techniques for neural networks that can discover errors occurring only for rare inputs. Specifically, we develop coverage-guided fuzzing (CGF) methods for neural networks. In CGF, random mutations of inputs are guided by a coverage metric toward the goal of satisfying user-specified constraints. We describe how approximate nearest neighbor (ANN) algorithms can provide this coverage metric for neural networks. We then combine these methods with techniques for property-based testing (PBT). In PBT, one asserts properties that a function should satisfy and the system automatically generates tests exercising those properties. We then apply this system to practical goals including (but not limited to) surfacing broken loss functions in popular GitHub repositories and making performance improvements to TensorFlow. Finally, we release an open source library called TensorFuzz that implements the described techniques.", "bibtex": "@InProceedings{pmlr-v97-odena19a,\n title = \t {{T}ensor{F}uzz: Debugging Neural Networks with Coverage-Guided Fuzzing},\n author = {Odena, Augustus and Olsson, Catherine and Andersen, David and Goodfellow, Ian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4901--4911},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/odena19a/odena19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/odena19a.html},\n abstract = \t {Neural networks are difficult to interpret and debug. We introduce testing techniques for neural networks that can discover errors occurring only for rare inputs. Specifically, we develop coverage-guided fuzzing (CGF) methods for neural networks. 
In CGF, random mutations of inputs are guided by a coverage metric toward the goal of satisfying user-specified constraints. We describe how approximate nearest neighbor (ANN) algorithms can provide this coverage metric for neural networks. We then combine these methods with techniques for property-based testing (PBT). In PBT, one asserts properties that a function should satisfy and the system automatically generates tests exercising those properties. We then apply this system to practical goals including (but not limited to) surfacing broken loss functions in popular GitHub repositories and making performance improvements to TensorFlow. Finally, we release an open source library called TensorFuzz that implements the described techniques.}\n}", "pdf": "http://proceedings.mlr.press/v97/odena19a/odena19a.pdf", "supp": "", "pdf_size": 1077027, "gs_citation": 413, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13306601112355106792&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Google Brain; Open Philanthropy Project (work done while at Google Brain); Google Brain; Work done while at Google Brain", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/odena19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Brain", "aff_unique_url": "https://brain.google.com", "aff_unique_abbr": "Google Brain", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "The Anisotropic Noise in Stochastic Gradient Descent: Its Behavior of Escaping from Sharp Minima and Regularization Effects", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3792", "id": "3792", "author_site": "Zhanxing Zhu, Jingfeng Wu, Bing Yu, Lei Wu, Jinwen Ma", "author": "Zhanxing Zhu; Jingfeng Wu; Bing Yu; Lei Wu; Jinwen Ma", "abstract": "Understanding the behavior of stochastic gradient descent (SGD) in the context of deep neural networks has raised lots of concerns recently. Along this line, we study a general form of gradient based optimization dynamics with unbiased noise, which unifies SGD and standard Langevin dynamics. Through investigating this general optimization dynamics, we analyze the behavior of SGD on escaping from minima and its regularization effects. A novel indicator is derived to characterize the efficiency of escaping from minima through measuring the alignment of noise covariance and the curvature of loss function. Based on this indicator, two conditions are established to show which type of noise structure is superior to isotropic noise in term of escaping efficiency. We further show that the anisotropic noise in SGD satisfies the two conditions, and thus helps to escape from sharp and poor minima effectively, towards more stable and flat minima that typically generalize well. We systematically design various experiments to verify the benefits of the anisotropic noise, compared with full gradient descent plus isotropic diffusion (i.e. 
Langevin dynamics).", "bibtex": "@InProceedings{pmlr-v97-zhu19e,\n title = \t {The Anisotropic Noise in Stochastic Gradient Descent: Its Behavior of Escaping from Sharp Minima and Regularization Effects},\n author = {Zhu, Zhanxing and Wu, Jingfeng and Yu, Bing and Wu, Lei and Ma, Jinwen},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7654--7663},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhu19e/zhu19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhu19e.html},\n abstract = \t {Understanding the behavior of stochastic gradient descent (SGD) in the context of deep neural networks has raised lots of concerns recently. Along this line, we study a general form of gradient based optimization dynamics with unbiased noise, which unifies SGD and standard Langevin dynamics. Through investigating this general optimization dynamics, we analyze the behavior of SGD on escaping from minima and its regularization effects. A novel indicator is derived to characterize the efficiency of escaping from minima through measuring the alignment of noise covariance and the curvature of loss function. Based on this indicator, two conditions are established to show which type of noise structure is superior to isotropic noise in term of escaping efficiency. We further show that the anisotropic noise in SGD satisfies the two conditions, and thus helps to escape from sharp and poor minima effectively, towards more stable and flat minima that typically generalize well. We systematically design various experiments to verify the benefits of the anisotropic noise, compared with full gradient descent plus isotropic diffusion (i.e. 
Langevin dynamics).}\n}", "pdf": "http://proceedings.mlr.press/v97/zhu19e/zhu19e.pdf", "supp": "", "pdf_size": 7044982, "gs_citation": 265, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8530319537943237114&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "School of Mathematical Sciences, Peking University, Beijing, China+Center for Data Science, Peking University, Beijing, China+Beijing Institute of Big Data Research, Beijing, China; School of Mathematical Sciences, Peking University, Beijing, China; School of Mathematical Sciences, Peking University, Beijing, China; School of Mathematical Sciences, Peking University, Beijing, China; School of Mathematical Sciences, Peking University, Beijing, China", "aff_domain": "pku.edu.cn;pku.edu.cn; ; ; ", "email": "pku.edu.cn;pku.edu.cn; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/zhu19e.html", "aff_unique_index": "0+0+1;0;0;0;0", "aff_unique_norm": "Peking University;Beijing Institute of Big Data Research", "aff_unique_dep": "School of Mathematical Sciences;", "aff_unique_url": "http://www.pku.edu.cn;", "aff_unique_abbr": "PKU;", "aff_campus_unique_index": "0+0+0;0;0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0+0+0;0;0;0;0", "aff_country_unique": "China" }, { "title": "The Effect of Network Width on Stochastic Gradient Descent and Generalization: an Empirical Study", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3994", "id": "3994", "author_site": "Daniel Park, Jascha Sohl-Dickstein, Quoc Le, Samuel Smith", "author": "Daniel Park; Jascha Sohl-Dickstein; Quoc Le; Samuel Smith", "abstract": "We investigate how the final parameters found by stochastic gradient descent are influenced by over-parameterization. We generate families of models by increasing the number of channels in a base network, and then perform a large hyper-parameter search to study how the test error depends on learning rate, batch size, and network width. We find that the optimal SGD hyper-parameters are determined by a \"normalized noise scale,\" which is a function of the batch size, learning rate, and initialization conditions. In the absence of batch normalization, the optimal normalized noise scale is directly proportional to width. Wider networks, with their higher optimal noise scale, also achieve higher test accuracy. These observations hold for MLPs, ConvNets, and ResNets, and for two different parameterization schemes (\"Standard\" and \"NTK\"). We observe a similar trend with batch normalization for ResNets. 
Surprisingly, since the largest stable learning rate is bounded, the largest batch size consistent with the optimal normalized noise scale decreases as the width increases.", "bibtex": "@InProceedings{pmlr-v97-park19b,\n title = \t {The Effect of Network Width on Stochastic Gradient Descent and Generalization: an Empirical Study},\n author = {Park, Daniel and Sohl-Dickstein, Jascha and Le, Quoc and Smith, Samuel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5042--5051},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/park19b/park19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/park19b.html},\n abstract = \t {We investigate how the final parameters found by stochastic gradient descent are influenced by over-parameterization. We generate families of models by increasing the number of channels in a base network, and then perform a large hyper-parameter search to study how the test error depends on learning rate, batch size, and network width. We find that the optimal SGD hyper-parameters are determined by a \"normalized noise scale,\" which is a function of the batch size, learning rate, and initialization conditions. In the absence of batch normalization, the optimal normalized noise scale is directly proportional to width. Wider networks, with their higher optimal noise scale, also achieve higher test accuracy. These observations hold for MLPs, ConvNets, and ResNets, and for two different parameterization schemes (\"Standard\" and \"NTK\"). We observe a similar trend with batch normalization for ResNets. Surprisingly, since the largest stable learning rate is bounded, the largest batch size consistent with the optimal normalized noise scale decreases as the width increases.}\n}", "pdf": "http://proceedings.mlr.press/v97/park19b/park19b.pdf", "supp": "", "pdf_size": 1182402, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6441269188348514639&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Google Brain, Mountain View, USA+Work done as a member of the Google AI Residency Program; Google Brain, Mountain View, USA; Google Brain, Mountain View, USA; DeepMind, London, UK", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/park19b.html", "aff_unique_index": "0+0;0;0;1", "aff_unique_norm": "Google;DeepMind", "aff_unique_dep": "Google Brain;", "aff_unique_url": "https://brain.google.com;https://deepmind.com", "aff_unique_abbr": "Google Brain;DeepMind", "aff_campus_unique_index": "0+0;0;0;1", "aff_campus_unique": "Mountain View;London", "aff_country_unique_index": "0+0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "title": "The Evolved Transformer", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4285", "id": "4285", "author_site": "David So, Quoc Le, Chen Liang", "author": "David So; Quoc Le; Chen Liang", "abstract": "Recent works have highlighted the strength of the Transformer architecture on sequence tasks while, at the same time, neural architecture search (NAS) has begun to outperform human-designed models. Our goal is to apply NAS to search for a better alternative to the Transformer. 
We first construct a large search space inspired by the recent advances in feed-forward sequence models and then run evolutionary architecture search with warm starting by seeding our initial population with the Transformer. To directly search on the computationally expensive WMT 2014 English-German translation task, we develop the Progressive Dynamic Hurdles method, which allows us to dynamically allocate more resources to more promising candidate models. The architecture found in our experiments \u2013 the Evolved Transformer \u2013 demonstrates consistent improvement over the Transformer on four well-established language tasks: WMT 2014 English-German, WMT 2014 English-French, WMT 2014 English-Czech and LM1B. At a big model size, the Evolved Transformer establishes a new state-of-the-art BLEU score of 29.8 on WMT\u201914 English-German; at smaller sizes, it achieves the same quality as the original \"big\" Transformer with 37.6% less parameters and outperforms the Transformer by 0.7 BLEU at a mobile-friendly model size of 7M parameters.", "bibtex": "@InProceedings{pmlr-v97-so19a,\n title = \t {The Evolved Transformer},\n author = {So, David and Le, Quoc and Liang, Chen},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5877--5886},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/so19a/so19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/so19a.html},\n abstract = \t {Recent works have highlighted the strength of the Transformer architecture on sequence tasks while, at the same time, neural architecture search (NAS) has begun to outperform human-designed models. Our goal is to apply NAS to search for a better alternative to the Transformer. We first construct a large search space inspired by the recent advances in feed-forward sequence models and then run evolutionary architecture search with warm starting by seeding our initial population with the Transformer. To directly search on the computationally expensive WMT 2014 English-German translation task, we develop the Progressive Dynamic Hurdles method, which allows us to dynamically allocate more resources to more promising candidate models. The architecture found in our experiments \u2013 the Evolved Transformer \u2013 demonstrates consistent improvement over the Transformer on four well-established language tasks: WMT 2014 English-German, WMT 2014 English-French, WMT 2014 English-Czech and LM1B. 
At a big model size, the Evolved Transformer establishes a new state-of-the-art BLEU score of 29.8 on WMT\u201914 English-German; at smaller sizes, it achieves the same quality as the original \"big\" Transformer with 37.6% less parameters and outperforms the Transformer by 0.7 BLEU at a mobile-friendly model size of 7M parameters.}\n}", "pdf": "http://proceedings.mlr.press/v97/so19a/so19a.pdf", "supp": "", "pdf_size": 502186, "gs_citation": 580, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12069106626021161148&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Google Research, Brain Team, Mountain View, California, USA; Google Research, Brain Team, Mountain View, California, USA; Google Research, Brain Team, Mountain View, California, USA", "aff_domain": "google.com; ; ", "email": "google.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/so19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Research, Brain Team", "aff_unique_url": "https://research.google", "aff_unique_abbr": "Google", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "The Implicit Fairness Criterion of Unconstrained Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4223", "id": "4223", "author_site": "Lydia T. Liu, Max Simchowitz, Moritz Hardt", "author": "Lydia T. Liu; Max Simchowitz; Moritz Hardt", "abstract": "We clarify what fairness guarantees we can and cannot expect to follow from unconstrained machine learning. Specifically, we show that in many settings, unconstrained learning on its own implies group calibration, that is, the outcome variable is conditionally independent of group membership given the score. A lower bound confirms the optimality of our upper bound. Moreover, we prove that as the excess risk of the learned score decreases, the more strongly it violates separation and independence, two other standard fairness criteria. Our results challenge the view that group calibration necessitates an active intervention, suggesting that often we ought to think of it as a byproduct of unconstrained machine learning.", "bibtex": "@InProceedings{pmlr-v97-liu19f,\n title = \t {The Implicit Fairness Criterion of Unconstrained Learning},\n author = {Liu, Lydia T. and Simchowitz, Max and Hardt, Moritz},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4051--4060},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liu19f/liu19f.pdf},\n url = \t {https://proceedings.mlr.press/v97/liu19f.html},\n abstract = \t {We clarify what fairness guarantees we can and cannot expect to follow from unconstrained machine learning. Specifically, we show that in many settings, unconstrained learning on its own implies group calibration, that is, the outcome variable is conditionally independent of group membership given the score. A lower bound confirms the optimality of our upper bound. Moreover, we prove that as the excess risk of the learned score decreases, the more strongly it violates separation and independence, two other standard fairness criteria. 
Our results challenge the view that group calibration necessitates an active intervention, suggesting that often we ought to think of it as a byproduct of unconstrained machine learning.}\n}", "pdf": "http://proceedings.mlr.press/v97/liu19f/liu19f.pdf", "supp": "", "pdf_size": 5652948, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10835172487821087193&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, USA; Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, USA; Department of Electrical Engineering and Computer Sciences, University of California, Berkeley, USA", "aff_domain": "berkeley.edu; ; ", "email": "berkeley.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/liu19f.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "Department of Electrical Engineering and Computer Sciences", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "The Kernel Interaction Trick: Fast Bayesian Discovery of Pairwise Interactions in High Dimensions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3785", "id": "3785", "author_site": "Raj Agrawal, Brian Trippe, Jonathan Huggins, Tamara Broderick", "author": "Raj Agrawal; Brian Trippe; Jonathan Huggins; Tamara Broderick", "abstract": "Discovering interaction effects on a response of interest is a fundamental problem faced in biology, medicine, economics, and many other scientific disciplines. In theory, Bayesian methods for discovering pairwise interactions enjoy many benefits such as coherent uncertainty quantification, the ability to incorporate background knowledge, and desirable shrinkage properties. In practice, however, Bayesian methods are often computationally intractable for even moderate- dimensional problems. Our key insight is that many hierarchical models of practical interest admit a Gaussian process representation such that rather than maintaining a posterior over all O(p^2) interactions, we need only maintain a vector of O(p) kernel hyper-parameters. This implicit representation allows us to run Markov chain Monte Carlo (MCMC) over model hyper-parameters in time and memory linear in p per iteration. 
We focus on sparsity-inducing models and show on datasets with a variety of covariate behaviors that our method: (1) reduces runtime by orders of magnitude over naive applications of MCMC, (2) provides lower Type I and Type II error relative to state-of-the-art LASSO-based approaches, and (3) offers improved computational scaling in high dimensions relative to existing Bayesian and LASSO-based approaches.", "bibtex": "@InProceedings{pmlr-v97-agrawal19a,\n title = \t {The Kernel Interaction Trick: Fast {B}ayesian Discovery of Pairwise Interactions in High Dimensions},\n author = {Agrawal, Raj and Trippe, Brian and Huggins, Jonathan and Broderick, Tamara},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {141--150},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/agrawal19a/agrawal19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/agrawal19a.html},\n abstract = \t {Discovering interaction effects on a response of interest is a fundamental problem faced in biology, medicine, economics, and many other scientific disciplines. In theory, Bayesian methods for discovering pairwise interactions enjoy many benefits such as coherent uncertainty quantification, the ability to incorporate background knowledge, and desirable shrinkage properties. In practice, however, Bayesian methods are often computationally intractable for even moderate- dimensional problems. Our key insight is that many hierarchical models of practical interest admit a Gaussian process representation such that rather than maintaining a posterior over all O(p^2) interactions, we need only maintain a vector of O(p) kernel hyper-parameters. This implicit representation allows us to run Markov chain Monte Carlo (MCMC) over model hyper-parameters in time and memory linear in p per iteration. 
We focus on sparsity-inducing models and show on datasets with a variety of covariate behaviors that our method: (1) reduces runtime by orders of magnitude over naive applications of MCMC, (2) provides lower Type I and Type II error relative to state-of-the-art LASSO-based approaches, and (3) offers improved computational scaling in high dimensions relative to existing Bayesian and LASSO-based approaches.}\n}", "pdf": "http://proceedings.mlr.press/v97/agrawal19a/agrawal19a.pdf", "supp": "", "pdf_size": 1125773, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12867361949557728462&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, MA; Department of Biostatistics, Harvard, Cambridge, MA; Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, MA; Computer Science and Artificial Intelligence Laboratory, Massachusetts Institute of Technology, Cambridge, MA", "aff_domain": "csail.mit.edu; ; ; ", "email": "csail.mit.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/agrawal19a.html", "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Harvard University", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;Department of Biostatistics", "aff_unique_url": "https://web.mit.edu;https://www.harvard.edu", "aff_unique_abbr": "MIT;Harvard", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "The Natural Language of Actions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3873", "id": "3873", "author_site": "Guy Tennenholtz, Shie Mannor", "author": "Guy Tennenholtz; Shie Mannor", "abstract": "We introduce Act2Vec, a general framework for learning context-based action representation for Reinforcement Learning. Representing actions in a vector space help reinforcement learning algorithms achieve better performance by grouping similar actions and utilizing relations between different actions. We show how prior knowledge of an environment can be extracted from demonstrations and injected into action vector representations that encode natural compatible behavior. We then use these for augmenting state representations as well as improving function approximation of Q-values. We visualize and test action embeddings in three domains including a drawing task, a high dimensional navigation task, and the large action space domain of StarCraft II.", "bibtex": "@InProceedings{pmlr-v97-tennenholtz19a,\n title = \t {The Natural Language of Actions},\n author = {Tennenholtz, Guy and Mannor, Shie},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6196--6205},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tennenholtz19a/tennenholtz19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tennenholtz19a.html},\n abstract = \t {We introduce Act2Vec, a general framework for learning context-based action representation for Reinforcement Learning. 
Representing actions in a vector space help reinforcement learning algorithms achieve better performance by grouping similar actions and utilizing relations between different actions. We show how prior knowledge of an environment can be extracted from demonstrations and injected into action vector representations that encode natural compatible behavior. We then use these for augmenting state representations as well as improving function approximation of Q-values. We visualize and test action embeddings in three domains including a drawing task, a high dimensional navigation task, and the large action space domain of StarCraft II.}\n}", "pdf": "http://proceedings.mlr.press/v97/tennenholtz19a/tennenholtz19a.pdf", "supp": "", "pdf_size": 2297217, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4045789923174184793&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Faculty of Electrical Engineering, Technion Institute of Technology, Israel; Faculty of Electrical Engineering, Technion Institute of Technology, Israel", "aff_domain": "gmail.com;technion.ac.il", "email": "gmail.com;technion.ac.il", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/tennenholtz19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Technion Institute of Technology", "aff_unique_dep": "Faculty of Electrical Engineering", "aff_unique_url": "https://www.technion.ac.il", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "The Odds are Odd: A Statistical Test for Detecting Adversarial Examples", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4113", "id": "4113", "author_site": "Kevin Roth, Yannic Kilcher, Thomas Hofmann", "author": "Kevin Roth; Yannic Kilcher; Thomas Hofmann", "abstract": "We investigate conditions under which test statistics exist that can reliably detect examples, which have been adversarially manipulated in a white-box attack. These statistics can be easily computed and calibrated by randomly corrupting inputs. They exploit certain anomalies that adversarial attacks introduce, in particular if they follow the paradigm of choosing perturbations optimally under p-norm constraints. Access to the log-odds is the only requirement to defend models. We justify our approach empirically, but also provide conditions under which detectability via the suggested test statistics is guaranteed to be effective. In our experiments, we show that it is even possible to correct test time predictions for adversarial attacks with high accuracy.", "bibtex": "@InProceedings{pmlr-v97-roth19a,\n title = \t {The Odds are Odd: A Statistical Test for Detecting Adversarial Examples},\n author = {Roth, Kevin and Kilcher, Yannic and Hofmann, Thomas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5498--5507},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/roth19a/roth19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/roth19a.html},\n abstract = \t {We investigate conditions under which test statistics exist that can reliably detect examples, which have been adversarially manipulated in a white-box attack. 
These statistics can be easily computed and calibrated by randomly corrupting inputs. They exploit certain anomalies that adversarial attacks introduce, in particular if they follow the paradigm of choosing perturbations optimally under p-norm constraints. Access to the log-odds is the only requirement to defend models. We justify our approach empirically, but also provide conditions under which detectability via the suggested test statistics is guaranteed to be effective. In our experiments, we show that it is even possible to correct test time predictions for adversarial attacks with high accuracy.}\n}", "pdf": "http://proceedings.mlr.press/v97/roth19a/roth19a.pdf", "supp": "", "pdf_size": 6532540, "gs_citation": 228, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6673355422445965167&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, ETH Z\u00fcrich; Department of Computer Science, ETH Z\u00fcrich; Department of Computer Science, ETH Z\u00fcrich", "aff_domain": "inf.ethz.ch;inf.ethz.ch;inf.ethz.ch", "email": "inf.ethz.ch;inf.ethz.ch;inf.ethz.ch", "github": "https://github.com/yk/icml19_public", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/roth19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "title": "The Value Function Polytope in Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4096", "id": "4096", "author_site": "Robert Dadashi, Marc Bellemare, Adrien Ali Taiga, Nicolas Le Roux, Dale Schuurmans", "author": "Robert Dadashi; Adrien Ali Taiga; Nicolas Le Roux; Dale Schuurmans; Marc G. Bellemare", "abstract": "We establish geometric and topological properties of the space of value functions in finite state-action Markov decision processes. Our main contribution is the characterization of the nature of its shape: a general polytope (Aigner et al., 2010). To demonstrate this result, we exhibit several properties of the structural relationship between policies and value functions including the line theorem, which shows that the value functions of policies constrained on all but one state describe a line segment. Finally, we use this novel perspective and introduce visualizations to enhance the understanding of the dynamics of reinforcement learning algorithms.", "bibtex": "@InProceedings{pmlr-v97-dadashi19a,\n title = \t {The Value Function Polytope in Reinforcement Learning},\n author = {Dadashi, Robert and Taiga, Adrien Ali and Roux, Nicolas Le and Schuurmans, Dale and Bellemare, Marc G.},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1486--1495},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/dadashi19a/dadashi19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/dadashi19a.html},\n abstract = \t {We establish geometric and topological properties of the space of value functions in finite state-action Markov decision processes. 
Our main contribution is the characterization of the nature of its shape: a general polytope (Aigner et al., 2010). To demonstrate this result, we exhibit several properties of the structural relationship between policies and value functions including the line theorem, which shows that the value functions of policies constrained on all but one state describe a line segment. Finally, we use this novel perspective and introduce visualizations to enhance the understanding of the dynamics of reinforcement learning algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/dadashi19a/dadashi19a.pdf", "supp": "", "pdf_size": 5671212, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7244923175822753415&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "Google Brain; Mila, Universit\u00e9 de Montr\u00e9al+Google Brain; Google Brain; Department of Computing Science, University of Alberta+Google Brain; Google Brain", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/dadashi19a.html", "aff_unique_index": "0;1+0;0;2+0;0", "aff_unique_norm": "Google;Universit\u00e9 de Montr\u00e9al;University of Alberta", "aff_unique_dep": "Google Brain;Mila;Department of Computing Science", "aff_unique_url": "https://brain.google.com;https://umontreal.ca;https://www.ualberta.ca", "aff_unique_abbr": "Google Brain;UdeM;UAlberta", "aff_campus_unique_index": "0;1+0;0;0;0", "aff_campus_unique": "Mountain View;Montr\u00e9al;", "aff_country_unique_index": "0;1+0;0;1+0;0", "aff_country_unique": "United States;Canada" }, { "title": "The Variational Predictive Natural Gradient", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3950", "id": "3950", "author_site": "Da Tang, Rajesh Ranganath", "author": "Da Tang; Rajesh Ranganath", "abstract": "Variational inference transforms posterior inference into parametric optimization thereby enabling the use of latent variable models where otherwise impractical. However, variational inference can be finicky when different variational parameters control variables that are strongly correlated under the model. Traditional natural gradients based on the variational approximation fail to correct for correlations when the approximation is not the true posterior. To address this, we construct a new natural gradient called the Variational Predictive Natural Gradient (VPNG). Unlike traditional natural gradients for variational inference, this natural gradient accounts for the relationship between model parameters and variational parameters. 
We demonstrate the insight with a simple example as well as the empirical value on a classification task, a deep generative model of images, and probabilistic matrix factorization for recommendation.", "bibtex": "@InProceedings{pmlr-v97-tang19c,\n title = \t {The Variational Predictive Natural Gradient},\n author = {Tang, Da and Ranganath, Rajesh},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6145--6154},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tang19c/tang19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/tang19c.html},\n abstract = \t {Variational inference transforms posterior inference into parametric optimization thereby enabling the use of latent variable models where otherwise impractical. However, variational inference can be finicky when different variational parameters control variables that are strongly correlated under the model. Traditional natural gradients based on the variational approximation fail to correct for correlations when the approximation is not the true posterior. To address this, we construct a new natural gradient called the Variational Predictive Natural Gradient (VPNG). Unlike traditional natural gradients for variational inference, this natural gradient accounts for the relationship between model parameters and variational parameters. We demonstrate the insight with a simple example as well as the empirical value on a classification task, a deep generative model of images, and probabilistic matrix factorization for recommendation.}\n}", "pdf": "http://proceedings.mlr.press/v97/tang19c/tang19c.pdf", "supp": "", "pdf_size": 2492037, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6073859204913275725&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, Columbia University, New York, New York, USA; The Courant Institute, New York University, New York, New York, USA", "aff_domain": "cs.columbia.edu; ", "email": "cs.columbia.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/tang19c.html", "aff_unique_index": "0;1", "aff_unique_norm": "Columbia University;New York University", "aff_unique_dep": "Department of Computer Science;The Courant Institute", "aff_unique_url": "https://www.columbia.edu;https://www.nyu.edu", "aff_unique_abbr": "Columbia;NYU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New York", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "The Wasserstein Transform", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3637", "id": "3637", "author_site": "Facundo Memoli, Zane Smith, Zhengchao Wan", "author": "Facundo Memoli; Zane Smith; Zhengchao Wan", "abstract": "We introduce the Wasserstein transform, a method for enhancing and denoising datasets defined on general metric spaces. The construction draws inspiration from Optimal Transportation ideas. We establish the stability of our method under data perturbation and, when the dataset is assumed to be Euclidean, we also exhibit a precise connection between the Wasserstein transform and the mean shift family of algorithms. We then use this connection to prove that mean shift also inherits stability under perturbations. 
We study the performance of the Wasserstein transform method on different datasets as a preprocessing step prior to clustering and classification tasks.", "bibtex": "@InProceedings{pmlr-v97-memoli19a,\n title = \t {The {W}asserstein Transform},\n author = {Memoli, Facundo and Smith, Zane and Wan, Zhengchao},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4496--4504},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/memoli19a/memoli19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/memoli19a.html},\n abstract = \t {We introduce the Wasserstein transform, a method for enhancing and denoising datasets defined on general metric spaces. The construction draws inspiration from Optimal Transportation ideas. We establish the stability of our method under data perturbation and, when the dataset is assumed to be Euclidean, we also exhibit a precise connection between the Wasserstein transform and the mean shift family of algorithms. We then use this connection to prove that mean shift also inherits stability under perturbations. We study the performance of the Wasserstein transform method on different datasets as a preprocessing step prior to clustering and classification tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/memoli19a/memoli19a.pdf", "supp": "", "pdf_size": 6133815, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12589521440111530801&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Mathematics, The Ohio State University, Ohio, USA+Department of Computer Science and Engineering, University of Minnesota, Minnesota, USA; Department of Computer Science and Engineering, The Ohio State University, Ohio, USA; Department of Computer Science and Engineering, The Ohio State University, Ohio, USA", "aff_domain": "math.osu.edu;umn.edu;osu.edu", "email": "math.osu.edu;umn.edu;osu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/memoli19a.html", "aff_unique_index": "0+1;0;0", "aff_unique_norm": "Ohio State University;University of Minnesota", "aff_unique_dep": "Department of Mathematics;Department of Computer Science and Engineering", "aff_unique_url": "https://www.osu.edu;https://www.umn.edu", "aff_unique_abbr": "OSU;UMN", "aff_campus_unique_index": "0+1;0;0", "aff_campus_unique": "Ohio;Minnesota", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "The advantages of multiple classes for reducing overfitting from test set reuse", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4321", "id": "4321", "author_site": "Vitaly Feldman, Roy Frostig, Moritz Hardt", "author": "Vitaly Feldman; Roy Frostig; Moritz Hardt", "abstract": "Excessive reuse of holdout data can lead to overfitting. However, there is little concrete evidence of significant overfitting due to holdout reuse in popular multiclass benchmarks today. Known results show that, in the worst-case, revealing the accuracy of $k$ adaptively chosen classifiers on a data set of size $n$ allows to create a classifier with bias of $\\Theta(\\sqrt{k/n})$ for any binary prediction problem. 
We show a new upper bound of $\\tilde O(\\max\\{\\sqrt{k\\log(n)/(mn)}, k/n\\})$ on the worst-case bias that any attack can achieve in a prediction problem with $m$ classes. Moreover, we present an efficient attack that achieve a bias of $\\Omega(\\sqrt{k/(m^2 n)})$ and improves on previous work for the binary setting ($m=2$). We also present an inefficient attack that achieves a bias of $\\tilde\\Omega(k/n)$. Complementing our theoretical work, we give new practical attacks to stress-test multiclass benchmarks by aiming to create as large a bias as possible with a given number of queries. Our experiments show that the additional uncertainty of prediction with a large number of classes indeed mitigates the effect of our best attacks.", "bibtex": "@InProceedings{pmlr-v97-feldman19a,\n title = \t {The advantages of multiple classes for reducing overfitting from test set reuse},\n author = {Feldman, Vitaly and Frostig, Roy and Hardt, Moritz},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1892--1900},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/feldman19a/feldman19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/feldman19a.html},\n abstract = \t {Excessive reuse of holdout data can lead to overfitting. However, there is little concrete evidence of significant overfitting due to holdout reuse in popular multiclass benchmarks today. Known results show that, in the worst-case, revealing the accuracy of $k$ adaptively chosen classifiers on a data set of size $n$ allows to create a classifier with bias of $\\Theta(\\sqrt{k/n})$ for any binary prediction problem. We show a new upper bound of $\\tilde O(\\max\\{\\sqrt{k\\log(n)/(mn)}, k/n\\})$ on the worst-case bias that any attack can achieve in a prediction problem with $m$ classes. Moreover, we present an efficient attack that achieve a bias of $\\Omega(\\sqrt{k/(m^2 n)})$ and improves on previous work for the binary setting ($m=2$). We also present an inefficient attack that achieves a bias of $\\tilde\\Omega(k/n)$. Complementing our theoretical work, we give new practical attacks to stress-test multiclass benchmarks by aiming to create as large a bias as possible with a given number of queries. 
Our experiments show that the additional uncertainty of prediction with a large number of classes indeed mitigates the effect of our best attacks.}\n}", "pdf": "http://proceedings.mlr.press/v97/feldman19a/feldman19a.pdf", "supp": "", "pdf_size": 390346, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12352673084507713955&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Google Brain + Simons Institute for the Theory of Computing; Google Brain; University of California, Berkeley + Google", "aff_domain": "google.com;google.com;berkeley.edu", "email": "google.com;google.com;berkeley.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/feldman19a.html", "aff_unique_index": "0+1;0;2+0", "aff_unique_norm": "Google;Simons Institute for the Theory of Computing;University of California, Berkeley", "aff_unique_dep": "Google Brain;;", "aff_unique_url": "https://brain.google.com;https://simons.berkeley.edu;https://www.berkeley.edu", "aff_unique_abbr": "Google Brain;;UC Berkeley", "aff_campus_unique_index": "0;0;2+0", "aff_campus_unique": "Mountain View;;Berkeley", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "United States" }, { "title": "The information-theoretic value of unlabeled data in semi-supervised learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3652", "id": "3652", "author_site": "Alexander Golovnev, David Pal, Balazs Szorenyi", "author": "Alexander Golovnev; David Pal; Balazs Szorenyi", "abstract": "We quantify the separation between the numbers of labeled examples required to learn in two settings: Settings with and without the knowledge of the distribution of the unlabeled data. More specifically, we prove a separation by $\\Theta(\\log n)$ multiplicative factor for the class of projections over the Boolean hypercube of dimension $n$. We prove that there is no separation for the class of all functions on domain of any size. Learning with the knowledge of the distribution (a.k.a. fixed-distribution learning) can be viewed as an idealized scenario of semi-supervised learning where the number of unlabeled data points is so great that the unlabeled distribution is known exactly. For this reason, we call the separation the value of unlabeled data.", "bibtex": "@InProceedings{pmlr-v97-golovnev19a,\n title = \t {The information-theoretic value of unlabeled data in semi-supervised learning},\n author = {Golovnev, Alexander and Pal, David and Szorenyi, Balazs},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2328--2336},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/golovnev19a/golovnev19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/golovnev19a.html},\n abstract = \t {We quantify the separation between the numbers of labeled examples required to learn in two settings: Settings with and without the knowledge of the distribution of the unlabeled data. More specifically, we prove a separation by $\\Theta(\\log n)$ multiplicative factor for the class of projections over the Boolean hypercube of dimension $n$. We prove that there is no separation for the class of all functions on domain of any size. Learning with the knowledge of the distribution (a.k.a. 
fixed-distribution learning) can be viewed as an idealized scenario of semi-supervised learning where the number of unlabeled data points is so great that the unlabeled distribution is known exactly. For this reason, we call the separation the value of unlabeled data.}\n}", "pdf": "http://proceedings.mlr.press/v97/golovnev19a/golovnev19a.pdf", "supp": "", "pdf_size": 385361, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1681653012845127645&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "aff": "Harvard University, Cambridge, MA, USA; Yahoo Research, New York, NY, USA; Yahoo Research, New York, NY, USA", "aff_domain": "gmail.com; ; ", "email": "gmail.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/golovnev19a.html", "aff_unique_index": "0;1;1", "aff_unique_norm": "Harvard University;Yahoo Research", "aff_unique_dep": ";", "aff_unique_url": "https://www.harvard.edu;https://research.yahoo.com", "aff_unique_abbr": "Harvard;Yahoo Res.", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Cambridge;New York", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Theoretically Principled Trade-off between Robustness and Accuracy", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3942", "id": "3942", "author_site": "Hongyang Zhang, Yaodong Yu, Jiantao Jiao, Eric Xing, Laurent El Ghaoui, Michael Jordan", "author": "Hongyang Zhang; Yaodong Yu; Jiantao Jiao; Eric Xing; Laurent El Ghaoui; Michael Jordan", "abstract": "We identify a trade-off between robustness and accuracy that serves as a guiding principle in the design of defenses against adversarial examples. Although this problem has been widely studied empirically, much remains unknown concerning the theory underlying this trade-off. In this work, we decompose the prediction error for adversarial examples (robust error) as the sum of the natural (classification) error and boundary error, and provide a differentiable upper bound using the theory of classification-calibrated loss, which is shown to be the tightest possible upper bound uniform over all probability distributions and measurable predictors. Inspired by our theoretical analysis, we also design a new defense method, TRADES, to trade adversarial robustness off against accuracy. Our proposed algorithm performs well experimentally in real-world datasets. 
The methodology is the foundation of our entry to the NeurIPS 2018 Adversarial Vision Challenge in which we won the 1st place out of \u00a02,000 submissions, surpassing the runner-up approach by 11.41% in terms of mean L_2 perturbation distance.", "bibtex": "@InProceedings{pmlr-v97-zhang19p,\n title = \t {Theoretically Principled Trade-off between Robustness and Accuracy},\n author = {Zhang, Hongyang and Yu, Yaodong and Jiao, Jiantao and Xing, Eric and Ghaoui, Laurent El and Jordan, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7472--7482},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19p/zhang19p.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19p.html},\n abstract = \t {We identify a trade-off between robustness and accuracy that serves as a guiding principle in the design of defenses against adversarial examples. Although this problem has been widely studied empirically, much remains unknown concerning the theory underlying this trade-off. In this work, we decompose the prediction error for adversarial examples (robust error) as the sum of the natural (classification) error and boundary error, and provide a differentiable upper bound using the theory of classification-calibrated loss, which is shown to be the tightest possible upper bound uniform over all probability distributions and measurable predictors. Inspired by our theoretical analysis, we also design a new defense method, TRADES, to trade adversarial robustness off against accuracy. Our proposed algorithm performs well experimentally in real-world datasets. The methodology is the foundation of our entry to the NeurIPS 2018 Adversarial Vision Challenge in which we won the 1st place out of \u00a02,000 submissions, surpassing the runner-up approach by 11.41% in terms of mean L_2 perturbation distance.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19p/zhang19p.pdf", "supp": "", "pdf_size": 768600, "gs_citation": 3190, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3311622924435738798&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/zhang19p.html" }, { "title": "TibGM: A Transferable and Information-Based Graphical Model Approach for Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3581", "id": "3581", "author_site": "Tameem Adel, Adrian Weller", "author": "Tameem Adel; Adrian Weller", "abstract": "One of the challenges to reinforcement learning (RL) is scalable transferability among complex tasks. Incorporating a graphical model (GM), along with the rich family of related methods, as a basis for RL frameworks provides potential to address issues such as transferability, generalisation and exploration. Here we propose a flexible GM-based RL framework which leverages efficient inference procedures to enhance generalisation and transfer power. 
In our proposed transferable and information-based graphical model framework \u2018TibGM\u2019, we show the equivalence between our mutual information-based objective in the GM, and an RL consolidated objective consisting of a standard reward maximisation target and a generalisation/transfer objective. In settings where there is a sparse or deceptive reward signal, our TibGM framework is flexible enough to incorporate exploration bonuses depicting intrinsic rewards. We empirically verify improved performance and exploration power.", "bibtex": "@InProceedings{pmlr-v97-adel19a,\n title = \t {{T}ib{GM}: A Transferable and Information-Based Graphical Model Approach for Reinforcement Learning},\n author = {Adel, Tameem and Weller, Adrian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {71--81},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/adel19a/adel19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/adel19a.html},\n abstract = \t {One of the challenges to reinforcement learning (RL) is scalable transferability among complex tasks. Incorporating a graphical model (GM), along with the rich family of related methods, as a basis for RL frameworks provides potential to address issues such as transferability, generalisation and exploration. Here we propose a flexible GM-based RL framework which leverages efficient inference procedures to enhance generalisation and transfer power. In our proposed transferable and information-based graphical model framework \u2018TibGM\u2019, we show the equivalence between our mutual information-based objective in the GM, and an RL consolidated objective consisting of a standard reward maximisation target and a generalisation/transfer objective. In settings where there is a sparse or deceptive reward signal, our TibGM framework is flexible enough to incorporate exploration bonuses depicting intrinsic rewards. 
We empirically verify improved performance and exploration power.}\n}", "pdf": "http://proceedings.mlr.press/v97/adel19a/adel19a.pdf", "supp": "", "pdf_size": 4305775, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1091712310314521780&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Engineering, University of Cambridge, UK; The Alan Turing Institute, UK", "aff_domain": "cam.ac.uk; ", "email": "cam.ac.uk; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/adel19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "University of Cambridge;Alan Turing Institute", "aff_unique_dep": "Department of Engineering;", "aff_unique_url": "https://www.cam.ac.uk;https://www.turing.ac.uk", "aff_unique_abbr": "Cambridge;ATI", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "title": "Tight Kernel Query Complexity of Kernel Ridge Regression and Kernel $k$-means Clustering", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4309", "id": "4309", "author_site": "Taisuke Yasuda, David Woodruff, Manuel Fernandez", "author": "Taisuke Yasuda; David Woodruff; Manuel Fernandez", "abstract": "Kernel methods generalize machine learning algorithms that only depend on the pairwise inner products of the dataset by replacing inner products with kernel evaluations, a function that passes input points through a nonlinear feature map before taking the inner product in a higher dimensional space. In this work, we present nearly tight lower bounds on the number of kernel evaluations required to approximately solve kernel ridge regression (KRR) and kernel $k$-means clustering (KKMC) on $n$ input points. For KRR, our bound for relative error approximation the argmin of the objective function is $\\Omega(nd_{\\mathrm{eff}}^\\lambda/\\varepsilon)$ where $d_{\\mathrm{eff}}^\\lambda$ is the effective statistical dimension, tight up to a $\\log(d_{\\mathrm{eff}}^\\lambda/\\varepsilon)$ factor. For KKMC, our bound for finding a $k$-clustering achieving a relative error approximation of the objective function is $\\Omega(nk/\\varepsilon)$, tight up to a $\\log(k/\\varepsilon)$ factor. Our KRR result resolves a variant of an open question of El Alaoui and Mahoney, asking whether the effective statistical dimension is a lower bound on the sampling complexity or not. 
Furthermore, for the important input distribution case of mixtures of Gaussians, we provide algorithms that bypass the above lower bounds.", "bibtex": "@InProceedings{pmlr-v97-yasuda19a,\n title = \t {Tight Kernel Query Complexity of Kernel Ridge Regression and Kernel $k$-means Clustering},\n author = {Yasuda, Taisuke and Woodruff, David and Fernandez, Manuel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7055--7063},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yasuda19a/yasuda19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yasuda19a.html},\n abstract = \t {Kernel methods generalize machine learning algorithms that only depend on the pairwise inner products of the dataset by replacing inner products with kernel evaluations, a function that passes input points through a nonlinear feature map before taking the inner product in a higher dimensional space. In this work, we present nearly tight lower bounds on the number of kernel evaluations required to approximately solve kernel ridge regression (KRR) and kernel $k$-means clustering (KKMC) on $n$ input points. For KRR, our bound for relative error approximation the argmin of the objective function is $\\Omega(nd_{\\mathrm{eff}}^\\lambda/\\varepsilon)$ where $d_{\\mathrm{eff}}^\\lambda$ is the effective statistical dimension, tight up to a $\\log(d_{\\mathrm{eff}}^\\lambda/\\varepsilon)$ factor. For KKMC, our bound for finding a $k$-clustering achieving a relative error approximation of the objective function is $\\Omega(nk/\\varepsilon)$, tight up to a $\\log(k/\\varepsilon)$ factor. Our KRR result resolves a variant of an open question of El Alaoui and Mahoney, asking whether the effective statistical dimension is a lower bound on the sampling complexity or not. 
Furthermore, for the important input distribution case of mixtures of Gaussians, we provide algorithms that bypass the above lower bounds.}\n}", "pdf": "http://proceedings.mlr.press/v97/yasuda19a/yasuda19a.pdf", "supp": "", "pdf_size": 337772, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12967250782948269352&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2, "aff": "Computer Science Department, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA+Department of Mathematical Sciences, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA; Computer Science Department, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA+Department of Mathematical Sciences, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA; Department of Mathematical Sciences, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA", "aff_domain": "andrew.cmu.edu; ;andrew.cmu.edu", "email": "andrew.cmu.edu; ;andrew.cmu.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/yasuda19a.html", "aff_unique_index": "0+0;0+0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "0+0;0+0;0", "aff_campus_unique": "Pittsburgh", "aff_country_unique_index": "0+0;0+0;0", "aff_country_unique": "United States" }, { "title": "Tighter Problem-Dependent Regret Bounds in Reinforcement Learning without Domain Knowledge using Value Function Bounds", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4172", "id": "4172", "author_site": "Andrea Zanette, Emma Brunskill", "author": "Andrea Zanette; Emma Brunskill", "abstract": "Strong worst-case performance bounds for episodic reinforcement learning exist but fortunately in practice RL algorithms perform much better than such bounds would predict. Algorithms and theory that provide strong problem-dependent bounds could help illuminate the key features of what makes a RL problem hard and reduce the barrier to using RL algorithms in practice. As a step towards this we derive an algorithm and analysis for finite horizon discrete MDPs with state-of-the-art worst-case regret bounds and substantially tighter bounds if the RL environment has special features but without apriori knowledge of the environment from the algorithm. 
As a result of our analysis, we also help address an open learning theory question\u00a0\\cite{jiang2018open} about episodic MDPs with a constant upper-bound on the sum of rewards, providing a regret bound function of the number of episodes with no dependence on the horizon.", "bibtex": "@InProceedings{pmlr-v97-zanette19a,\n title = \t {Tighter Problem-Dependent Regret Bounds in Reinforcement Learning without Domain Knowledge using Value Function Bounds},\n author = {Zanette, Andrea and Brunskill, Emma},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7304--7312},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zanette19a/zanette19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zanette19a.html},\n abstract = \t {Strong worst-case performance bounds for episodic reinforcement learning exist but fortunately in practice RL algorithms perform much better than such bounds would predict. Algorithms and theory that provide strong problem-dependent bounds could help illuminate the key features of what makes a RL problem hard and reduce the barrier to using RL algorithms in practice. As a step towards this we derive an algorithm and analysis for finite horizon discrete MDPs with state-of-the-art worst-case regret bounds and substantially tighter bounds if the RL environment has special features but without apriori knowledge of the environment from the algorithm. As a result of our analysis, we also help address an open learning theory question\u00a0\\cite{jiang2018open} about episodic MDPs with a constant upper-bound on the sum of rewards, providing a regret bound function of the number of episodes with no dependence on the horizon.}\n}", "pdf": "http://proceedings.mlr.press/v97/zanette19a/zanette19a.pdf", "supp": "", "pdf_size": 505713, "gs_citation": 325, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10391433256429837168&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "aff": "Institute for Computational and Mathematical Engineering, Stanford University, USA+Department of Computer Science, Stanford University, USA; Department of Computer Science, Stanford University, USA", "aff_domain": "stanford.edu;cs.stanford.edu", "email": "stanford.edu;cs.stanford.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/zanette19a.html", "aff_unique_index": "0+0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Institute for Computational and Mathematical Engineering", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0+0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United States" }, { "title": "Topological Data Analysis of Decision Boundaries with Application to Model Selection", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3938", "id": "3938", "author_site": "Karthikeyan Ramamurthy, Kush Varshney, Krishnan Mody", "author": "Karthikeyan Natesan Ramamurthy; Kush Varshney; Krishnan Mody", "abstract": "We propose the labeled Cech complex, the plain labeled Vietoris-Rips complex, and the locally scaled labeled Vietoris-Rips complex to perform persistent homology inference of decision boundaries in classification tasks. 
We provide theoretical conditions and analysis for recovering the homology of a decision boundary from samples. Our main objective is quantification of deep neural network complexity to enable matching of datasets to pre-trained models to facilitate the functioning of AI marketplaces; we report results for experiments using MNIST, FashionMNIST, and CIFAR10.", "bibtex": "@InProceedings{pmlr-v97-ramamurthy19a,\n title = \t {Topological Data Analysis of Decision Boundaries with Application to Model Selection},\n author = {Ramamurthy, Karthikeyan Natesan and Varshney, Kush and Mody, Krishnan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5351--5360},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ramamurthy19a/ramamurthy19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ramamurthy19a.html},\n abstract = \t {We propose the labeled Cech complex, the plain labeled Vietoris-Rips complex, and the locally scaled labeled Vietoris-Rips complex to perform persistent homology inference of decision boundaries in classification tasks. We provide theoretical conditions and analysis for recovering the homology of a decision boundary from samples. Our main objective is quantification of deep neural network complexity to enable matching of datasets to pre-trained models to facilitate the functioning of AI marketplaces; we report results for experiments using MNIST, FashionMNIST, and CIFAR10.}\n}", "pdf": "http://proceedings.mlr.press/v97/ramamurthy19a/ramamurthy19a.pdf", "supp": "", "pdf_size": 439254, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16310684424372533537&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "IBM Research, Yorktown Heights, NY, USA+Courant Institute, New York University, New York City, NY, USA; IBM Research, Yorktown Heights, NY, USA; Courant Institute, New York University, New York City, NY, USA", "aff_domain": "us.ibm.com; ; ", "email": "us.ibm.com; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ramamurthy19a.html", "aff_unique_index": "0+1;0;1", "aff_unique_norm": "IBM;New York University", "aff_unique_dep": "IBM Research;Courant Institute", "aff_unique_url": "https://www.ibm.com/research;https://www.nyu.edu", "aff_unique_abbr": "IBM;NYU", "aff_campus_unique_index": "0+1;0;1", "aff_campus_unique": "Yorktown Heights;New York City", "aff_country_unique_index": "0+0;0;0", "aff_country_unique": "United States" }, { "title": "Toward Controlling Discrimination in Online Ad Auctions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4311", "id": "4311", "author_site": "L. Elisa Celis, Anay Mehrotra, Nisheeth Vishnoi", "author": "Elisa Celis; Anay Mehrotra; Nisheeth Vishnoi", "abstract": "Online advertising platforms are thriving due to the customizable audiences they offer advertisers. However, recent studies show that advertisements can be discriminatory with respect to the gender or race of the audience that sees the ad, and may inadvertently cross ethical and/or legal boundaries. 
To prevent this, we propose a constrained ad auction framework that maximizes the platform\u2019s revenue conditioned on ensuring that the audience seeing an advertiser\u2019s ad is distributed appropriately across sensitive types such as gender or race. Building upon Myerson\u2019s classic work, we first present an optimal auction mechanism for a large class of fairness constraints. Finding the parameters of this optimal auction, however, turns out to be a non-convex problem. We show that this non-convex problem can be reformulated as a more structured non-convex problem with no saddle points or local-maxima; this allows us to develop a gradient-descent-based algorithm to solve it. Our empirical results on the A1 Yahoo! dataset demonstrate that our algorithm can obtain uniform coverage across different user types for each advertiser at a minor loss to the revenue of the platform, and a small change to the size of the audience each advertiser reaches.", "bibtex": "@InProceedings{pmlr-v97-mehrotra19a,\n title = \t {Toward Controlling Discrimination in Online Ad Auctions},\n author = {Celis, Elisa and Mehrotra, Anay and Vishnoi, Nisheeth},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4456--4465},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mehrotra19a/mehrotra19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mehrotra19a.html},\n abstract = \t {Online advertising platforms are thriving due to the customizable audiences they offer advertisers. However, recent studies show that advertisements can be discriminatory with respect to the gender or race of the audience that sees the ad, and may inadvertently cross ethical and/or legal boundaries. To prevent this, we propose a constrained ad auction framework that maximizes the platform\u2019s revenue conditioned on ensuring that the audience seeing an advertiser\u2019s ad is distributed appropriately across sensitive types such as gender or race. Building upon Myerson\u2019s classic work, we first present an optimal auction mechanism for a large class of fairness constraints. Finding the parameters of this optimal auction, however, turns out to be a non-convex problem. We show that this non-convex problem can be reformulated as a more structured non-convex problem with no saddle points or local-maxima; this allows us to develop a gradient-descent-based algorithm to solve it. Our empirical results on the A1 Yahoo! 
dataset demonstrate that our algorithm can obtain uniform coverage across different user types for each advertiser at a minor loss to the revenue of the platform, and a small change to the size of the audience each advertiser reaches.}\n}", "pdf": "http://proceedings.mlr.press/v97/mehrotra19a/mehrotra19a.pdf", "supp": "", "pdf_size": 559763, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3881350113786532991&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Yale University, USA; Indian Institute of Technology Kanpur, India; Yale University, USA", "aff_domain": "yale.edu; ; ", "email": "yale.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/mehrotra19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Yale University;Indian Institute of Technology Kanpur", "aff_unique_dep": ";", "aff_unique_url": "https://www.yale.edu;https://www.iitk.ac.in", "aff_unique_abbr": "Yale;IIT Kanpur", "aff_campus_unique_index": "1", "aff_campus_unique": ";Kanpur", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;India" }, { "title": "Toward Understanding the Importance of Noise in Training Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4316", "id": "4316", "author_site": "Mo Zhou, Tianyi Liu, Yan Li, Dachao Lin, Enlu Zhou, Tuo Zhao", "author": "Mo Zhou; Tianyi Liu; Yan Li; Dachao Lin; Enlu Zhou; Tuo Zhao", "abstract": "Numerous empirical evidence has corroborated that the noise plays a crucial rule in effective and efficient training of deep neural networks. The theory behind, however, is still largely unknown. This paper studies this fundamental problem through training a simple two-layer convolutional neural network model. Although training such a network requires to solve a non-convex optimization problem with a spurious local optimum and a global optimum, we prove that a perturbed gradient descent algorithm in conjunction with noise annealing is guaranteed to converge to a global optimum in polynomial time with arbitrary initialization. This implies that the noise enables the algorithm to efficiently escape from the spurious local optimum. Numerical experiments are provided to support our theory.", "bibtex": "@InProceedings{pmlr-v97-zhou19d,\n title = \t {Toward Understanding the Importance of Noise in Training Neural Networks},\n author = {Zhou, Mo and Liu, Tianyi and Li, Yan and Lin, Dachao and Zhou, Enlu and Zhao, Tuo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7594--7602},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhou19d/zhou19d.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhou19d.html},\n abstract = \t {Numerous empirical evidence has corroborated that the noise plays a crucial rule in effective and efficient training of deep neural networks. The theory behind, however, is still largely unknown. This paper studies this fundamental problem through training a simple two-layer convolutional neural network model. 
Although training such a network requires to solve a non-convex optimization problem with a spurious local optimum and a global optimum, we prove that a perturbed gradient descent algorithm in conjunction with noise annealing is guaranteed to converge to a global optimum in polynomial time with arbitrary initialization. This implies that the noise enables the algorithm to efficiently escape from the spurious local optimum. Numerical experiments are provided to support our theory.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhou19d/zhou19d.pdf", "supp": "", "pdf_size": 2108705, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4733049575813936962&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Peking University; Georgia Institute of Technology; Georgia Institute of Technology; Peking University; Georgia Institute of Technology; Georgia Institute of Technology", "aff_domain": "pku.edu;gatech.edu;gatech.edu;pku.edu;gatech.edu;gatech.edu", "email": "pku.edu;gatech.edu;gatech.edu;pku.edu;gatech.edu;gatech.edu", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/zhou19d.html", "aff_unique_index": "0;1;1;0;1;1", "aff_unique_norm": "Peking University;Georgia Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "http://www.pku.edu.cn;https://www.gatech.edu", "aff_unique_abbr": "Peking U;Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1;1", "aff_country_unique": "China;United States" }, { "title": "Towards Accurate Model Selection in Deep Unsupervised Domain Adaptation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3708", "id": "3708", "author_site": "Kaichao You, Ximei Wang, Mingsheng Long, Michael Jordan", "author": "Kaichao You; Ximei Wang; Mingsheng Long; Michael Jordan", "abstract": "Deep unsupervised domain adaptation (Deep UDA) methods successfully leverage rich labeled data in a source domain to boost the performance on related but unlabeled data in a target domain. However, algorithm comparison is cumbersome in Deep UDA due to the absence of accurate and standardized model selection method, posing an obstacle to further advances in the field. Existing model selection methods for Deep UDA are either highly biased, restricted, unstable, or even controversial (requiring labeled target data). To this end, we propose Deep Embedded Validation (DEV), which embeds adapted feature representation into the validation procedure to obtain unbiased estimation of the target risk with bounded variance. The variance is further reduced by the technique of control variate. 
The efficacy of the method has been justified both theoretically and empirically.", "bibtex": "@InProceedings{pmlr-v97-you19a,\n title = \t {Towards Accurate Model Selection in Deep Unsupervised Domain Adaptation},\n author = {You, Kaichao and Wang, Ximei and Long, Mingsheng and Jordan, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7124--7133},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/you19a/you19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/you19a.html},\n abstract = \t {Deep unsupervised domain adaptation (Deep UDA) methods successfully leverage rich labeled data in a source domain to boost the performance on related but unlabeled data in a target domain. However, algorithm comparison is cumbersome in Deep UDA due to the absence of accurate and standardized model selection method, posing an obstacle to further advances in the field. Existing model selection methods for Deep UDA are either highly biased, restricted, unstable, or even controversial (requiring labeled target data). To this end, we propose Deep Embedded Validation (DEV), which embeds adapted feature representation into the validation procedure to obtain unbiased estimation of the target risk with bounded variance. The variance is further reduced by the technique of control variate. The efficacy of the method has been justified both theoretically and empirically.}\n}", "pdf": "http://proceedings.mlr.press/v97/you19a/you19a.pdf", "supp": "", "pdf_size": 524468, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2565642679287912484&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "School of Software+BNRist, Research Center for Big Data, Tsinghua University, Beijing, China; School of Software+BNRist, Research Center for Big Data, Tsinghua University, Beijing, China; School of Software+BNRist, Research Center for Big Data, Tsinghua University, Beijing, China; University of California, Berkeley, USA", "aff_domain": "gmail.com; ;tsinghua.edu.cn; ", "email": "gmail.com; ;tsinghua.edu.cn; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/you19a.html", "aff_unique_index": "0+1;0+1;0+1;2", "aff_unique_norm": "School of Software;Tsinghua University;University of California, Berkeley", "aff_unique_dep": "Software;Research Center for Big Data;", "aff_unique_url": ";https://www.tsinghua.edu.cn;https://www.berkeley.edu", "aff_unique_abbr": ";THU;UC Berkeley", "aff_campus_unique_index": "1;1;1;2", "aff_campus_unique": ";Beijing;Berkeley", "aff_country_unique_index": "1;1;1;2", "aff_country_unique": ";China;United States" }, { "title": "Towards Understanding Knowledge Distillation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3824", "id": "3824", "author_site": "Mary Phuong, Christoph H. Lampert", "author": "Mary Phuong; Christoph Lampert", "abstract": "Knowledge distillation, i.e., one classifier being trained on the outputs of another classifier, is an empirically very successful technique for knowledge transfer between classifiers. It has even been observed that classifiers learn much faster and more reliably if trained with the outputs of another classifier as soft labels, instead of from ground truth data. 
So far, however, there is no satisfactory theoretical explanation of this phenomenon. In this work, we provide the first insights into the working mechanisms of distillation by studying the special case of linear and deep linear classifiers. Specifically, we prove a generalization bound that establishes fast convergence of the expected risk of a distillation-trained linear classifier. From the bound and its proof we extract three key factors that determine the success of distillation: * data geometry \u2013 geometric properties of the data distribution, in particular class separation, has a direct influence on the convergence speed of the risk; * optimization bias \u2013 gradient descent optimization finds a very favorable minimum of the distillation objective; and * strong monotonicity \u2013 the expected risk of the student classifier always decreases when the size of the training set grows.", "bibtex": "@InProceedings{pmlr-v97-phuong19a,\n title = \t {Towards Understanding Knowledge Distillation},\n author = {Phuong, Mary and Lampert, Christoph},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5142--5151},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/phuong19a/phuong19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/phuong19a.html},\n abstract = \t {Knowledge distillation, i.e., one classifier being trained on the outputs of another classifier, is an empirically very successful technique for knowledge transfer between classifiers. It has even been observed that classifiers learn much faster and more reliably if trained with the outputs of another classifier as soft labels, instead of from ground truth data. So far, however, there is no satisfactory theoretical explanation of this phenomenon. In this work, we provide the first insights into the working mechanisms of distillation by studying the special case of linear and deep linear classifiers. Specifically, we prove a generalization bound that establishes fast convergence of the expected risk of a distillation-trained linear classifier. 
From the bound and its proof we extract three key factors that determine the success of distillation: * data geometry \u2013 geometric properties of the data distribution, in particular class separation, has a direct influence on the convergence speed of the risk; * optimization bias \u2013 gradient descent optimization finds a very favorable minimum of the distillation objective; and * strong monotonicity \u2013 the expected risk of the student classifier always decreases when the size of the training set grows.}\n}", "pdf": "http://proceedings.mlr.press/v97/phuong19a/phuong19a.pdf", "supp": "", "pdf_size": 686432, "gs_citation": 398, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9876481199860739592&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "IST Austria; IST Austria", "aff_domain": "ist.ac.at; ", "email": "ist.ac.at; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/phuong19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Institute of Science and Technology Austria", "aff_unique_dep": "", "aff_unique_url": "https://www.ist.ac.at", "aff_unique_abbr": "IST Austria", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Austria" }, { "title": "Towards a Deep and Unified Understanding of Deep Neural Models in NLP", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3999", "id": "3999", "author_site": "Chaoyu Guan, Xiting Wang, Quanshi Zhang, Runjin Chen, Di He, Xing Xie", "author": "Chaoyu Guan; Xiting Wang; Quanshi Zhang; Runjin Chen; Di He; Xing Xie", "abstract": "We define a unified information-based measure to provide quantitative explanations on how intermediate layers of deep Natural Language Processing (NLP) models leverage information of input words. Our method advances existing explanation methods by addressing issues in coherency and generality. Explanations generated by using our method are consistent and faithful across different timestamps, layers, and models. We show how our method can be applied to four widely used models in NLP and explain their performances on three real-world benchmark datasets.", "bibtex": "@InProceedings{pmlr-v97-guan19a,\n title = \t {Towards a Deep and Unified Understanding of Deep Neural Models in {NLP}},\n author = {Guan, Chaoyu and Wang, Xiting and Zhang, Quanshi and Chen, Runjin and He, Di and Xie, Xing},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2454--2463},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/guan19a/guan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/guan19a.html},\n abstract = \t {We define a unified information-based measure to provide quantitative explanations on how intermediate layers of deep Natural Language Processing (NLP) models leverage information of input words. Our method advances existing explanation methods by addressing issues in coherency and generality. Explanations generated by using our method are consistent and faithful across different timestamps, layers, and models. 
We show how our method can be applied to four widely used models in NLP and explain their performances on three real-world benchmark datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/guan19a/guan19a.pdf", "supp": "", "pdf_size": 3030999, "gs_citation": 139, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1223258590582324449&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "John Hopcroft Center and the MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, at the Shanghai Jiao Tong University, Shanghai, China; Microsoft Research Asia, Beijing, China; Microsoft Research Asia, Beijing, China; John Hopcroft Center and the MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute, at the Shanghai Jiao Tong University, Shanghai, China; Peking University, Beijing, China; Microsoft Research Asia, Beijing, China", "aff_domain": "sjtu.edu.cn; ; ; ; ; ", "email": "sjtu.edu.cn; ; ; ; ; ", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/guan19a.html", "aff_unique_index": "0;1;1;0;2;1", "aff_unique_norm": "Shanghai Jiao Tong University;Microsoft;Peking University", "aff_unique_dep": "John Hopcroft Center and the MoE Key Lab of Arti\ufb01cial Intelligence, AI Institute;Research;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.microsoft.com/en-us/research/group/asia;http://www.pku.edu.cn", "aff_unique_abbr": "SJTU;MSRA;Peking U", "aff_campus_unique_index": "0;1;1;0;1;1", "aff_campus_unique": "Shanghai;Beijing", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "title": "Towards a Unified Analysis of Random Fourier Features", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4235", "id": "4235", "author_site": "Zhu Li, Jean-Francois Ton, Dino Oglic, Dino Sejdinovic", "author": "Zhu Li; Jean-Francois Ton; Dino Oglic; Dino Sejdinovic", "abstract": "Random Fourier features is a widely used, simple, and effective technique for scaling up kernel methods. The existing theoretical analysis of the approach, however, remains focused on specific learning tasks and typically gives pessimistic bounds which are at odds with the empirical results. We tackle these problems and provide the first unified risk analysis of learning with random Fourier features using the squared error and Lipschitz continuous loss functions. In our bounds, the trade-off between the computational cost and the expected risk convergence rate is problem specific and expressed in terms of the regularization parameter and the", "bibtex": "@InProceedings{pmlr-v97-li19k,\n title = \t {Towards a Unified Analysis of Random {F}ourier Features},\n author = {Li, Zhu and Ton, Jean-Francois and Oglic, Dino and Sejdinovic, Dino},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3905--3914},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/li19k/li19k.pdf},\n url = \t {https://proceedings.mlr.press/v97/li19k.html},\n abstract = \t {Random Fourier features is a widely used, simple, and effective technique for scaling up kernel methods. The existing theoretical analysis of the approach, however, remains focused on specific learning tasks and typically gives pessimistic bounds which are at odds with the empirical results. 
We tackle these problems and provide the first unified risk analysis of learning with random Fourier features using the squared error and Lipschitz continuous loss functions. In our bounds, the trade-off between the computational cost and the expected risk convergence rate is problem specific and expressed in terms of the regularization parameter and the", "pdf": "http://proceedings.mlr.press/v97/li19k/li19k.pdf", "supp": "", "pdf_size": 689556, "gs_citation": 162, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13161967453282502156&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Statistics, University of Oxford; Department of Statistics, University of Oxford; Department of Informatics, King\u2019s College London; Department of Statistics, University of Oxford", "aff_domain": "stats.ox.ac.uk; ; ;stats.ox.ac.uk", "email": "stats.ox.ac.uk; ; ;stats.ox.ac.uk", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/li19k.html", "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Oxford;King\u2019s College London", "aff_unique_dep": "Department of Statistics;Department of Informatics", "aff_unique_url": "https://www.ox.ac.uk;https://www.kcl.ac.uk", "aff_unique_abbr": "Oxford;KCL", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "Oxford;London", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Trading Redundancy for Communication: Speeding up Distributed SGD for Non-convex Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4289", "id": "4289", "author_site": "Farzin Haddadpour, Mohammad Mahdi Kamani, Mehrdad Mahdavi, Viveck Cadambe", "author": "Farzin Haddadpour; Mohammad Mahdi Kamani; Mehrdad Mahdavi; Viveck Cadambe", "abstract": "Communication overhead is one of the key challenges that hinders the scalability of distributed optimization algorithms to train large neural networks. In recent years, there has been a great deal of research to alleviate communication cost by compressing the gradient vector or using local updates and periodic model averaging. In this paper, we advocate the use of redundancy towards communication-efficient distributed stochastic algorithms for non-convex optimization. In particular, we, both theoretically and practically, show that by properly infusing redundancy to the training data with model averaging, it is possible to significantly reduce the number of communication rounds. To be more precise, we show that redundancy reduces residual error in local averaging, thereby reaching the same level of accuracy with fewer rounds of communication as compared with previous algorithms. 
Empirical studies on CIFAR10, CIFAR100 and ImageNet datasets in a distributed environment complement our theoretical results; they show that our algorithms have additional beneficial aspects including tolerance to failures, as well as greater gradient diversity.", "bibtex": "@InProceedings{pmlr-v97-haddadpour19a,\n title = \t {Trading Redundancy for Communication: Speeding up Distributed {SGD} for Non-convex Optimization},\n author = {Haddadpour, Farzin and Kamani, Mohammad Mahdi and Mahdavi, Mehrdad and Cadambe, Viveck},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2545--2554},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/haddadpour19a/haddadpour19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/haddadpour19a.html},\n abstract = \t {Communication overhead is one of the key challenges that hinders the scalability of distributed optimization algorithms to train large neural networks. In recent years, there has been a great deal of research to alleviate communication cost by compressing the gradient vector or using local updates and periodic model averaging. In this paper, we advocate the use of redundancy towards communication-efficient distributed stochastic algorithms for non-convex optimization. In particular, we, both theoretically and practically, show that by properly infusing redundancy to the training data with model averaging, it is possible to significantly reduce the number of communication rounds. To be more precise, we show that redundancy reduces residual error in local averaging, thereby reaching the same level of accuracy with fewer rounds of communication as compared with previous algorithms. Empirical studies on CIFAR10, CIFAR100 and ImageNet datasets in a distributed environment complement our theoretical results; they show that our algorithms have additional beneficial aspects including tolerance to failures, as well as greater gradient diversity.}\n}", "pdf": "http://proceedings.mlr.press/v97/haddadpour19a/haddadpour19a.pdf", "supp": "", "pdf_size": 0, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17830914639239189469&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": ";;;", "aff_domain": ";;;", "email": ";;;", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/haddadpour19a.html" }, { "title": "Traditional and Heavy Tailed Self Regularization in Neural Network Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3986", "id": "3986", "author_site": "Michael Mahoney, Charles H Martin", "author": "Michael Mahoney; Charles Martin", "abstract": "Random Matrix Theory (RMT) is applied to analyze the weight matrices of Deep Neural Networks (DNNs), including both production quality, pre-trained models such as AlexNet and Inception, and smaller models trained from scratch, such as LeNet5 and a miniature-AlexNet. Empirical and theoretical results clearly indicate that the empirical spectral density (ESD) of DNN layer matrices displays signatures of traditionally-regularized statistical models, even in the absence of exogenously specifying traditional forms of regularization, such as Dropout or Weight Norm constraints. 
Building on recent results in RMT, most notably its extension to Universality classes of Heavy-Tailed matrices, we develop a theory to identify", "bibtex": "@InProceedings{pmlr-v97-mahoney19a,\n title = \t {Traditional and Heavy Tailed Self Regularization in Neural Network Models},\n author = {Mahoney, Michael and Martin, Charles},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4284--4293},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mahoney19a/mahoney19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mahoney19a.html},\n abstract = \t {Random Matrix Theory (RMT) is applied to analyze the weight matrices of Deep Neural Networks (DNNs), including both production quality, pre-trained models such as AlexNet and Inception, and smaller models trained from scratch, such as LeNet5 and a miniature-AlexNet. Empirical and theoretical results clearly indicate that the empirical spectral density (ESD) of DNN layer matrices displays signatures of traditionally-regularized statistical models, even in the absence of exogenously specifying traditional forms of regularization, such as Dropout or Weight Norm constraints. Building on recent results in RMT, most notably its extension to Universality classes of Heavy-Tailed matrices, we develop a theory to identify", "pdf": "http://proceedings.mlr.press/v97/mahoney19a/mahoney19a.pdf", "supp": "", "pdf_size": 643051, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2495666934042672938&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Calculation Consulting, 8 Locksley Ave, 6B, San Francisco, CA 94122+ICSI and Department of Statistics, University of California at Berkeley, Berkeley, CA 94720; ICSI and Department of Statistics, University of California at Berkeley, Berkeley, CA 94720", "aff_domain": "CalculationConsulting.com;stat.berkeley.edu", "email": "CalculationConsulting.com;stat.berkeley.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/mahoney19a.html", "aff_unique_index": "0+1;1", "aff_unique_norm": "Calculation Consulting;University of California, Berkeley", "aff_unique_dep": ";Department of Statistics", "aff_unique_url": ";https://www.berkeley.edu", "aff_unique_abbr": ";UC Berkeley", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0+0;0", "aff_country_unique": "United States" }, { "title": "Trainable Decoding of Sets of Sequences for Neural Sequence Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4105", "id": "4105", "author_site": "Ashwin Kalyan, Peter Anderson, Stefan Lee, Dhruv Batra", "author": "Ashwin Kalyan; Peter Anderson; Stefan Lee; Dhruv Batra", "abstract": "Many sequence prediction tasks admit multiple correct outputs and so, it is often useful to decode a set of outputs that maximize some task-specific set-level metric. However, retooling standard sequence prediction procedures tailored towards predicting the single best output leads to the decoding of sets containing very similar sequences; failing to capture the variation in the output space. To address this, we propose $\\nabla$BS, a trainable decoding procedure that outputs a set of sequences, highly valued according to the metric. 
Our method tightly integrates the training and decoding phases and further allows for the optimization of the task-specific metric addressing the shortcomings of standard sequence prediction. Further, we discuss the trade-offs of commonly used set-level metrics and motivate a new set-level metric that naturally evaluates the notion of \u201ccapturing the variation in the output space\u201d. Finally, we show results on the image captioning task and find that our model outperforms standard techniques and natural ablations.", "bibtex": "@InProceedings{pmlr-v97-kalyan19a,\n title = \t {Trainable Decoding of Sets of Sequences for Neural Sequence Models},\n author = {Kalyan, Ashwin and Anderson, Peter and Lee, Stefan and Batra, Dhruv},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3211--3221},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kalyan19a/kalyan19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/kalyan19a.html},\n abstract = \t {Many sequence prediction tasks admit multiple correct outputs and so, it is often useful to decode a set of outputs that maximize some task-specific set-level metric. However, retooling standard sequence prediction procedures tailored towards predicting the single best output leads to the decoding of sets containing very similar sequences; failing to capture the variation in the output space. To address this, we propose $\\nabla$BS, a trainable decoding procedure that outputs a set of sequences, highly valued according to the metric. Our method tightly integrates the training and decoding phases and further allows for the optimization of the task-specific metric addressing the shortcomings of standard sequence prediction. Further, we discuss the trade-offs of commonly used set-level metrics and motivate a new set-level metric that naturally evaluates the notion of \u201ccapturing the variation in the output space\u201d. 
Finally, we show results on the image captioning task and find that our model outperforms standard techniques and natural ablations.}\n}", "pdf": "http://proceedings.mlr.press/v97/kalyan19a/kalyan19a.pdf", "supp": "", "pdf_size": 514749, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12874705220951436817&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "School of Interactive Computing, Georgia Tech, Atlanta, GA, USA+Facebook AI Research, Menlo Park, CA, USA; School of Interactive Computing, Georgia Tech, Atlanta, GA, USA; School of Interactive Computing, Georgia Tech, Atlanta, GA, USA; School of Interactive Computing, Georgia Tech, Atlanta, GA, USA+Facebook AI Research, Menlo Park, CA, USA", "aff_domain": "gatech.edu; ; ; ", "email": "gatech.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/kalyan19a.html", "aff_unique_index": "0+1;0;0;0+1", "aff_unique_norm": "Georgia Tech;Meta", "aff_unique_dep": "School of Interactive Computing;Facebook AI Research", "aff_unique_url": "https://www.gatech.edu;https://research.facebook.com", "aff_unique_abbr": "GT;FAIR", "aff_campus_unique_index": "0+1;0;0;0+1", "aff_campus_unique": "Atlanta;Menlo Park", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "United States" }, { "title": "Training CNNs with Selective Allocation of Channels", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3973", "id": "3973", "author_site": "Jongheon Jeong, Jinwoo Shin", "author": "Jongheon Jeong; Jinwoo Shin", "abstract": "Recent progress in deep convolutional neural networks (CNNs) have enabled a simple paradigm of architecture design: larger models typically achieve better accuracy. Due to this, in modern CNN architectures, it becomes more important to design models that generalize well under certain resource constraints, e.g. the number of parameters. In this paper, we propose a simple way to improve the capacity of any CNN model having large-scale features, without adding more parameters. In particular, we modify a standard convolutional layer to have a new functionality of channel-selectivity, so that the layer is trained to select important channels to re-distribute their parameters. Our experimental results under various CNN architectures and datasets demonstrate that the proposed new convolutional layer allows new optima that generalize better via efficient resource utilization, compared to the baseline.", "bibtex": "@InProceedings{pmlr-v97-jeong19c,\n title = \t {Training {CNN}s with Selective Allocation of Channels},\n author = {Jeong, Jongheon and Shin, Jinwoo},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3080--3090},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/jeong19c/jeong19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/jeong19c.html},\n abstract = \t {Recent progress in deep convolutional neural networks (CNNs) have enabled a simple paradigm of architecture design: larger models typically achieve better accuracy. Due to this, in modern CNN architectures, it becomes more important to design models that generalize well under certain resource constraints, e.g. the number of parameters. 
In this paper, we propose a simple way to improve the capacity of any CNN model having large-scale features, without adding more parameters. In particular, we modify a standard convolutional layer to have a new functionality of channel-selectivity, so that the layer is trained to select important channels to re-distribute their parameters. Our experimental results under various CNN architectures and datasets demonstrate that the proposed new convolutional layer allows new optima that generalize better via efficient resource utilization, compared to the baseline.}\n}", "pdf": "http://proceedings.mlr.press/v97/jeong19c/jeong19c.pdf", "supp": "", "pdf_size": 1702393, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4084149718123624723&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "School of Electrical Engineering, KAIST, Daejeon, South Korea+Graduate School of AI, KAIST, Daejeon, South Korea; School of Electrical Engineering, KAIST, Daejeon, South Korea+Graduate School of AI, KAIST, Daejeon, South Korea+AITRICS, Seoul, South Korea", "aff_domain": "kaist.ac.kr;kaist.ac.kr", "email": "kaist.ac.kr;kaist.ac.kr", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/jeong19c.html", "aff_unique_index": "0+0;0+0+1", "aff_unique_norm": "KAIST;AITRICS", "aff_unique_dep": "School of Electrical Engineering;", "aff_unique_url": "https://www.kaist.ac.kr;", "aff_unique_abbr": "KAIST;", "aff_campus_unique_index": "0+0;0+0+1", "aff_campus_unique": "Daejeon;Seoul", "aff_country_unique_index": "0+0;0+0+0", "aff_country_unique": "South Korea" }, { "title": "Training Neural Networks with Local Error Signals", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3575", "id": "3575", "author_site": "Arild N\u00f8kland, Lars Hiller Eidnes", "author": "Arild N\u00f8kland; Lars Hiller Eidnes", "abstract": "Supervised training of neural networks for classification is typically performed with a global loss function. The loss function provides a gradient for the output layer, and this gradient is back-propagated to hidden layers to dictate an update direction for the weights. An alternative approach is to train the network with layer-wise loss functions. In this paper we demonstrate, for the first time, that layer-wise training can approach the state-of-the-art on a variety of image datasets. We use single-layer sub-networks and two different supervised loss functions to generate local error signals for the hidden layers, and we show that the combination of these losses help with optimization in the context of local learning. Using local errors could be a step towards more biologically plausible deep learning because the global error does not have to be transported back to hidden layers. 
A completely backprop free variant outperforms previously reported results among methods aiming for higher biological plausibility.", "bibtex": "@InProceedings{pmlr-v97-nokland19a,\n title = \t {Training Neural Networks with Local Error Signals},\n author = {N{\\o}kland, Arild and Eidnes, Lars Hiller},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4839--4850},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nokland19a/nokland19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nokland19a.html},\n abstract = \t {Supervised training of neural networks for classification is typically performed with a global loss function. The loss function provides a gradient for the output layer, and this gradient is back-propagated to hidden layers to dictate an update direction for the weights. An alternative approach is to train the network with layer-wise loss functions. In this paper we demonstrate, for the first time, that layer-wise training can approach the state-of-the-art on a variety of image datasets. We use single-layer sub-networks and two different supervised loss functions to generate local error signals for the hidden layers, and we show that the combination of these losses help with optimization in the context of local learning. Using local errors could be a step towards more biologically plausible deep learning because the global error does not have to be transported back to hidden layers. A completely backprop free variant outperforms previously reported results among methods aiming for higher biological plausibility.}\n}", "pdf": "http://proceedings.mlr.press/v97/nokland19a/nokland19a.pdf", "supp": "", "pdf_size": 397571, "gs_citation": 279, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11332176056919584070&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Kongsberg Seatex, Trondheim, Norway; Trondheim, Norway", "aff_domain": "gmail.com;gmail.com", "email": "gmail.com;gmail.com", "github": "https://github.com/anokland/local-loss", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/nokland19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Kongsberg Seatex;Norwegian University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": ";https://www.ntnu.no", "aff_unique_abbr": ";NTNU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Trondheim", "aff_country_unique_index": "0;0", "aff_country_unique": "Norway" }, { "title": "Training Well-Generalizing Classifiers for Fairness Metrics and Other Data-Dependent Constraints", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3659", "id": "3659", "author_site": "Andrew Cotter, Maya Gupta, Heinrich Jiang, Nati Srebro, Karthik Sridharan, Serena Wang, Blake Woodworth, Seungil You", "author": "Andrew Cotter; Maya Gupta; Heinrich Jiang; Nathan Srebro; Karthik Sridharan; Serena Wang; Blake Woodworth; Seungil You", "abstract": "Classifiers can be trained with data-dependent constraints to satisfy fairness goals, reduce churn, achieve a targeted false positive rate, or other policy goals. We study the generalization performance for such constrained optimization problems, in terms of how well the constraints are satisfied at evaluation time, given that they are satisfied at training time. 
To improve generalization, we frame the problem as a two-player game where one player optimizes the model parameters on a training dataset, and the other player enforces the constraints on an independent validation dataset. We build on recent work in two-player constrained optimization to show that if one uses this two-dataset approach, then constraint generalization can be significantly improved. As we illustrate experimentally, this approach works not only in theory, but also in practice.", "bibtex": "@InProceedings{pmlr-v97-cotter19b,\n title = \t {Training Well-Generalizing Classifiers for Fairness Metrics and Other Data-Dependent Constraints},\n author = {Cotter, Andrew and Gupta, Maya and Jiang, Heinrich and Srebro, Nathan and Sridharan, Karthik and Wang, Serena and Woodworth, Blake and You, Seungil},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1397--1405},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/cotter19b/cotter19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/cotter19b.html},\n abstract = \t {Classifiers can be trained with data-dependent constraints to satisfy fairness goals, reduce churn, achieve a targeted false positive rate, or other policy goals. We study the generalization performance for such constrained optimization problems, in terms of how well the constraints are satisfied at evaluation time, given that they are satisfied at training time. To improve generalization, we frame the problem as a two-player game where one player optimizes the model parameters on a training dataset, and the other player enforces the constraints on an independent validation dataset. We build on recent work in two-player constrained optimization to show that if one uses this two-dataset approach, then constraint generalization can be significantly improved. 
As we illustrate experimentally, this approach works not only in theory, but also in practice.}\n}", "pdf": "http://proceedings.mlr.press/v97/cotter19b/cotter19b.pdf", "supp": "", "pdf_size": 399850, "gs_citation": 125, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17069679222017430874&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Google AI, Mountain View, CA, USA; Google AI, Mountain View, CA, USA; Google AI, Mountain View, CA, USA; Toyota Technological Institute at Chicago, Chicago, IL, USA; Cornell University, Computer Science Department, Ithaca, NY, USA; Google AI, Mountain View, CA, USA; Toyota Technological Institute at Chicago, Chicago, IL, USA; Kakao Mobility, Seongnam-si, Geyonggi-do, South Korea", "aff_domain": "google.com; ; ; ; ; ; ; ", "email": "google.com; ; ; ; ; ; ; ", "github": "", "project": "", "author_num": 8, "oa": "https://proceedings.mlr.press/v97/cotter19b.html", "aff_unique_index": "0;0;0;1;2;0;1;3", "aff_unique_norm": "Google;Toyota Technological Institute at Chicago;Cornell University;Kakao Mobility", "aff_unique_dep": "Google AI;;Computer Science Department;", "aff_unique_url": "https://ai.google;https://www.tti-chicago.org;https://www.cornell.edu;", "aff_unique_abbr": "Google AI;TTI Chicago;Cornell;", "aff_campus_unique_index": "0;0;0;1;2;0;1", "aff_campus_unique": "Mountain View;Chicago;Ithaca;", "aff_country_unique_index": "0;0;0;0;0;0;0;1", "aff_country_unique": "United States;South Korea" }, { "title": "Trajectory-Based Off-Policy Deep Reinforcement Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4203", "id": "4203", "author_site": "Andreas Doerr, Michael Volpp, Marc Toussaint, Sebastian Trimpe, Christian Daniel", "author": "Andreas Doerr; Michael Volpp; Marc Toussaint; Trimpe Sebastian; Christian Daniel", "abstract": "Policy gradient methods are powerful reinforcement learning algorithms and have been demonstrated to solve many complex tasks. However, these methods are also data-inefficient, afflicted with high variance gradient estimates, and frequently get stuck in local optima. This work addresses these weaknesses by combining recent improvements in the reuse of off-policy data and exploration in parameter space with deterministic behavioral policies. The resulting objective is amenable to standard neural network optimization strategies like stochastic gradient descent or stochastic gradient Hamiltonian Monte Carlo. Incorporation of previous rollouts via importance sampling greatly improves data-efficiency, whilst stochastic optimization schemes facilitate the escape from local optima. We evaluate the proposed approach on a series of continuous control benchmark tasks. 
The results show that the proposed algorithm is able to successfully and reliably learn solutions using fewer system interactions than standard policy gradient methods.", "bibtex": "@InProceedings{pmlr-v97-doerr19a,\n title = \t {Trajectory-Based Off-Policy Deep Reinforcement Learning},\n author = {Doerr, Andreas and Volpp, Michael and Toussaint, Marc and Sebastian, Trimpe and Daniel, Christian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1636--1645},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/doerr19a/doerr19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/doerr19a.html},\n abstract = \t {Policy gradient methods are powerful reinforcement learning algorithms and have been demonstrated to solve many complex tasks. However, these methods are also data-inefficient, afflicted with high variance gradient estimates, and frequently get stuck in local optima. This work addresses these weaknesses by combining recent improvements in the reuse of off-policy data and exploration in parameter space with deterministic behavioral policies. The resulting objective is amenable to standard neural network optimization strategies like stochastic gradient descent or stochastic gradient Hamiltonian Monte Carlo. Incorporation of previous rollouts via importance sampling greatly improves data-efficiency, whilst stochastic optimization schemes facilitate the escape from local optima. We evaluate the proposed approach on a series of continuous control benchmark tasks. The results show that the proposed algorithm is able to successfully and reliably learn solutions using fewer system interactions than standard policy gradient methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/doerr19a/doerr19a.pdf", "supp": "", "pdf_size": 594695, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3089333550231775288&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Bosch Center for Artificial Intelligence, Renningen, Germany + Max Planck Institute for Intelligent Systems, Stuttgart/T\u00fcbingen, Germany + Machine Learning and Robotics Lab, University of Stuttgart, Germany; Bosch Center for Artificial Intelligence, Renningen, Germany; Machine Learning and Robotics Lab, University of Stuttgart, Germany; Max Planck Institute for Intelligent Systems, Stuttgart/T\u00fcbingen, Germany; Bosch Center for Artificial Intelligence, Renningen, Germany", "aff_domain": "gmx.net; ; ; ; ", "email": "gmx.net; ; ; ; ", "github": "https://github.com/boschresearch/DD_OPG", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/doerr19a.html", "aff_unique_index": "0+1+2;0;2;1;0", "aff_unique_norm": "Bosch Center for Artificial Intelligence;Max Planck Institute for Intelligent Systems;University of Stuttgart", "aff_unique_dep": "Artificial Intelligence;;Machine Learning and Robotics Lab", "aff_unique_url": "https://www.bosch-ai.com;https://www.mpi-is.mpg.de;https://www.ira.uka.de", "aff_unique_abbr": "BCAI;MPI-IS;", "aff_campus_unique_index": "0+1;0;1;0", "aff_campus_unique": "Renningen;Stuttgart/T\u00fcbingen;", "aff_country_unique_index": "0+0+0;0;0;0;0", "aff_country_unique": "Germany" }, { "title": "Transfer Learning for Related Reinforcement Learning Tasks via Image-to-Image Translation", "status": "Oral", 
"track": "main", "site": "https://icml.cc/virtual/2019/poster/3859", "id": "3859", "author_site": "Shani Gamrian, Yoav Goldberg", "author": "Shani Gamrian; Yoav Goldberg", "abstract": "Despite the remarkable success of Deep RL in learning control policies from raw pixels, the resulting models do not generalize. We demonstrate that a trained agent fails completely when facing small visual changes, and that fine-tuning\u2014the common transfer learning paradigm\u2014fails to adapt to these changes, to the extent that it is faster to re-train the model from scratch. We show that by separating the visual transfer task from the control policy we achieve substantially better sample efficiency and transfer behavior, allowing an agent trained on the source task to transfer well to the target tasks. The visual mapping from the target to the source domain is performed using unaligned GANs, resulting in a control policy that can be further improved using imitation learning from imperfect demonstrations. We demonstrate the approach on synthetic visual variants of the Breakout game, as well as on transfer between subsequent levels of Road Fighter, a Nintendo car-driving game. A visualization of our approach can be seen in \\url{https://youtu.be/4mnkzYyXMn4} and \\url{https://youtu.be/KCGTrQi6Ogo}.", "bibtex": "@InProceedings{pmlr-v97-gamrian19a,\n title = \t {Transfer Learning for Related Reinforcement Learning Tasks via Image-to-Image Translation},\n author = {Gamrian, Shani and Goldberg, Yoav},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2063--2072},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/gamrian19a/gamrian19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/gamrian19a.html},\n abstract = \t {Despite the remarkable success of Deep RL in learning control policies from raw pixels, the resulting models do not generalize. We demonstrate that a trained agent fails completely when facing small visual changes, and that fine-tuning\u2014the common transfer learning paradigm\u2014fails to adapt to these changes, to the extent that it is faster to re-train the model from scratch. We show that by separating the visual transfer task from the control policy we achieve substantially better sample efficiency and transfer behavior, allowing an agent trained on the source task to transfer well to the target tasks. The visual mapping from the target to the source domain is performed using unaligned GANs, resulting in a control policy that can be further improved using imitation learning from imperfect demonstrations. We demonstrate the approach on synthetic visual variants of the Breakout game, as well as on transfer between subsequent levels of Road Fighter, a Nintendo car-driving game. 
A visualization of our approach can be seen in \\url{https://youtu.be/4mnkzYyXMn4} and \\url{https://youtu.be/KCGTrQi6Ogo}.}\n}", "pdf": "http://proceedings.mlr.press/v97/gamrian19a/gamrian19a.pdf", "supp": "", "pdf_size": 1207074, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9611056051873190205&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Computer Science Department, Bar-Ilan University, Ramat-Gan, Israel+Allen Institute for Arti\ufb01cial Intelligence; Computer Science Department, Bar-Ilan University, Ramat-Gan, Israel+Allen Institute for Arti\ufb01cial Intelligence", "aff_domain": "gmail.com;gmail.com", "email": "gmail.com;gmail.com", "github": "", "project": "https://youtu.be/4mnkzYyXMn4; https://youtu.be/KCGTrQi6Ogo", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/gamrian19a.html", "aff_unique_index": "0+1;0+1", "aff_unique_norm": "Bar-Ilan University;Allen Institute for Artificial Intelligence", "aff_unique_dep": "Computer Science Department;Artificial Intelligence", "aff_unique_url": "https://www.biu.ac.il;https://allenai.org", "aff_unique_abbr": "BIU;AI2", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Ramat-Gan;", "aff_country_unique_index": "0+1;0+1", "aff_country_unique": "Israel;United States" }, { "title": "Transfer of Samples in Policy Search via Multiple Importance Sampling", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4094", "id": "4094", "author_site": "Andrea Tirinzoni, Mattia Salvini, Marcello Restelli", "author": "Andrea Tirinzoni; Mattia Salvini; Marcello Restelli", "abstract": "We consider the transfer of experience samples in reinforcement learning. Most of the previous works in this context focused on value-based settings, where transferring instances conveniently reduces to the transfer of (s,a,s\u2019,r) tuples. In this paper, we consider the more complex case of reusing samples in policy search methods, in which the agent is required to transfer entire trajectories between environments with different transition models. By leveraging ideas from multiple importance sampling, we propose robust gradient estimators that effectively achieve this goal, along with several techniques to reduce their variance. In the case where the transition models are known, we theoretically establish the robustness to the negative transfer for our estimators. In the case of unknown models, we propose a method to efficiently estimate them when the target task belongs to a finite set of possible tasks and when it belongs to some reproducing kernel Hilbert space. We provide empirical results to show the effectiveness of our estimators.", "bibtex": "@InProceedings{pmlr-v97-tirinzoni19a,\n title = \t {Transfer of Samples in Policy Search via Multiple Importance Sampling},\n author = {Tirinzoni, Andrea and Salvini, Mattia and Restelli, Marcello},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6264--6274},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tirinzoni19a/tirinzoni19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tirinzoni19a.html},\n abstract = \t {We consider the transfer of experience samples in reinforcement learning. 
Most of the previous works in this context focused on value-based settings, where transferring instances conveniently reduces to the transfer of (s,a,s\u2019,r) tuples. In this paper, we consider the more complex case of reusing samples in policy search methods, in which the agent is required to transfer entire trajectories between environments with different transition models. By leveraging ideas from multiple importance sampling, we propose robust gradient estimators that effectively achieve this goal, along with several techniques to reduce their variance. In the case where the transition models are known, we theoretically establish the robustness to the negative transfer for our estimators. In the case of unknown models, we propose a method to efficiently estimate them when the target task belongs to a finite set of possible tasks and when it belongs to some reproducing kernel Hilbert space. We provide empirical results to show the effectiveness of our estimators.}\n}", "pdf": "http://proceedings.mlr.press/v97/tirinzoni19a/tirinzoni19a.pdf", "supp": "", "pdf_size": 463225, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10411537418909254515&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Politecnico di Milano; Politecnico di Milano; Politecnico di Milano", "aff_domain": "polimi.it; ; ", "email": "polimi.it; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/tirinzoni19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Politecnico di Milano", "aff_unique_dep": "", "aff_unique_url": "https://www.polimi.it", "aff_unique_abbr": "Polimi", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Italy" }, { "title": "Transferability vs. Discriminability: Batch Spectral Penalization for Adversarial Domain Adaptation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3734", "id": "3734", "author_site": "Xinyang Chen, Sinan Wang, Mingsheng Long, Jianmin Wang", "author": "Xinyang Chen; Sinan Wang; Mingsheng Long; Jianmin Wang", "abstract": "Adversarial domain adaptation has made remarkable advances in learning transferable representations for knowledge transfer across domains. While adversarial learning strengthens the feature transferability which the community focuses on, its impact on the feature discriminability has not been fully explored. In this paper, a series of experiments based on spectral analysis of the feature representations have been conducted, revealing an unexpected deterioration of the discriminability while learning transferable features adversarially. Our key finding is that the eigenvectors with the largest singular values will dominate the feature transferability. As a consequence, the transferability is enhanced at the expense of over penalization of other eigenvectors that embody rich structures crucial for discriminability. Towards this problem, we present Batch Spectral Penalization (BSP), a general approach to penalizing the largest singular values so that other eigenvectors can be relatively strengthened to boost the feature discriminability. Experiments show that the approach significantly improves upon representative adversarial domain adaptation methods to yield state of the art results.", "bibtex": "@InProceedings{pmlr-v97-chen19i,\n title = \t {Transferability vs. 
Discriminability: Batch Spectral Penalization for Adversarial Domain Adaptation},\n author = {Chen, Xinyang and Wang, Sinan and Long, Mingsheng and Wang, Jianmin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1081--1090},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19i/chen19i.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19i.html},\n abstract = \t {Adversarial domain adaptation has made remarkable advances in learning transferable representations for knowledge transfer across domains. While adversarial learning strengthens the feature transferability which the community focuses on, its impact on the feature discriminability has not been fully explored. In this paper, a series of experiments based on spectral analysis of the feature representations have been conducted, revealing an unexpected deterioration of the discriminability while learning transferable features adversarially. Our key finding is that the eigenvectors with the largest singular values will dominate the feature transferability. As a consequence, the transferability is enhanced at the expense of over penalization of other eigenvectors that embody rich structures crucial for discriminability. Towards this problem, we present Batch Spectral Penalization (BSP), a general approach to penalizing the largest singular values so that other eigenvectors can be relatively strengthened to boost the feature discriminability. Experiments show that the approach significantly improves upon representative adversarial domain adaptation methods to yield state of the art results.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19i/chen19i.pdf", "supp": "", "pdf_size": 4156457, "gs_citation": 629, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8590630247063758749&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff": "School of Software, BNRist, Research Center for Big Data, Tsinghua University; School of Software, BNRist, Research Center for Big Data, Tsinghua University; School of Software, BNRist, Research Center for Big Data, Tsinghua University; School of Software, BNRist, Research Center for Big Data, Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn; ;tsinghua.edu.cn; ", "email": "mails.tsinghua.edu.cn; ;tsinghua.edu.cn; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/chen19i.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "School of Software", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "title": "Transferable Adversarial Training: A General Approach to Adapting Deep Classifiers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3644", "id": "3644", "author_site": "Hong Liu, Mingsheng Long, Jianmin Wang, Michael Jordan", "author": "Hong Liu; Mingsheng Long; Jianmin Wang; Michael Jordan", "abstract": "Domain adaptation enables knowledge transfer from a labeled source domain to an unlabeled target domain. 
A mainstream approach is adversarial feature adaptation, which learns domain-invariant representations through aligning the feature distributions of both domains. However, a theoretical prerequisite of domain adaptation is the adaptability measured by the expected risk of an ideal joint hypothesis over the source and target domains. In this respect, adversarial feature adaptation may potentially deteriorate the adaptability, since it distorts the original feature distributions when suppressing domain-specific variations. To this end, we propose Transferable Adversarial Training (TAT) to enable the adaptation of deep classifiers. The approach generates transferable examples to fill in the gap between the source and target domains, and adversarially trains the deep classifiers to make consistent predictions over the transferable examples. Without learning domain-invariant representations at the expense of distorting the feature distributions, the adaptability in the theoretical learning bound is algorithmically guaranteed. A series of experiments validate that our approach advances the state of the arts on a variety of domain adaptation tasks in vision and NLP, including object recognition, learning from synthetic to real data, and sentiment classification.", "bibtex": "@InProceedings{pmlr-v97-liu19b,\n title = \t {Transferable Adversarial Training: A General Approach to Adapting Deep Classifiers},\n author = {Liu, Hong and Long, Mingsheng and Wang, Jianmin and Jordan, Michael},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4013--4022},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liu19b/liu19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/liu19b.html},\n abstract = \t {Domain adaptation enables knowledge transfer from a labeled source domain to an unlabeled target domain. A mainstream approach is adversarial feature adaptation, which learns domain-invariant representations through aligning the feature distributions of both domains. However, a theoretical prerequisite of domain adaptation is the adaptability measured by the expected risk of an ideal joint hypothesis over the source and target domains. In this respect, adversarial feature adaptation may potentially deteriorate the adaptability, since it distorts the original feature distributions when suppressing domain-specific variations. To this end, we propose Transferable Adversarial Training (TAT) to enable the adaptation of deep classifiers. The approach generates transferable examples to fill in the gap between the source and target domains, and adversarially trains the deep classifiers to make consistent predictions over the transferable examples. Without learning domain-invariant representations at the expense of distorting the feature distributions, the adaptability in the theoretical learning bound is algorithmically guaranteed. 
A series of experiments validate that our approach advances the state of the arts on a variety of domain adaptation tasks in vision and NLP, including object recognition, learning from synthetic to real data, and sentiment classification.}\n}", "pdf": "http://proceedings.mlr.press/v97/liu19b/liu19b.pdf", "supp": "", "pdf_size": 3165028, "gs_citation": 314, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1231000223746553465&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff": "School of Software + BNRist, Research Center for Big Data, Tsinghua University, Beijing, China; Department of Electronic Engineering + BNRist, Research Center for Big Data, Tsinghua University, Beijing, China; BNRist, Research Center for Big Data, Tsinghua University, Beijing, China; University of California, Berkeley, USA", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn; ; ", "email": "mails.tsinghua.edu.cn;tsinghua.edu.cn; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/liu19b.html", "aff_unique_index": "0+1;2+1;1;3", "aff_unique_norm": "School of Software;Tsinghua University;Institution Name Not Provided;University of California, Berkeley", "aff_unique_dep": "Software;Research Center for Big Data;Department of Electronic Engineering;", "aff_unique_url": ";https://www.tsinghua.edu.cn;;https://www.berkeley.edu", "aff_unique_abbr": ";THU;;UC Berkeley", "aff_campus_unique_index": "1;1;1;2", "aff_campus_unique": ";Beijing;Berkeley", "aff_country_unique_index": "1;1;1;2", "aff_country_unique": ";China;United States" }, { "title": "Transferable Clean-Label Poisoning Attacks on Deep Neural Nets", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4267", "id": "4267", "author_site": "Chen Zhu, W. Ronny Huang, Hengduo Li, Gavin Taylor, Christoph Studer, Tom Goldstein", "author": "Chen Zhu; W. Ronny Huang; Hengduo Li; Gavin Taylor; Christoph Studer; Tom Goldstein", "abstract": "In this paper, we explore clean-label poisoning attacks on deep convolutional networks with access to neither the network\u2019s output nor its architecture or parameters. Our goal is to ensure that after injecting the poisons into the training data, a model with unknown architecture and parameters trained on that data will misclassify the target image into a specific class. To achieve this goal, we generate multiple poison images from the base class by adding small perturbations which cause the poison images to trap the target image within their convex polytope in feature space. We also demonstrate that using Dropout during crafting of the poisons and enforcing this objective in multiple layers enhances transferability, enabling attacks against both the transfer learning and end-to-end training settings. We demonstrate transferable attack success rates of over 50% by poisoning only 1% of the training set.", "bibtex": "@InProceedings{pmlr-v97-zhu19a,\n title = \t {Transferable Clean-Label Poisoning Attacks on Deep Neural Nets},\n author = {Zhu, Chen and Huang, W. 
Ronny and Li, Hengduo and Taylor, Gavin and Studer, Christoph and Goldstein, Tom},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7614--7623},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhu19a/zhu19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhu19a.html},\n abstract = \t {In this paper, we explore clean-label poisoning attacks on deep convolutional networks with access to neither the network\u2019s output nor its architecture or parameters. Our goal is to ensure that after injecting the poisons into the training data, a model with unknown architecture and parameters trained on that data will misclassify the target image into a specific class. To achieve this goal, we generate multiple poison images from the base class by adding small perturbations which cause the poison images to trap the target image within their convex polytope in feature space. We also demonstrate that using Dropout during crafting of the poisons and enforcing this objective in multiple layers enhances transferability, enabling attacks against both the transfer learning and end-to-end training settings. We demonstrate transferable attack success rates of over 50% by poisoning only 1% of the training set.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhu19a/zhu19a.pdf", "supp": "", "pdf_size": 5874078, "gs_citation": 380, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=457598797512585014&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": ";;;;;", "aff_domain": ";;;;;", "email": ";;;;;", "github": "", "project": "", "author_num": 6, "oa": "https://proceedings.mlr.press/v97/zhu19a.html" }, { "title": "Trimming the $\\ell_1$ Regularizer: Statistical Analysis, Optimization, and Applications to Deep Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4087", "id": "4087", "author_site": "Jihun Yun, Peng Zheng, Eunho Yang, Aurelie Lozano, Aleksandr Aravkin", "author": "Jihun Yun; Peng Zheng; Eunho Yang; Aurelie Lozano; Aleksandr Aravkin", "abstract": "We study high-dimensional estimators with the trimmed $\\ell_1$ penalty, which leaves the h largest parameter entries penalty-free. While optimization techniques for this nonconvex penalty have been studied, the statistical properties have not yet been analyzed. We present the first statistical analyses for M-estimation, and characterize support recovery, $\\ell_\\infty$ and $\\ell_2$ error of the trimmed $\\ell_1$ estimates as a function of the trimming parameter h. Our results show different regimes based on how h compares to the true support size. Our second contribution is a new algorithm for the trimmed regularization problem, which has the same theoretical convergence rate as difference of convex (DC) algorithms, but in practice is faster and finds lower objective values. Empirical evaluation of $\\ell_1$ trimming for sparse linear regression and graphical model estimation indicate that trimmed $\\ell_1$ can outperform vanilla $\\ell_1$ and non-convex alternatives. 
Our last contribution is to show that the trimmed penalty is beneficial beyond M-estimation, and yields promising results for two deep learning tasks: input structures recovery and network sparsification.", "bibtex": "@InProceedings{pmlr-v97-yun19a,\n title = \t {Trimming the $\\ell_1$ Regularizer: Statistical Analysis, Optimization, and Applications to Deep Learning},\n author = {Yun, Jihun and Zheng, Peng and Yang, Eunho and Lozano, Aurelie and Aravkin, Aleksandr},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7242--7251},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/yun19a/yun19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/yun19a.html},\n abstract = \t {We study high-dimensional estimators with the trimmed $\\ell_1$ penalty, which leaves the h largest parameter entries penalty-free. While optimization techniques for this nonconvex penalty have been studied, the statistical properties have not yet been analyzed. We present the first statistical analyses for M-estimation, and characterize support recovery, $\\ell_\\infty$ and $\\ell_2$ error of the trimmed $\\ell_1$ estimates as a function of the trimming parameter h. Our results show different regimes based on how h compares to the true support size. Our second contribution is a new algorithm for the trimmed regularization problem, which has the same theoretical convergence rate as difference of convex (DC) algorithms, but in practice is faster and finds lower objective values. Empirical evaluation of $\\ell_1$ trimming for sparse linear regression and graphical model estimation indicate that trimmed $\\ell_1$ can outperform vanilla $\\ell_1$ and non-convex alternatives. Our last contribution is to show that the trimmed penalty is beneficial beyond M-estimation, and yields promising results for two deep learning tasks: input structures recovery and network sparsification.}\n}", "pdf": "http://proceedings.mlr.press/v97/yun19a/yun19a.pdf", "supp": "", "pdf_size": 1112698, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4778764444619086454&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/yun19a.html" }, { "title": "Understanding Geometry of Encoder-Decoder CNNs", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4095", "id": "4095", "author_site": "Jong Chul Ye, woonkyoung Sung", "author": "Jong Chul Ye; Woon Kyoung Sung", "abstract": "Encoder-decoder networks using convolutional neural network (CNN) architecture have been extensively used in deep learning literatures thanks to its excellent performance for various inverse problems in computer vision, medical imaging, etc. However, it is still difficult to obtain coherent geometric view why such an architecture gives the desired performance. Inspired by recent theoretical understanding on generalizability, expressivity and optimization landscape of neural networks, as well as the theory of convolutional framelets, here we provide a unified theoretical framework that leads to a better understanding of geometry of encoder-decoder CNNs. 
Our unified mathematical framework shows that encoder-decoder CNN architecture is closely related to nonlinear basis representation using combinatorial convolution frames, whose expressibility increases exponentially with the network depth. We also demonstrate the importance of skipped connection in terms of expressibility, and optimization landscape.", "bibtex": "@InProceedings{pmlr-v97-ye19a,\n title = \t {Understanding Geometry of Encoder-Decoder {CNN}s},\n author = {Ye, Jong Chul and Sung, Woon Kyoung},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7064--7073},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ye19a/ye19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ye19a.html},\n abstract = \t {Encoder-decoder networks using convolutional neural network (CNN) architecture have been extensively used in deep learning literatures thanks to its excellent performance for various inverse problems in computer vision, medical imaging, etc. However, it is still difficult to obtain coherent geometric view why such an architecture gives the desired performance. Inspired by recent theoretical understanding on generalizability, expressivity and optimization landscape of neural networks, as well as the theory of convolutional framelets, here we provide a unified theoretical framework that leads to a better understanding of geometry of encoder-decoder CNNs. Our unified mathematical framework shows that encoder-decoder CNN architecture is closely related to nonlinear basis representation using combinatorial convolution frames, whose expressibility increases exponentially with the network depth. We also demonstrate the importance of skipped connection in terms of expressibility, and optimization landscape.}\n}", "pdf": "http://proceedings.mlr.press/v97/ye19a/ye19a.pdf", "supp": "", "pdf_size": 434989, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17776985857091364891&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "aff": "Dept. of Bio/Brain Engineering, KAIST Daejeon 34141, Republic of Korea; Dept. of Mathematical Sciences, KAIST, Daejeon 34141, Republic of Korea", "aff_domain": "kaist.ac.kr; ", "email": "kaist.ac.kr; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/ye19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "KAIST", "aff_unique_dep": "Dept. of Bio/Brain Engineering", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Daejeon", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Understanding Impacts of High-Order Loss Approximations and Features in Deep Learning Interpretation", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3755", "id": "3755", "author_site": "Sahil Singla, Eric Wallace, Shi Feng, Soheil Feizi", "author": "Sahil Singla; Eric Wallace; Shi Feng; Soheil Feizi", "abstract": "Current saliency map interpretations for neural networks generally rely on two key assumptions. First, they use first-order approximations of the loss function, neglecting higher-order terms such as the loss curvature. 
Second, they evaluate each feature\u2019s importance in isolation, ignoring feature interdependencies. This work studies the effect of relaxing these two assumptions. First, we characterize a closed-form formula for the input Hessian matrix of a deep ReLU network. Using this formula, we show that, for classification problems with many classes, if a prediction has high probability then including the Hessian term has a small impact on the interpretation. We prove this result by demonstrating that these conditions cause the Hessian matrix to be approximately rank one and its leading eigenvector to be almost parallel to the gradient of the loss. We empirically validate this theory by interpreting ImageNet classifiers. Second, we incorporate feature interdependencies by calculating the importance of group-features using a sparsity regularization term. We use an L0 - L1 relaxation technique along with proximal gradient descent to efficiently compute group-feature importance values. Our empirical results show that our method significantly improves deep learning interpretations.", "bibtex": "@InProceedings{pmlr-v97-singla19a,\n title = \t {Understanding Impacts of High-Order Loss Approximations and Features in Deep Learning Interpretation},\n author = {Singla, Sahil and Wallace, Eric and Feng, Shi and Feizi, Soheil},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5848--5856},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/singla19a/singla19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/singla19a.html},\n abstract = \t {Current saliency map interpretations for neural networks generally rely on two key assumptions. First, they use first-order approximations of the loss function, neglecting higher-order terms such as the loss curvature. Second, they evaluate each feature\u2019s importance in isolation, ignoring feature interdependencies. This work studies the effect of relaxing these two assumptions. First, we characterize a closed-form formula for the input Hessian matrix of a deep ReLU network. Using this formula, we show that, for classification problems with many classes, if a prediction has high probability then including the Hessian term has a small impact on the interpretation. We prove this result by demonstrating that these conditions cause the Hessian matrix to be approximately rank one and its leading eigenvector to be almost parallel to the gradient of the loss. We empirically validate this theory by interpreting ImageNet classifiers. Second, we incorporate feature interdependencies by calculating the importance of group-features using a sparsity regularization term. We use an L0 - L1 relaxation technique along with proximal gradient descent to efficiently compute group-feature importance values. 
Our empirical results show that our method significantly improves deep learning interpretations.}\n}", "pdf": "http://proceedings.mlr.press/v97/singla19a/singla19a.pdf", "supp": "", "pdf_size": 3281172, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17624808507201697872&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Computer Science Department, University of Maryland; Computer Science Department, University of Maryland; Computer Science Department, University of Maryland; Computer Science Department, University of Maryland", "aff_domain": "cs.umd.edu; ; ;cs.umd.edu", "email": "cs.umd.edu; ; ;cs.umd.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/singla19a.html", "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Maryland", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www/umd.edu", "aff_unique_abbr": "UMD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding MCMC Dynamics as Flows on the Wasserstein Space", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3822", "id": "3822", "author_site": "Chang Liu, Jingwei Zhuo, Jun Zhu", "author": "Chang Liu; Jingwei Zhuo; Jun Zhu", "abstract": "It is known that the Langevin dynamics used in MCMC is the gradient flow of the KL divergence on the Wasserstein space, which helps convergence analysis and inspires recent particle-based variational inference methods (ParVIs). But no more MCMC dynamics is understood in this way. In this work, by developing novel concepts, we propose a theoretical framework that recognizes a general MCMC dynamics as the fiber-gradient Hamiltonian flow on the Wasserstein space of a fiber-Riemannian Poisson manifold. The \"conservation + convergence\" structure of the flow gives a clear picture on the behavior of general MCMC dynamics. The framework also enables ParVI simulation of MCMC dynamics, which enriches the ParVI family with more efficient dynamics, and also adapts ParVI advantages to MCMCs. We develop two ParVI methods for a particular MCMC dynamics and demonstrate the benefits in experiments.", "bibtex": "@InProceedings{pmlr-v97-liu19j,\n title = \t {Understanding {MCMC} Dynamics as Flows on the {W}asserstein Space},\n author = {Liu, Chang and Zhuo, Jingwei and Zhu, Jun},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4093--4103},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liu19j/liu19j.pdf},\n url = \t {https://proceedings.mlr.press/v97/liu19j.html},\n abstract = \t {It is known that the Langevin dynamics used in MCMC is the gradient flow of the KL divergence on the Wasserstein space, which helps convergence analysis and inspires recent particle-based variational inference methods (ParVIs). But no more MCMC dynamics is understood in this way. In this work, by developing novel concepts, we propose a theoretical framework that recognizes a general MCMC dynamics as the fiber-gradient Hamiltonian flow on the Wasserstein space of a fiber-Riemannian Poisson manifold. The \"conservation + convergence\" structure of the flow gives a clear picture on the behavior of general MCMC dynamics. 
The framework also enables ParVI simulation of MCMC dynamics, which enriches the ParVI family with more efficient dynamics, and also adapts ParVI advantages to MCMCs. We develop two ParVI methods for a particular MCMC dynamics and demonstrate the benefits in experiments.}\n}", "pdf": "http://proceedings.mlr.press/v97/liu19j/liu19j.pdf", "supp": "", "pdf_size": 828154, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16148000850438563191&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "Dept. of Comp. Sci. & Tech., Institute for AI, BNRist Center, Tsinghua-Fuzhou Inst. for Data Tech., THBI Lab, Tsinghua University, Beijing, 100084, China; Dept. of Comp. Sci. & Tech., Institute for AI, BNRist Center, Tsinghua-Fuzhou Inst. for Data Tech., THBI Lab, Tsinghua University, Beijing, 100084, China; Dept. of Comp. Sci. & Tech., Institute for AI, BNRist Center, Tsinghua-Fuzhou Inst. for Data Tech., THBI Lab, Tsinghua University, Beijing, 100084, China", "aff_domain": "tsinghua.edu.cn; ; ", "email": "tsinghua.edu.cn; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/liu19j.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "Dept. of Comp. Sci. & Tech.", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Beijing", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "title": "Understanding Priors in Bayesian Neural Networks at the Unit Level", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4258", "id": "4258", "author_site": "Mariia Vladimirova, Jakob Verbeek, Pablo Mesejo, Julyan Arbel", "author": "Mariia Vladimirova; Jakob Verbeek; Pablo Mesejo; Julyan Arbel", "abstract": "We investigate deep Bayesian neural networks with Gaussian priors on the weights and a class of ReLU-like nonlinearities. Bayesian neural networks with Gaussian priors are well known to induce an L2, \u201cweight decay\u201d, regularization. Our results indicate a more intricate regularization effect at the level of the unit activations. Our main result establishes that the induced prior distribution on the units before and after activation becomes increasingly heavy-tailed with the depth of the layer. We show that first layer units are Gaussian, second layer units are sub-exponential, and units in deeper layers are characterized by sub-Weibull distributions. Our results provide new theoretical insight on deep Bayesian neural networks, which we corroborate with simulation experiments.", "bibtex": "@InProceedings{pmlr-v97-vladimirova19a,\n title = \t {Understanding Priors in {B}ayesian Neural Networks at the Unit Level},\n author = {Vladimirova, Mariia and Verbeek, Jakob and Mesejo, Pablo and Arbel, Julyan},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6458--6467},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/vladimirova19a/vladimirova19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/vladimirova19a.html},\n abstract = \t {We investigate deep Bayesian neural networks with Gaussian priors on the weights and a class of ReLU-like nonlinearities. 
Bayesian neural networks with Gaussian priors are well known to induce an L2, \u201cweight decay\u201d, regularization. Our results indicate a more intricate regularization effect at the level of the unit activations. Our main result establishes that the induced prior distribution on the units before and after activation becomes increasingly heavy-tailed with the depth of the layer. We show that first layer units are Gaussian, second layer units are sub-exponential, and units in deeper layers are characterized by sub-Weibull distributions. Our results provide new theoretical insight on deep Bayesian neural networks, which we corroborate with simulation experiments.}\n}", "pdf": "http://proceedings.mlr.press/v97/vladimirova19a/vladimirova19a.pdf", "supp": "", "pdf_size": 567216, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2788070974820986555&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France+Moscow Institute of Physics and Technology, 141701 Dolgoprudny, Russia; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France; Andalusian Research Institute in Data Science and Computational Intelligence (DaSCI), University of Granada, 18071 Granada, Spain; Univ. Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK, 38000 Grenoble, France", "aff_domain": "inria.fr; ; ; ", "email": "inria.fr; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/vladimirova19a.html", "aff_unique_index": "0+1;0;2;0", "aff_unique_norm": "Universite Grenoble Alpes;Moscow Institute of Physics and Technology;University of Granada", "aff_unique_dep": ";;Andalusian Research Institute in Data Science and Computational Intelligence (DaSCI)", "aff_unique_url": "https://www.univ-grenoble-alpes.fr;https://www.mipt.ru;https://www.ugr.es", "aff_unique_abbr": "UGA;MIPT;UGr", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Grenoble;;Granada", "aff_country_unique_index": "0+1;0;2;0", "aff_country_unique": "France;Russian Federation;Spain" }, { "title": "Understanding and Accelerating Particle-Based Variational Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3823", "id": "3823", "author_site": "Chang Liu, Jingwei Zhuo, Pengyu Cheng, RUIYI (ROY) ZHANG, Jun Zhu", "author": "Chang Liu; Jingwei Zhuo; Pengyu Cheng; Ruiyi Zhang; Jun Zhu", "abstract": "Particle-based variational inference methods (ParVIs) have gained attention in the Bayesian inference literature, for their capacity to yield flexible and accurate approximations. We explore ParVIs from the perspective of Wasserstein gradient flows, and make both theoretical and practical contributions. We unify various finite-particle approximations that existing ParVIs use, and recognize that the approximation is essentially a compulsory smoothing treatment, in either of two equivalent forms. This novel understanding reveals the assumptions and relations of existing ParVIs, and also inspires new ParVIs. We propose an acceleration framework and a principled bandwidth-selection method for general ParVIs; these are based on the developed theory and leverage the geometry of the Wasserstein space. 
Experimental results show the improved convergence by the acceleration framework and enhanced sample accuracy by the bandwidth-selection method.", "bibtex": "@InProceedings{pmlr-v97-liu19i,\n title = \t {Understanding and Accelerating Particle-Based Variational Inference},\n author = {Liu, Chang and Zhuo, Jingwei and Cheng, Pengyu and Zhang, Ruiyi and Zhu, Jun},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4082--4092},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/liu19i/liu19i.pdf},\n url = \t {https://proceedings.mlr.press/v97/liu19i.html},\n abstract = \t {Particle-based variational inference methods (ParVIs) have gained attention in the Bayesian inference literature, for their capacity to yield flexible and accurate approximations. We explore ParVIs from the perspective of Wasserstein gradient flows, and make both theoretical and practical contributions. We unify various finite-particle approximations that existing ParVIs use, and recognize that the approximation is essentially a compulsory smoothing treatment, in either of two equivalent forms. This novel understanding reveals the assumptions and relations of existing ParVIs, and also inspires new ParVIs. We propose an acceleration framework and a principled bandwidth-selection method for general ParVIs; these are based on the developed theory and leverage the geometry of the Wasserstein space. Experimental results show the improved convergence by the acceleration framework and enhanced sample accuracy by the bandwidth-selection method.}\n}", "pdf": "http://proceedings.mlr.press/v97/liu19i/liu19i.pdf", "supp": "", "pdf_size": 796575, "gs_citation": 111, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7410249710967287826&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "aff": ";;;;", "aff_domain": ";;;;", "email": ";;;;", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/liu19i.html" }, { "title": "Understanding and Controlling Memory in Recurrent Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3902", "id": "3902", "author_site": "Doron Haviv, Alexander Rivkind, Omri Barak", "author": "Doron Haviv; Alexander Rivkind; Omri Barak", "abstract": "To be effective in sequential data processing, Recurrent Neural Networks (RNNs) are required to keep track of past events by creating memories. While the relation between memories and the network\u2019s hidden state dynamics was established over the last decade, previous works in this direction were of a predominantly descriptive nature focusing mainly on locating the dynamical objects of interest. In particular, it remained unclear how dynamical observables affect the performance, how they form and whether they can be manipulated. Here, we utilize different training protocols, datasets and architectures to obtain a range of networks solving a delayed classification task with similar performance, alongside substantial differences in their ability to extrapolate for longer delays. We analyze the dynamics of the network\u2019s hidden state, and uncover the reasons for this difference. Each memory is found to be associated with a nearly steady state of the dynamics which we refer to as a \u2019slow point\u2019. 
Slow point speeds predict extrapolation performance across all datasets, protocols and architectures tested. Furthermore, by tracking the formation of the slow points we are able to understand the origin of differences between training protocols. Finally, we propose a novel regularization technique that is based on the relation between hidden state speeds and memory longevity. Our technique manipulates these speeds, thereby leading to a dramatic improvement in memory robustness over time, and could pave the way for a new class of regularization methods.", "bibtex": "@InProceedings{pmlr-v97-haviv19a,\n title = \t {Understanding and Controlling Memory in Recurrent Neural Networks},\n author = {Haviv, Doron and Rivkind, Alexander and Barak, Omri},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2663--2671},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/haviv19a/haviv19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/haviv19a.html},\n abstract = \t {To be effective in sequential data processing, Recurrent Neural Networks (RNNs) are required to keep track of past events by creating memories. While the relation between memories and the network\u2019s hidden state dynamics was established over the last decade, previous works in this direction were of a predominantly descriptive nature focusing mainly on locating the dynamical objects of interest. In particular, it remained unclear how dynamical observables affect the performance, how they form and whether they can be manipulated. Here, we utilize different training protocols, datasets and architectures to obtain a range of networks solving a delayed classification task with similar performance, alongside substantial differences in their ability to extrapolate for longer delays. We analyze the dynamics of the network\u2019s hidden state, and uncover the reasons for this difference. Each memory is found to be associated with a nearly steady state of the dynamics which we refer to as a \u2019slow point\u2019. Slow point speeds predict extrapolation performance across all datasets, protocols and architectures tested. Furthermore, by tracking the formation of the slow points we are able to understand the origin of differences between training protocols. Finally, we propose a novel regularization technique that is based on the relation between hidden state speeds and memory longevity. 
Our technique manipulates these speeds, thereby leading to a dramatic improvement in memory robustness over time, and could pave the way for a new class of regularization methods.}\n}", "pdf": "http://proceedings.mlr.press/v97/haviv19a/haviv19a.pdf", "supp": "", "pdf_size": 1478882, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8928228287643305651&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Faculty of Electrical Engineering, Technion, Israel Institute of Technology+Network Biology Research Laboratory, Technion, Israel Institute of Technology+Rappaport Faculty of Medicine, Technion, Israel Institute of Technology; Network Biology Research Laboratory, Technion, Israel Institute of Technology+Rappaport Faculty of Medicine, Technion, Israel Institute of Technology+Weizmann Institute of Science, Israel; Network Biology Research Laboratory, Technion, Israel Institute of Technology+Rappaport Faculty of Medicine, Technion, Israel Institute of Technology", "aff_domain": "gmail.com;gmail.com;gmail.com", "email": "gmail.com;gmail.com;gmail.com", "github": "https://github.com/DoronHaviv/MemoryRNN", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/haviv19a.html", "aff_unique_index": "0+0+0;0+0+1;0+0", "aff_unique_norm": "Technion, Israel Institute of Technology;Weizmann Institute of Science", "aff_unique_dep": "Faculty of Electrical Engineering;", "aff_unique_url": "https://www.technion.ac.il;https://www.weizmann.org.il", "aff_unique_abbr": "Technion;Weizmann", "aff_campus_unique_index": ";;", "aff_campus_unique": "", "aff_country_unique_index": "0+0+0;0+0+0;0+0", "aff_country_unique": "Israel" }, { "title": "Understanding and Utilizing Deep Neural Networks Trained with Noisy Labels", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3593", "id": "3593", "author_site": "Pengfei Chen, Ben Liao, Guangyong Chen, Shengyu Zhang", "author": "Pengfei Chen; Ben Ben Liao; Guangyong Chen; Shengyu Zhang", "abstract": "Noisy labels are ubiquitous in real-world datasets, which poses a challenge for robustly training deep neural networks (DNNs) as DNNs usually have the high capacity to memorize the noisy labels. In this paper, we find that the test accuracy can be quantitatively characterized in terms of the noise ratio in datasets. In particular, the test accuracy is a quadratic function of the noise ratio in the case of symmetric noise, which explains the experimental findings previously published. Based on our analysis, we apply cross-validation to randomly split noisy datasets, which identifies most samples that have correct labels. Then we adopt the Co-teaching strategy which takes full advantage of the identified samples to train DNNs robustly against noisy labels. 
Compared with extensive state-of-the-art methods, our strategy consistently improves the generalization performance of DNNs under both synthetic and real-world training noise.", "bibtex": "@InProceedings{pmlr-v97-chen19g,\n title = \t {Understanding and Utilizing Deep Neural Networks Trained with Noisy Labels},\n author = {Chen, Pengfei and Liao, Ben Ben and Chen, Guangyong and Zhang, Shengyu},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1062--1070},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chen19g/chen19g.pdf},\n url = \t {https://proceedings.mlr.press/v97/chen19g.html},\n abstract = \t {Noisy labels are ubiquitous in real-world datasets, which poses a challenge for robustly training deep neural networks (DNNs) as DNNs usually have the high capacity to memorize the noisy labels. In this paper, we find that the test accuracy can be quantitatively characterized in terms of the noise ratio in datasets. In particular, the test accuracy is a quadratic function of the noise ratio in the case of symmetric noise, which explains the experimental findings previously published. Based on our analysis, we apply cross-validation to randomly split noisy datasets, which identifies most samples that have correct labels. Then we adopt the Co-teaching strategy which takes full advantage of the identified samples to train DNNs robustly against noisy labels. Compared with extensive state-of-the-art methods, our strategy consistently improves the generalization performance of DNNs under both synthetic and real-world training noise.}\n}", "pdf": "http://proceedings.mlr.press/v97/chen19g/chen19g.pdf", "supp": "", "pdf_size": 3436794, "gs_citation": 488, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1459914703144318986&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science and Engineering, The Chinese University of Hong Kong+Tencent Technology; Tencent Technology; Tencent Technology; Department of Computer Science and Engineering, The Chinese University of Hong Kong+Tencent Technology", "aff_domain": "tencent.com; ; ;tencent.com", "email": "tencent.com; ; ;tencent.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/chen19g.html", "aff_unique_index": "0+1;1;1;0+1", "aff_unique_norm": "Chinese University of Hong Kong;Tencent", "aff_unique_dep": "Department of Computer Science and Engineering;Tencent Technology", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.tencent.com", "aff_unique_abbr": "CUHK;Tencent", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0+0;0;0;0+0", "aff_country_unique": "China" }, { "title": "Understanding and correcting pathologies in the training of learned optimizers", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3868", "id": "3868", "author_site": "Luke Metz, Niru Maheswaranathan, Jeremy Nixon, Daniel Freeman, Jascha Sohl-Dickstein", "author": "Luke Metz; Niru Maheswaranathan; Jeremy Nixon; Daniel Freeman; Jascha Sohl-Dickstein", "abstract": "Deep learning has shown that learned functions can dramatically outperform hand-designed functions on perceptual tasks. 
Analogously, this suggests that learned optimizers may similarly outperform current hand-designed optimizers, especially for specific problems. However, learned optimizers are notoriously difficult to train and have yet to demonstrate wall-clock speedups over hand-designed optimizers, and thus are rarely used in practice. Typically, learned optimizers are trained by truncated backpropagation through an unrolled optimization process. The resulting gradients are either strongly biased (for short truncations) or have exploding norm (for long truncations). In this work we propose a training scheme which overcomes both of these difficulties, by dynamically weighting two unbiased gradient estimators for a variational loss on optimizer performance. This allows us to train neural networks to perform optimization of a specific task faster than tuned first-order methods. Moreover, by training the optimizer against validation loss (as opposed to training loss), we are able to learn optimizers that train networks to generalize better than first order methods. We demonstrate these results on problems where our learned optimizer trains convolutional networks faster in wall-clock time compared to tuned first-order methods and with an improvement in test loss.", "bibtex": "@InProceedings{pmlr-v97-metz19a,\n title = \t {Understanding and correcting pathologies in the training of learned optimizers},\n author = {Metz, Luke and Maheswaranathan, Niru and Nixon, Jeremy and Freeman, Daniel and Sohl-Dickstein, Jascha},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4556--4565},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/metz19a/metz19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/metz19a.html},\n abstract = \t {Deep learning has shown that learned functions can dramatically outperform hand-designed functions on perceptual tasks. Analogously, this suggests that learned optimizers may similarly outperform current hand-designed optimizers, especially for specific problems. However, learned optimizers are notoriously difficult to train and have yet to demonstrate wall-clock speedups over hand-designed optimizers, and thus are rarely used in practice. Typically, learned optimizers are trained by truncated backpropagation through an unrolled optimization process. The resulting gradients are either strongly biased (for short truncations) or have exploding norm (for long truncations). In this work we propose a training scheme which overcomes both of these difficulties, by dynamically weighting two unbiased gradient estimators for a variational loss on optimizer performance. This allows us to train neural networks to perform optimization of a specific task faster than tuned first-order methods. Moreover, by training the optimizer against validation loss (as opposed to training loss), we are able to learn optimizers that train networks to generalize better than first order methods. 
We demonstrate these results on problems where our learned optimizer trains convolutional networks faster in wall-clock time compared to tuned first-order methods and with an improvement in test loss.}\n}", "pdf": "http://proceedings.mlr.press/v97/metz19a/metz19a.pdf", "supp": "", "pdf_size": 2342990, "gs_citation": 177, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17084395700030116025&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Google Brain; Google Brain; Google Brain; Google Brain; Google Brain", "aff_domain": "google.com; ; ; ; ", "email": "google.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/metz19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google Brain", "aff_unique_url": "https://brain.google.com", "aff_unique_abbr": "Google Brain", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "title": "Understanding the Impact of Entropy on Policy Optimization", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4104", "id": "4104", "author_site": "Zafarali Ahmed, Nicolas Le Roux, Mohammad Norouzi, Dale Schuurmans", "author": "Zafarali Ahmed; Nicolas Le Roux; Mohammad Norouzi; Dale Schuurmans", "abstract": "Entropy regularization is commonly used to improve policy optimization in reinforcement learning. It is believed to help with exploration by encouraging the selection of more stochastic policies. In this work, we analyze this claim using new visualizations of the optimization landscape based on randomly perturbing the loss function. We first show that even with access to the exact gradient, policy optimization is difficult due to the geometry of the objective function. We then qualitatively show that in some environments, a policy with higher entropy can make the optimization landscape smoother, thereby connecting local optima and enabling the use of larger learning rates. This paper presents new tools for understanding the optimization landscape, shows that policy entropy serves as a regularizer, and highlights the challenge of designing general-purpose policy optimization algorithms.", "bibtex": "@InProceedings{pmlr-v97-ahmed19a,\n title = \t {Understanding the Impact of Entropy on Policy Optimization},\n author = {Ahmed, Zafarali and Le Roux, Nicolas and Norouzi, Mohammad and Schuurmans, Dale},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {151--160},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ahmed19a/ahmed19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/ahmed19a.html},\n abstract = \t {Entropy regularization is commonly used to improve policy optimization in reinforcement learning. It is believed to help with exploration by encouraging the selection of more stochastic policies. In this work, we analyze this claim using new visualizations of the optimization landscape based on randomly perturbing the loss function. We first show that even with access to the exact gradient, policy optimization is difficult due to the geometry of the objective function. 
We then qualitatively show that in some environments, a policy with higher entropy can make the optimization landscape smoother, thereby connecting local optima and enabling the use of larger learning rates. This paper presents new tools for understanding the optimization landscape, shows that policy entropy serves as a regularizer, and highlights the challenge of designing general-purpose policy optimization algorithms.}\n}", "pdf": "http://proceedings.mlr.press/v97/ahmed19a/ahmed19a.pdf", "supp": "", "pdf_size": 965925, "gs_citation": 306, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8905478721868235472&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Mila, McGill University, Montr\u00e9al, Canada+Google Research; Google Research; Google Research+University of Alberta; University of Alberta", "aff_domain": "mail.mcgill.ca; ; ; ", "email": "mail.mcgill.ca; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/ahmed19a.html", "aff_unique_index": "0+1;1;1+2;2", "aff_unique_norm": "McGill University;Google;University of Alberta", "aff_unique_dep": "Mila;Google Research;", "aff_unique_url": "https://www.mcgill.ca;https://research.google;https://www.ualberta.ca", "aff_unique_abbr": "McGill;Google Research;UAlberta", "aff_campus_unique_index": "0+1;1;1", "aff_campus_unique": "Montr\u00e9al;Mountain View;", "aff_country_unique_index": "0+1;1;1+0;0", "aff_country_unique": "Canada;United States" }, { "title": "Understanding the Origins of Bias in Word Embeddings", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3620", "id": "3620", "author_site": "Marc-Etienne Brunet, Colleen Alkalay-Houlihan, Ashton Anderson, Richard Zemel", "author": "Marc-Etienne Brunet; Colleen Alkalay-Houlihan; Ashton Anderson; Richard Zemel", "abstract": "Popular word embedding algorithms exhibit stereotypical biases, such as gender bias. The widespread use of these algorithms in machine learning systems can amplify stereotypes in important contexts. 
Although some methods have been developed to mitigate this problem, how word embedding biases arise during training is poorly understood. In this work we develop a technique to address this question. Given a word embedding, our method reveals how perturbing the training corpus would affect the resulting embedding bias. By tracing the origins of word embedding bias back to the original training documents, one can identify subsets of documents whose removal would most reduce bias. We demonstrate our methodology on Wikipedia and New York Times corpora, and find it to be very accurate.}\n}", "pdf": "http://proceedings.mlr.press/v97/brunet19a/brunet19a.pdf", "supp": "", "pdf_size": 502530, "gs_citation": 290, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18061585171680402541&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Department of Computer Science, University of Toronto, Toronto, Canada+Vector Institute for Artificial Intelligence, Toronto, Canada; Department of Computer Science, University of Toronto, Toronto, Canada; Department of Computer Science, University of Toronto, Toronto, Canada+Vector Institute for Artificial Intelligence, Toronto, Canada; Department of Computer Science, University of Toronto, Toronto, Canada+Vector Institute for Artificial Intelligence, Toronto, Canada", "aff_domain": "cs.toronto.edu; ; ; ", "email": "cs.toronto.edu; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/brunet19a.html", "aff_unique_index": "0+1;0;0+1;0+1", "aff_unique_norm": "University of Toronto;Vector Institute for Artificial Intelligence", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.utoronto.ca;https://vectorinstitute.ai", "aff_unique_abbr": "U of T;Vector Institute", "aff_campus_unique_index": "0+0;0;0+0;0+0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0+0;0;0+0;0+0", "aff_country_unique": "Canada" }, { "title": "Uniform Convergence Rate of the Kernel Density Estimator Adaptive to Intrinsic Volume Dimension", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4108", "id": "4108", "author_site": "Jisu Kim, Jaehyeok Shin, Alessandro Rinaldo, Larry Wasserman", "author": "Jisu Kim; Jaehyeok Shin; Alessandro Rinaldo; Larry Wasserman", "abstract": "We derive concentration inequalities for the supremum norm of the difference between a kernel density estimator (KDE) and its point-wise expectation that hold uniformly over the selection of the bandwidth and under weaker conditions on the kernel and the data generating distribution than previously used in the literature. We first propose a novel concept, called the volume dimension, to measure the intrinsic dimension of the support of a probability distribution based on the rates of decay of the probability of vanishing Euclidean balls. Our bounds depend on the volume dimension and generalize the existing bounds derived in the literature. In particular, when the data-generating distribution has a bounded Lebesgue density or is supported on a sufficiently well-behaved lower-dimensional manifold, our bound recovers the same convergence rate depending on the intrinsic dimension of the support as ones known in the literature. At the same time, our results apply to more general cases, such as the ones of distribution with unbounded densities or supported on a mixture of manifolds with different dimensions. Analogous bounds are derived for the derivative of the KDE, of any order. 
Our results are generally applicable but are especially useful for problems in geometric inference and topological data analysis, including level set estimation, density-based clustering, modal clustering and mode hunting, ridge estimation and persistent homology.", "bibtex": "@InProceedings{pmlr-v97-kim19e,\n title = \t {Uniform Convergence Rate of the Kernel Density Estimator Adaptive to Intrinsic Volume Dimension},\n author = {Kim, Jisu and Shin, Jaehyeok and Rinaldo, Alessandro and Wasserman, Larry},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {3398--3407},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/kim19e/kim19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/kim19e.html},\n abstract = \t {We derive concentration inequalities for the supremum norm of the difference between a kernel density estimator (KDE) and its point-wise expectation that hold uniformly over the selection of the bandwidth and under weaker conditions on the kernel and the data generating distribution than previously used in the literature. We first propose a novel concept, called the volume dimension, to measure the intrinsic dimension of the support of a probability distribution based on the rates of decay of the probability of vanishing Euclidean balls. Our bounds depend on the volume dimension and generalize the existing bounds derived in the literature. In particular, when the data-generating distribution has a bounded Lebesgue density or is supported on a sufficiently well-behaved lower-dimensional manifold, our bound recovers the same convergence rate depending on the intrinsic dimension of the support as ones known in the literature. At the same time, our results apply to more general cases, such as the ones of distribution with unbounded densities or supported on a mixture of manifolds with different dimensions. Analogous bounds are derived for the derivative of the KDE, of any order. 
Our results are generally applicable but are especially useful for problems in geometric inference and topological data analysis, including level set estimation, density-based clustering, modal clustering and mode hunting, ridge estimation and persistent homology.}\n}", "pdf": "http://proceedings.mlr.press/v97/kim19e/kim19e.pdf", "supp": "", "pdf_size": 316502, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12897563533814063621&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "aff": "Inria Saclay \u2013 \u00cele-de-France, Palaiseau, France; Department of Statistics and Data Science, Carnegie Mellon University, Pittsburgh, USA; Department of Statistics and Data Science, Carnegie Mellon University, Pittsburgh, USA; Department of Statistics and Data Science, Carnegie Mellon University, Pittsburgh, USA", "aff_domain": "inria.fr; ; ; ", "email": "inria.fr; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/kim19e.html", "aff_unique_index": "0;1;1;1", "aff_unique_norm": "INRIA;Carnegie Mellon University", "aff_unique_dep": ";Department of Statistics and Data Science", "aff_unique_url": "https://www.inria.fr;https://www.cmu.edu", "aff_unique_abbr": "Inria;CMU", "aff_campus_unique_index": "0;1;1;1", "aff_campus_unique": "Saclay;Pittsburgh", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "France;United States" }, { "title": "Unifying Orthogonal Monte Carlo Methods", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3580", "id": "3580", "author_site": "Krzysztof Choromanski, Mark Rowland, Wenyu Chen, Adrian Weller", "author": "Krzysztof Choromanski; Mark Rowland; Wenyu Chen; Adrian Weller", "abstract": "Many machine learning methods making use of Monte Carlo sampling in vector spaces have been shown to be improved by conditioning samples to be mutually orthogonal. Exact orthogonal coupling of samples is computationally intensive, hence approximate methods have been of great interest. In this paper, we present a unifying perspective of many approximate methods by considering Givens transformations, propose new approximate methods based on this framework, and demonstrate the first statistical guarantees for families of approximate methods in kernel approximation. We provide extensive empirical evaluations with guidance for practitioners.", "bibtex": "@InProceedings{pmlr-v97-choromanski19a,\n title = \t {Unifying Orthogonal {M}onte {C}arlo Methods},\n author = {Choromanski, Krzysztof and Rowland, Mark and Chen, Wenyu and Weller, Adrian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1203--1212},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/choromanski19a/choromanski19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/choromanski19a.html},\n abstract = \t {Many machine learning methods making use of Monte Carlo sampling in vector spaces have been shown to be improved by conditioning samples to be mutually orthogonal. Exact orthogonal coupling of samples is computationally intensive, hence approximate methods have been of great interest. 
In this paper, we present a unifying perspective of many approximate methods by considering Givens transformations, propose new approximate methods based on this framework, and demonstrate the first statistical guarantees for families of approximate methods in kernel approximation. We provide extensive empirical evaluations with guidance for practitioners.}\n}", "pdf": "http://proceedings.mlr.press/v97/choromanski19a/choromanski19a.pdf", "supp": "", "pdf_size": 1180973, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13002193111593631343&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff": "Google Brain; University of Cambridge; Massachusetts Institute of Technology; University of Cambridge + Alan Turing Institute", "aff_domain": "google.com; ; ; ", "email": "google.com; ; ; ", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/choromanski19a.html", "aff_unique_index": "0;1;2;1+3", "aff_unique_norm": "Google;University of Cambridge;Massachusetts Institute of Technology;Alan Turing Institute", "aff_unique_dep": "Google Brain;;;", "aff_unique_url": "https://brain.google.com;https://www.cam.ac.uk;https://web.mit.edu;https://www.turing.ac.uk", "aff_unique_abbr": "Google Brain;Cambridge;MIT;ATI", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Mountain View;Cambridge;", "aff_country_unique_index": "0;1;0;1+1", "aff_country_unique": "United States;United Kingdom" }, { "id": "5065873f75", "title": "Universal Multi-Party Poisoning Attacks", "site": "https://proceedings.mlr.press/v97/mahloujifar19a.html", "author": "Saeed Mahloujifar; Mohammad Mahmoody; Ameer Mohammed", "abstract": "In this work, we demonstrate universal multi-party poisoning attacks that adapt and apply to any multi-party learning process with arbitrary interaction pattern between the parties. More generally, we introduce and study $(k,p)$-poisoning attacks in which an adversary controls $k\\in[m]$ of the parties, and for each corrupted party $P_i$, the adversary submits some poisoned data $T\u2019_i$ on behalf of $P_i$ that is still \"$(1-p)$-close\" to the correct data $T_i$ (e.g., $1-p$ fraction of $T\u2019_i$ is still honestly generated). We prove that for any \"bad\" property $B$ of the final trained hypothesis $h$ (e.g., $h$ failing on a particular test example or having \"large\" risk) that has an arbitrarily small constant probability of happening without the attack, there always is a $(k,p)$-poisoning attack that increases the probability of $B$ from $\\mu$ to $\\mu^{1-p \\cdot k/m} = \\mu + \\Omega(p \\cdot k/m)$. 
Our attack only uses clean labels, and it is online, as it only knows the data shared so far.", "bibtex": "@InProceedings{pmlr-v97-mahloujifar19a,\n title = \t {Universal Multi-Party Poisoning Attacks},\n author = {Mahloujifar, Saeed and Mahmoody, Mohammad and Mohammed, Ameer},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4274--4283},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/mahloujifar19a/mahloujifar19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/mahloujifar19a.html},\n abstract = \t {In this work, we demonstrate universal multi-party poisoning attacks that adapt and apply to any multi-party learning process with arbitrary interaction pattern between the parties. More generally, we introduce and study $(k,p)$-poisoning attacks in which an adversary controls $k\\in[m]$ of the parties, and for each corrupted party $P_i$, the adversary submits some poisoned data $T\u2019_i$ on behalf of $P_i$ that is still \"$(1-p)$-close\" to the correct data $T_i$ (e.g., $1-p$ fraction of $T\u2019_i$ is still honestly generated). We prove that for any \"bad\" property $B$ of the final trained hypothesis $h$ (e.g., $h$ failing on a particular test example or having \"large\" risk) that has an arbitrarily small constant probability of happening without the attack, there always is a $(k,p)$-poisoning attack that increases the probability of $B$ from $\\mu$ to $\\mu^{1-p \\cdot k/m} = \\mu + \\Omega(p \\cdot k/m)$. Our attack only uses clean labels, and it is online, as it only knows the data shared so far.}\n}", "pdf": "http://proceedings.mlr.press/v97/mahloujifar19a/mahloujifar19a.pdf", "supp": "", "pdf_size": 336929, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13780641017263757068&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff": "University of Virginia; University of Virginia; University of Kuwait", "aff_domain": "virginia.edu;virginia.edu;ku.edu.kw", "email": "virginia.edu;virginia.edu;ku.edu.kw", "github": "", "project": "", "author_num": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Virginia;University of Kuwait", "aff_unique_dep": ";", "aff_unique_url": "https://www.virginia.edu;https://www.ku.edu.kw", "aff_unique_abbr": "UVA;UK", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "United States;Kuwait" }, { "title": "Unreproducible Research is Reproducible", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4177", "id": "4177", "author_site": "Xavier Bouthillier, C\u00e9sar Laurent, Pascal Vincent", "author": "Xavier Bouthillier; C\u00e9sar Laurent; Pascal Vincent", "abstract": "The apparent contradiction in the title is a wordplay on the different meanings attributed to the word reproducible across different scientific fields. What we imply is that unreproducible findings can be built upon reproducible methods. Without denying the importance of facilitating the reproduction of methods, we deem important to reassert that reproduction of findings is a fundamental step of the scientific inquiry. 
We argue that the commendable quest towards easy deterministic reproducibility of methods and numerical results should not have us forget the even more important necessity of ensuring the reproducibility of empirical findings and conclusions by properly accounting for essential sources of variations. We provide experiments to exemplify the brittleness of current common practice in the evaluation of models in the field of deep learning, showing that even if the results could be reproduced, a slightly different experiment would not support the findings. We hope to help clarify the distinction between exploratory and empirical research in the field of deep learning and believe more energy should be devoted to proper empirical research in our community. This work is an attempt to promote the use of more rigorous and diversified methodologies. It is not an attempt to impose a new methodology and it is not a critique on the nature of exploratory research.", "bibtex": "@InProceedings{pmlr-v97-bouthillier19a,\n title = \t {Unreproducible Research is Reproducible},\n author = {Bouthillier, Xavier and Laurent, C{\\'e}sar and Vincent, Pascal},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {725--734},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/bouthillier19a/bouthillier19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/bouthillier19a.html},\n abstract = \t {The apparent contradiction in the title is a wordplay on the different meanings attributed to the word reproducible across different scientific fields. What we imply is that unreproducible findings can be built upon reproducible methods. Without denying the importance of facilitating the reproduction of methods, we deem important to reassert that reproduction of findings is a fundamental step of the scientific inquiry. We argue that the commendable quest towards easy deterministic reproducibility of methods and numerical results should not have us forget the even more important necessity of ensuring the reproducibility of empirical findings and conclusions by properly accounting for essential sources of variations. We provide experiments to exemplify the brittleness of current common practice in the evaluation of models in the field of deep learning, showing that even if the results could be reproduced, a slightly different experiment would not support the findings. We hope to help clarify the distinction between exploratory and empirical research in the field of deep learning and believe more energy should be devoted to proper empirical research in our community. This work is an attempt to promote the use of more rigorous and diversified methodologies. 
It is not an attempt to impose a new methodology and it is not a critique on the nature of exploratory research.}\n}", "pdf": "http://proceedings.mlr.press/v97/bouthillier19a/bouthillier19a.pdf", "supp": "", "pdf_size": 2775389, "gs_citation": 127, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8025844255333126719&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff": "Mila, Universit\u00e9 de Montr\u00e9al+Facebook AI Research+Canadian Institute for Advanced Research (CIFAR); Mila, Universit\u00e9 de Montr\u00e9al+Facebook AI Research+Canadian Institute for Advanced Research (CIFAR); Mila, Universit\u00e9 de Montr\u00e9al+Facebook AI Research+Canadian Institute for Advanced Research (CIFAR)", "aff_domain": "umontreal.ca; ; ", "email": "umontreal.ca; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/bouthillier19a.html", "aff_unique_index": "0+1+2;0+1+2;0+1+2", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Meta;Canadian Institute for Advanced Research", "aff_unique_dep": "Mila;Facebook AI Research;", "aff_unique_url": "https://www.mila.quebec;https://research.facebook.com;https://www.cifar.ca", "aff_unique_abbr": "Mila;FAIR;CIFAR", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Montr\u00e9al;", "aff_country_unique_index": "0+1+0;0+1+0;0+1+0", "aff_country_unique": "Canada;United States" }, { "title": "Unsupervised Deep Learning by Neighbourhood Discovery", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3564", "id": "3564", "author_site": "Jiabo Huang, Qi Dong, Shaogang Gong, Xiatian Zhu", "author": "Jiabo Huang; Qi Dong; Shaogang Gong; Xiatian Zhu", "abstract": "Deep convolutional neural networks (CNNs) have demonstrated remarkable success in computer vision by supervisedly learning strong visual feature representations. However, training CNNs relies heavily on the availability of exhaustive training data annotations, limiting significantly their deployment and scalability in many application scenarios. In this work, we introduce a generic unsupervised deep learning approach to training deep models without the need for any manual label supervision. Specifically, we progressively discover sample anchored/centred neighbourhoods to reason and learn the underlying class decision boundaries iteratively and accumulatively. Every single neighbourhood is specially formulated so that all the member samples can share the same unseen class labels at high probability for facilitating the extraction of class discriminative feature representations during training. 
Experiments on image classification show the performance advantages of the proposed method over the state-of-the-art unsupervised learning models on six benchmarks including both coarse-grained and fine-grained object image categorisation.", "bibtex": "@InProceedings{pmlr-v97-huang19b,\n title = \t {Unsupervised Deep Learning by Neighbourhood Discovery},\n author = {Huang, Jiabo and Dong, Qi and Gong, Shaogang and Zhu, Xiatian},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2849--2858},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/huang19b/huang19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/huang19b.html},\n abstract = \t {Deep convolutional neural networks (CNNs) have demonstrated remarkable success in computer vision by supervisedly learning strong visual feature representations. However, training CNNs relies heavily on the availability of exhaustive training data annotations, limiting significantly their deployment and scalability in many application scenarios. In this work, we introduce a generic unsupervised deep learning approach to training deep models without the need for any manual label supervision. Specifically, we progressively discover sample anchored/centred neighbourhoods to reason and learn the underlying class decision boundaries iteratively and accumulatively. Every single neighbourhood is specially formulated so that all the member samples can share the same unseen class labels at high probability for facilitating the extraction of class discriminative feature representations during training. Experiments on image classification show the performance advantages of the proposed method over the state-of-the-art unsupervised learning models on six benchmarks including both coarse-grained and fine-grained object image categorisation.}\n}", "pdf": "http://proceedings.mlr.press/v97/huang19b/huang19b.pdf", "supp": "", "pdf_size": 1633291, "gs_citation": 185, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2594287551241248539&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Queen Mary University of London; Queen Mary University of London; Queen Mary University of London; Vision Semantics Limited", "aff_domain": "gmail.com; ; ; ", "email": "gmail.com; ; ; ", "github": "https://github.com/raymond-sci/AND", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/huang19b.html", "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Queen Mary University of London;Vision Semantics Limited", "aff_unique_dep": ";", "aff_unique_url": "https://www.qmul.ac.uk;", "aff_unique_abbr": "QMUL;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "London;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "Unsupervised Label Noise Modeling and Loss Correction", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3738", "id": "3738", "author_site": "Eric Arazo, Diego Ortego, Paul Albert, Noel O'Connor, Kevin McGuinness", "author": "Eric Arazo; Diego Ortego; Paul Albert; Noel O\u2019Connor; Kevin Mcguinness", "abstract": "Despite being robust to small amounts of label noise, convolutional neural networks trained with stochastic gradient methods have been shown to easily fit random labels. 
When there are a mixture of correct and mislabelled targets, networks tend to fit the former before the latter. This suggests using a suitable two-component mixture model as an unsupervised generative model of sample loss values during training to allow online estimation of the probability that a sample is mislabelled. Specifically, we propose a beta mixture to estimate this probability and correct the loss by relying on the network prediction (the so-called bootstrapping loss). We further adapt mixup augmentation to drive our approach a step further. Experiments on CIFAR-10/100 and TinyImageNet demonstrate a robustness to label noise that substantially outperforms recent state-of-the-art. Source code is available at https://git.io/fjsvE and Appendix at https://arxiv.org/abs/1904.11238.", "bibtex": "@InProceedings{pmlr-v97-arazo19a,\n title = \t {Unsupervised Label Noise Modeling and Loss Correction},\n author = {Arazo, Eric and Ortego, Diego and Albert, Paul and O'Connor, Noel and Mcguinness, Kevin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {312--321},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/arazo19a/arazo19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/arazo19a.html},\n abstract = \t {Despite being robust to small amounts of label noise, convolutional neural networks trained with stochastic gradient methods have been shown to easily fit random labels. When there are a mixture of correct and mislabelled targets, networks tend to fit the former before the latter. This suggests using a suitable two-component mixture model as an unsupervised generative model of sample loss values during training to allow online estimation of the probability that a sample is mislabelled. Specifically, we propose a beta mixture to estimate this probability and correct the loss by relying on the network prediction (the so-called bootstrapping loss). We further adapt mixup augmentation to drive our approach a step further. Experiments on CIFAR-10/100 and TinyImageNet demonstrate a robustness to label noise that substantially outperforms recent state-of-the-art. 
Source code is available at https://git.io/fjsvE and Appendix at https://arxiv.org/abs/1904.11238.}\n}", "pdf": "http://proceedings.mlr.press/v97/arazo19a/arazo19a.pdf", "supp": "", "pdf_size": 3693328, "gs_citation": 790, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7008706792393698059&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Insight Centre for Data Analytics, Dublin City University (DCU), Dublin, Ireland; Insight Centre for Data Analytics, Dublin City University (DCU), Dublin, Ireland; Insight Centre for Data Analytics, Dublin City University (DCU), Dublin, Ireland; Insight Centre for Data Analytics, Dublin City University (DCU), Dublin, Ireland; Insight Centre for Data Analytics, Dublin City University (DCU), Dublin, Ireland", "aff_domain": "insight-centre.org;insight-centre.org; ; ; ", "email": "insight-centre.org;insight-centre.org; ; ; ", "github": "https://git.io/fjsvE", "project": "https://arxiv.org/abs/1904.11238", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/arazo19a.html", "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Dublin City University", "aff_unique_dep": "Insight Centre for Data Analytics", "aff_unique_url": "https://www.dcu.ie", "aff_unique_abbr": "DCU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Dublin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Ireland" }, { "title": "Using Pre-Training Can Improve Model Robustness and Uncertainty", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4237", "id": "4237", "author_site": "Dan Hendrycks, Kimin Lee, Mantas Mazeika", "author": "Dan Hendrycks; Kimin Lee; Mantas Mazeika", "abstract": "He et al. (2018) have called into question the utility of pre-training by showing that training from scratch can often yield similar performance to pre-training. We show that although pre-training may not improve performance on traditional classification metrics, it improves model robustness and uncertainty estimates. Through extensive experiments on label corruption, class imbalance, adversarial examples, out-of-distribution detection, and confidence calibration, we demonstrate large gains from pre-training and complementary effects with task-specific methods. We show approximately a 10% absolute improvement over the previous state-of-the-art in adversarial robustness. In some cases, using pre-training without task-specific methods also surpasses the state-of-the-art, highlighting the need for pre-training when evaluating future methods on robustness and uncertainty tasks.", "bibtex": "@InProceedings{pmlr-v97-hendrycks19a,\n title = \t {Using Pre-Training Can Improve Model Robustness and Uncertainty},\n author = {Hendrycks, Dan and Lee, Kimin and Mazeika, Mantas},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {2712--2721},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/hendrycks19a/hendrycks19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/hendrycks19a.html},\n abstract = \t {He et al. (2018) have called into question the utility of pre-training by showing that training from scratch can often yield similar performance to pre-training. 
We show that although pre-training may not improve performance on traditional classification metrics, it improves model robustness and uncertainty estimates. Through extensive experiments on label corruption, class imbalance, adversarial examples, out-of-distribution detection, and confidence calibration, we demonstrate large gains from pre-training and complementary effects with task-specific methods. We show approximately a 10% absolute improvement over the previous state-of-the-art in adversarial robustness. In some cases, using pre-training without task-specific methods also surpasses the state-of-the-art, highlighting the need for pre-training when evaluating future methods on robustness and uncertainty tasks.}\n}", "pdf": "http://proceedings.mlr.press/v97/hendrycks19a/hendrycks19a.pdf", "supp": "", "pdf_size": 3682158, "gs_citation": 952, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12052219296634461852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "UC Berkeley; KAIST; University of Chicago", "aff_domain": "berkeley.edu; ; ", "email": "berkeley.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/hendrycks19a.html", "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Berkeley;Korea Advanced Institute of Science and Technology;University of Chicago", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.kaist.ac.kr;https://www.uchicago.edu", "aff_unique_abbr": "UC Berkeley;KAIST;UChicago", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;1;0", "aff_country_unique": "United States;South Korea" }, { "title": "Validating Causal Inference Models via Influence Functions", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3882", "id": "3882", "author_site": "Ahmed Alaa, Mihaela van der Schaar", "author": "Ahmed Alaa; Mihaela Van Der Schaar", "abstract": "The problem of estimating causal effects of treatments from observational data falls beyond the realm of supervised learning {\u2014} because counterfactual data is inaccessible, we can never observe the true causal effects. In the absence of \"supervision\", how can we evaluate the performance of causal inference methods? In this paper, we use influence functions {\u2014} the functional derivatives of a loss function {\u2014} to develop a model validation procedure that estimates the estimation error of causal inference methods. Our procedure utilizes a Taylor-like expansion to approximate the loss function of a method on a given dataset in terms of the influence functions of its loss on a \"synthesized\", proximal dataset with known causal effects. Under minimal regularity assumptions, we show that our procedure is consistent and efficient. 
Experiments on 77 benchmark datasets show that using our procedure, we can accurately predict the comparative performances of state-of-the-art causal inference methods applied to a given observational study.", "bibtex": "@InProceedings{pmlr-v97-alaa19a,\n title = \t {Validating Causal Inference Models via Influence Functions},\n author = {Alaa, Ahmed and Van Der Schaar, Mihaela},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {191--201},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/alaa19a/alaa19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/alaa19a.html},\n abstract = \t {The problem of estimating causal effects of treatments from observational data falls beyond the realm of supervised learning {\u2014} because counterfactual data is inaccessible, we can never observe the true causal effects. In the absence of \"supervision\", how can we evaluate the performance of causal inference methods? In this paper, we use influence functions {\u2014} the functional derivatives of a loss function {\u2014} to develop a model validation procedure that estimates the estimation error of causal inference methods. Our procedure utilizes a Taylor-like expansion to approximate the loss function of a method on a given dataset in terms of the influence functions of its loss on a \"synthesized\", proximal dataset with known causal effects. Under minimal regularity assumptions, we show that our procedure is consistent and efficient. Experiments on 77 benchmark datasets show that using our procedure, we can accurately predict the comparative performances of state-of-the-art causal inference methods applied to a given observational study.}\n}", "pdf": "http://proceedings.mlr.press/v97/alaa19a/alaa19a.pdf", "supp": "", "pdf_size": 1076775, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6962449644914465679&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "University of California, Los Angeles, USA+University of Cambridge, Cambridge, UK+Alan Turing Institute, London, UK; University of California, Los Angeles, USA+University of Cambridge, Cambridge, UK+Alan Turing Institute, London, UK", "aff_domain": "ucla.edu; ", "email": "ucla.edu; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/alaa19a.html", "aff_unique_index": "0+1+2;0+1+2", "aff_unique_norm": "University of California, Los Angeles;University of Cambridge;Alan Turing Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucla.edu;https://www.cam.ac.uk;https://www.turing.ac.uk", "aff_unique_abbr": "UCLA;Cambridge;ATI", "aff_campus_unique_index": "0+1+2;0+1+2", "aff_campus_unique": "Los Angeles;Cambridge;London", "aff_country_unique_index": "0+1+1;0+1+1", "aff_country_unique": "United States;United Kingdom" }, { "title": "Variational Annealing of GANs: A Langevin Perspective", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4271", "id": "4271", "author_site": "Chenyang Tao, Shuyang Dai, Liqun Chen, Ke Bai, Junya Chen, Chang Liu, RUIYI (ROY) ZHANG, Georgiy Bobashev, Lawrence Carin", "author": "Chenyang Tao; Shuyang Dai; Liqun Chen; Ke Bai; Junya Chen; Chang Liu; Ruiyi Zhang; Georgiy Bobashev; Lawrence Carin Duke", "abstract": "The generative adversarial network (GAN) 
has received considerable attention recently as a model for data synthesis, without an explicit specification of a likelihood function. There has been commensurate interest in leveraging likelihood estimates to improve GAN training. To enrich the understanding of this fast-growing yet almost exclusively heuristic-driven subject, we elucidate the theoretical roots of some of the empirical attempts to stabilize and improve GAN training with the introduction of likelihoods. We highlight new insights from variational theory of diffusion processes to derive a likelihood-based regularizing scheme for GAN training, and present a novel approach to train GANs with an unnormalized distribution instead of empirical samples. To substantiate our claims, we provide experimental evidence on how our theoretically-inspired new algorithms improve upon current practice.", "bibtex": "@InProceedings{pmlr-v97-tao19a,\n title = \t {Variational Annealing of {GAN}s: A {L}angevin Perspective},\n author = {Tao, Chenyang and Dai, Shuyang and Chen, Liqun and Bai, Ke and Chen, Junya and Liu, Chang and Zhang, Ruiyi and Bobashev, Georgiy and Duke, Lawrence Carin},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6176--6185},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/tao19a/tao19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/tao19a.html},\n abstract = \t {The generative adversarial network (GAN) has received considerable attention recently as a model for data synthesis, without an explicit specification of a likelihood function. There has been commensurate interest in leveraging likelihood estimates to improve GAN training. To enrich the understanding of this fast-growing yet almost exclusively heuristic-driven subject, we elucidate the theoretical roots of some of the empirical attempts to stabilize and improve GAN training with the introduction of likelihoods. We highlight new insights from variational theory of diffusion processes to derive a likelihood-based regularizing scheme for GAN training, and present a novel approach to train GANs with an unnormalized distribution instead of empirical samples. To substantiate our claims, we provide experimental evidence on how our theoretically-inspired new algorithms improve upon current practice.}\n}", "pdf": "http://proceedings.mlr.press/v97/tao19a/tao19a.pdf", "supp": "", "pdf_size": 3169817, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17361618914609700795&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "email": ";;;;;;;;", "github": "", "project": "", "author_num": 9, "oa": "https://proceedings.mlr.press/v97/tao19a.html" }, { "title": "Variational Implicit Processes", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4123", "id": "4123", "author_site": "Chao Ma, Yingzhen Li, Jose Miguel Hernandez-Lobato", "author": "Chao Ma; Yingzhen Li; Jose Miguel Hernandez-Lobato", "abstract": "We introduce the implicit processes (IPs), a stochastic process that places implicitly defined multivariate distributions over any finite collections of random variables. 
IPs are therefore highly flexible implicit priors over", "bibtex": "@InProceedings{pmlr-v97-ma19b,\n title = \t {Variational Implicit Processes},\n author = {Ma, Chao and Li, Yingzhen and Hernandez-Lobato, Jose Miguel},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4222--4233},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/ma19b/ma19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/ma19b.html},\n abstract = \t {We introduce the implicit processes (IPs), a stochastic process that places implicitly defined multivariate distributions over any finite collections of random variables. IPs are therefore highly flexible implicit priors over", "pdf": "http://proceedings.mlr.press/v97/ma19b/ma19b.pdf", "supp": "", "pdf_size": 926784, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11479270094313825180&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "Department of Engineering, University of Cambridge, Cambridge, UK+Microsoft Research Cambridge, Cambridge, UK; Microsoft Research Cambridge, Cambridge, UK; Department of Engineering, University of Cambridge, Cambridge, UK+Microsoft Research Cambridge, Cambridge, UK", "aff_domain": "cam.ac.uk; ;cam.ac.uk", "email": "cam.ac.uk; ;cam.ac.uk", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/ma19b.html", "aff_unique_index": "0+1;1;0+1", "aff_unique_norm": "University of Cambridge;Microsoft", "aff_unique_dep": "Department of Engineering;Microsoft Research", "aff_unique_url": "https://www.cam.ac.uk;https://www.microsoft.com/en-us/research/group/microsoft-research-cambridge", "aff_unique_abbr": "Cambridge;MSR Cambridge", "aff_campus_unique_index": "0+0;0;0+0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0+0;0;0+0", "aff_country_unique": "United Kingdom" }, { "title": "Variational Inference for sparse network reconstruction from count data", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3682", "id": "3682", "author_site": "Julien Chiquet, Stephane Robin, Mahendra Mariadassou", "author": "Julien Chiquet; Stephane Robin; Mahendra Mariadassou", "abstract": "Networks provide a natural yet statistically grounded way to depict and understand how a set of entities interact. However, in many situations interactions are not directly observed and the network needs to be reconstructed based on observations collected for each entity. Our work focuses on the situation where these observations consist of counts. A typical example is the reconstruction of an ecological network based on abundance data. In this setting, the abundance of a set of species is collected in a series of samples and/or environments and we aim at inferring direct interactions between the species. The abundances at hand can be, for example, direct counts of individuals (ecology of macro-organisms) or read counts resulting from metagenomic sequencing (microbial ecology). Whatever the approach chosen to infer such a network, it has to account for the peculiaraties of the data at hand. The first, obvious one, is that the data are counts, i.e. non continuous. 
Also, the observed counts often vary over many orders of magnitude and are more dispersed than expected under a simple model, such as the Poisson distribution. The observed counts may also result from different sampling efforts in each sample and/or for each entity, which hampers direct comparison. Furthermore, because the network is supposed to reveal only direct interactions, it is highly desirable to account for covariates describing the environment to avoid spurious edges. Many methods of network reconstruction from count data have been proposed. In the context of microbial ecology, most methods (SparCC, REBACCA, SPIEC-EASI, gCODA, BanOCC) rely on a two-step strategy: transform the counts to pseudo Gaussian observations using simple transforms before moving back to the setting of Gaussian Graphical Models, for which state of the art methods exist to infer the network, but only in a Gaussian world. In this work, we consider instead a full-fledged probabilistic model with a latent layer where the counts follow Poisson distributions, conditional to latent (hidden) Gaussian correlated variables. In this model, known as Poisson log-normal (PLN), the dependency structure is completely captured by the latent layer and we model counts, rather than transformations thereof. To our knowledge, the PLN framework is quite new and has only been used by two other recent methods (Mint and plnDAG) to reconstruct networks from count data. In this work, we use the same mathematical framework but adopt a different optimization strategy which alleviates the whole optimization process. We also fully exploit the connection between the PLN framework and generalized linear models to account for the peculiarities of microbiological data sets. The network inference step is done as usual by adding sparsity inducing constraints on the inverse covariance matrix of the latent Gaussian vector to select only the most important interactions between species. Unlike the usual Gaussian setting, the penalized likelihood is generally not tractable in this framework. We resort instead to a variational approximation for parameter inference and solve the corresponding optimization problem by alternating a gradient descent on the variational parameters and a graphical-Lasso step on the covariance matrix. We also select the sparsity parameter using the resampling-based StARS procedure. We show that the sparse PLN approach has better performance than existing methods on simulated datasets and that it extracts relevant signal from microbial ecology datasets. We also show that the inference scales to datasets made up of hundred of species and samples, in line with other methods in the field. In short, our contributions to the field are the following: we extend the use of PLN distributions in network inference by (i) accounting for covariates and offset and thus removing some spurious edges induced by confounding factors, (ii) accounting for different sampling effort to integrate data sets from different sources and thus infer interactions between different types of organisms (e.g. bacteria - fungi), (iii) developing an inference procedure based on the iterative optimization of a well defined objective function. 
Our objective function is a provable lower bound of the observed likelihood and our procedure accounts for the uncertainty associated with the estimation of the latent variable, unlike the algorithm presented in Mint and plnDAG.", "bibtex": "@InProceedings{pmlr-v97-chiquet19a,\n title = \t {Variational Inference for sparse network reconstruction from count data},\n author = {Chiquet, Julien and Robin, Stephane and Mariadassou, Mahendra},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1162--1171},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chiquet19a/chiquet19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chiquet19a.html},\n abstract = \t {Networks provide a natural yet statistically grounded way to depict and understand how a set of entities interact. However, in many situations interactions are not directly observed and the network needs to be reconstructed based on observations collected for each entity. Our work focuses on the situation where these observations consist of counts. A typical example is the reconstruction of an ecological network based on abundance data. In this setting, the abundance of a set of species is collected in a series of samples and/or environments and we aim at inferring direct interactions between the species. The abundances at hand can be, for example, direct counts of individuals (ecology of macro-organisms) or read counts resulting from metagenomic sequencing (microbial ecology). Whatever the approach chosen to infer such a network, it has to account for the peculiaraties of the data at hand. The first, obvious one, is that the data are counts, i.e. non continuous. Also, the observed counts often vary over many orders of magnitude and are more dispersed than expected under a simple model, such as the Poisson distribution. The observed counts may also result from different sampling efforts in each sample and/or for each entity, which hampers direct comparison. Furthermore, because the network is supposed to reveal only direct interactions, it is highly desirable to account for covariates describing the environment to avoid spurious edges. Many methods of network reconstruction from count data have been proposed. In the context of microbial ecology, most methods (SparCC, REBACCA, SPIEC-EASI, gCODA, BanOCC) rely on a two-step strategy: transform the counts to pseudo Gaussian observations using simple transforms before moving back to the setting of Gaussian Graphical Models, for which state of the art methods exist to infer the network, but only in a Gaussian world. In this work, we consider instead a full-fledged probabilistic model with a latent layer where the counts follow Poisson distributions, conditional to latent (hidden) Gaussian correlated variables. In this model, known as Poisson log-normal (PLN), the dependency structure is completely captured by the latent layer and we model counts, rather than transformations thereof. To our knowledge, the PLN framework is quite new and has only been used by two other recent methods (Mint and plnDAG) to reconstruct networks from count data. In this work, we use the same mathematical framework but adopt a different optimization strategy which alleviates the whole optimization process. 
We also fully exploit the connection between the PLN framework and generalized linear models to account for the peculiarities of microbiological data sets. The network inference step is done as usual by adding sparsity inducing constraints on the inverse covariance matrix of the latent Gaussian vector to select only the most important interactions between species. Unlike the usual Gaussian setting, the penalized likelihood is generally not tractable in this framework. We resort instead to a variational approximation for parameter inference and solve the corresponding optimization problem by alternating a gradient descent on the variational parameters and a graphical-Lasso step on the covariance matrix. We also select the sparsity parameter using the resampling-based StARS procedure. We show that the sparse PLN approach has better performance than existing methods on simulated datasets and that it extracts relevant signal from microbial ecology datasets. We also show that the inference scales to datasets made up of hundred of species and samples, in line with other methods in the field. In short, our contributions to the field are the following: we extend the use of PLN distributions in network inference by (i) accounting for covariates and offset and thus removing some spurious edges induced by confounding factors, (ii) accounting for different sampling effort to integrate data sets from different sources and thus infer interactions between different types of organisms (e.g. bacteria - fungi), (iii) developing an inference procedure based on the iterative optimization of a well defined objective function. Our objective function is a provable lower bound of the observed likelihood and our procedure accounts for the uncertainty associated with the estimation of the latent variable, unlike the algorithm presented in Mint and plnDAG.}\n}", "pdf": "http://proceedings.mlr.press/v97/chiquet19a/chiquet19a.pdf", "supp": "", "pdf_size": 1800837, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1166241430093997050&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff": "MIA 518, AgroParitech/INRA, Universit \u00b4e Paris-Saclay, Paris, France; MaIAGE, INRA, Universit \u00b4e Paris-Saclay, Jouy-en-Josas, France; MIA 518, AgroParitech/INRA, Universit \u00b4e Paris-Saclay, Paris, France", "aff_domain": "agroparistech.fr; ; ", "email": "agroparistech.fr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/chiquet19a.html", "aff_unique_index": "0;1;0", "aff_unique_norm": "Universit\u00e9 Paris-Saclay;INRA - Institut National de la Recherche Agronomique", "aff_unique_dep": "AgroParitech/INRA;MaIAGE", "aff_unique_url": "https://www.universite-paris-saclay.fr;https://www.inra.fr", "aff_unique_abbr": "Paris-Saclay;INRA", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Paris;Jouy-en-Josas", "aff_country_unique_index": "0;0;0", "aff_country_unique": "France" }, { "title": "Variational Laplace Autoencoders", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3667", "id": "3667", "author_site": "Yookoon Park, Chris Kim, Gunhee Kim", "author": "Yookoon Park; Chris Kim; Gunhee Kim", "abstract": "Variational autoencoders employ an amortized inference model to approximate the posterior of latent variables. 
However, such amortized variational inference faces two challenges: (1) the limited posterior expressiveness of fully-factorized Gaussian assumption and (2) the amortization error of the inference model. We present a novel approach that addresses both challenges. First, we focus on ReLU networks with Gaussian output and illustrate their connection to probabilistic PCA. Building on this observation, we derive an iterative algorithm that finds the mode of the posterior and apply fullcovariance Gaussian posterior approximation centered on the mode. Subsequently, we present a general framework named Variational Laplace Autoencoders (VLAEs) for training deep generative models. Based on the Laplace approximation of the latent variable posterior, VLAEs enhance the expressiveness of the posterior while reducing the amortization error. Empirical results on MNIST, Omniglot, Fashion-MNIST, SVHN and CIFAR10 show that the proposed approach significantly outperforms other recent amortized or iterative methods on the ReLU networks.", "bibtex": "@InProceedings{pmlr-v97-park19a,\n title = \t {Variational {L}aplace Autoencoders},\n author = {Park, Yookoon and Kim, Chris and Kim, Gunhee},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5032--5041},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/park19a/park19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/park19a.html},\n abstract = \t {Variational autoencoders employ an amortized inference model to approximate the posterior of latent variables. However, such amortized variational inference faces two challenges: (1) the limited posterior expressiveness of fully-factorized Gaussian assumption and (2) the amortization error of the inference model. We present a novel approach that addresses both challenges. First, we focus on ReLU networks with Gaussian output and illustrate their connection to probabilistic PCA. Building on this observation, we derive an iterative algorithm that finds the mode of the posterior and apply fullcovariance Gaussian posterior approximation centered on the mode. Subsequently, we present a general framework named Variational Laplace Autoencoders (VLAEs) for training deep generative models. Based on the Laplace approximation of the latent variable posterior, VLAEs enhance the expressiveness of the posterior while reducing the amortization error. 
Empirical results on MNIST, Omniglot, Fashion-MNIST, SVHN and CIFAR10 show that the proposed approach significantly outperforms other recent amortized or iterative methods on the ReLU networks.}\n}", "pdf": "http://proceedings.mlr.press/v97/park19a/park19a.pdf", "supp": "", "pdf_size": 2394587, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7179167555036813772&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Neural Processing Research Center, Seoul National University, Seoul, South Korea; Neural Processing Research Center, Seoul National University, Seoul, South Korea; Neural Processing Research Center, Seoul National University, Seoul, South Korea", "aff_domain": "snu.ac.kr; ; ", "email": "snu.ac.kr; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/park19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "Neural Processing Research Center", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Seoul", "aff_country_unique_index": "0;0;0", "aff_country_unique": "South Korea" }, { "title": "Variational Russian Roulette for Deep Bayesian Nonparametrics", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4118", "id": "4118", "author_site": "Kai Xu, Akash Srivastava, Charles Sutton", "author": "Kai Xu; Akash Srivastava; Charles Sutton", "abstract": "Bayesian nonparametric models provide a principled way to automatically adapt the complexity of a model to the amount of the data available, but computation in such models is difficult. Amortized variational approximations are appealing because of their computational efficiency, but current methods rely on a fixed finite truncation of the infinite model. This truncation level can be difficult to set, and also interacts poorly with amortized methods due to the over-pruning problem. Instead, we propose a new variational approximation, based on a method from statistical physics called Russian roulette sampling. This allows the variational distribution to adapt its complexity during inference, without relying on a fixed truncation level, and while still obtaining an unbiased estimate of the gradient of the original variational objective. We demonstrate this method on infinite sized variational auto-encoders using a Beta-Bernoulli (Indian buffet process) prior.", "bibtex": "@InProceedings{pmlr-v97-xu19e,\n title = \t {Variational Russian Roulette for Deep {B}ayesian Nonparametrics},\n author = {Xu, Kai and Srivastava, Akash and Sutton, Charles},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6963--6972},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/xu19e/xu19e.pdf},\n url = \t {https://proceedings.mlr.press/v97/xu19e.html},\n abstract = \t {Bayesian nonparametric models provide a principled way to automatically adapt the complexity of a model to the amount of the data available, but computation in such models is difficult. Amortized variational approximations are appealing because of their computational efficiency, but current methods rely on a fixed finite truncation of the infinite model. 
This truncation level can be difficult to set, and also interacts poorly with amortized methods due to the over-pruning problem. Instead, we propose a new variational approximation, based on a method from statistical physics called Russian roulette sampling. This allows the variational distribution to adapt its complexity during inference, without relying on a fixed truncation level, and while still obtaining an unbiased estimate of the gradient of the original variational objective. We demonstrate this method on infinite sized variational auto-encoders using a Beta-Bernoulli (Indian buffet process) prior.}\n}", "pdf": "http://proceedings.mlr.press/v97/xu19e/xu19e.pdf", "supp": "", "pdf_size": 6744591, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8517215681947739151&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "aff": "School of Informatics, University of Edinburgh, Edinburgh, United Kingdom+MIT-IBM Watson AI Lab, Cambridge, MA, United States+Google AI, Mountain View, CA, United States+Alan Turing Institute, London, United Kingdom; School of Informatics, University of Edinburgh, Edinburgh, United Kingdom+MIT-IBM Watson AI Lab, Cambridge, MA, United States+Google AI, Mountain View, CA, United States+Alan Turing Institute, London, United Kingdom; School of Informatics, University of Edinburgh, Edinburgh, United Kingdom+MIT-IBM Watson AI Lab, Cambridge, MA, United States+Google AI, Mountain View, CA, United States+Alan Turing Institute, London, United Kingdom", "aff_domain": "ed.ac.uk; ; ", "email": "ed.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/xu19e.html", "aff_unique_index": "0+1+2+3;0+1+2+3;0+1+2+3", "aff_unique_norm": "University of Edinburgh;IBM;Google;Alan Turing Institute", "aff_unique_dep": "School of Informatics;AI Lab;Google AI;", "aff_unique_url": "https://www.ed.ac.uk;https://www.ibmwatson.com/;https://ai.google;https://www.turing.ac.uk", "aff_unique_abbr": "Edinburgh;MIT-IBM AI Lab;Google AI;ATI", "aff_campus_unique_index": "0+1+2+3;0+1+2+3;0+1+2+3", "aff_campus_unique": "Edinburgh;Cambridge;Mountain View;London", "aff_country_unique_index": "0+1+1+0;0+1+1+0;0+1+1+0", "aff_country_unique": "United Kingdom;United States" }, { "title": "Voronoi Boundary Classification: A High-Dimensional Geometric Approach via Weighted Monte Carlo Integration", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4132", "id": "4132", "author_site": "Vladislav Polianskii, Florian T. Pokorny", "author": "Vladislav Polianskii; Florian T. Pokorny", "abstract": "Voronoi cell decompositions provide a classical avenue to classification. Typical approaches however only utilize point-wise cell-membership information by means of nearest neighbor queries and do not utilize further geometric information about Voronoi cells since the computation of Voronoi diagrams is prohibitively expensive in high dimensions. We propose a Monte-Carlo integration based approach that instead computes a weighted integral over the boundaries of Voronoi cells, thus incorporating additional information about the Voronoi cell structure. We demonstrate the scalability of our approach in up to 3072 dimensional spaces and analyze convergence based on the number of Monte Carlo samples and choice of weight functions. 
Experiments comparing our approach to Nearest Neighbors, SVM and Random Forests indicate that while our approach performs similarly to Random Forests for large data sizes, the algorithm exhibits non-trivial data-dependent performance characteristics for smaller datasets and can be analyzed in terms of a geometric confidence measure, thus adding to the repertoire of geometric approaches to classification while having the benefit of not requiring any model changes or retraining as new training samples or classes are added.", "bibtex": "@InProceedings{pmlr-v97-polianskii19a,\n title = \t {Voronoi Boundary Classification: A High-Dimensional Geometric Approach via Weighted {M}onte {C}arlo Integration},\n author = {Polianskii, Vladislav and Pokorny, Florian T.},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5162--5170},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/polianskii19a/polianskii19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/polianskii19a.html},\n abstract = \t {Voronoi cell decompositions provide a classical avenue to classification. Typical approaches however only utilize point-wise cell-membership information by means of nearest neighbor queries and do not utilize further geometric information about Voronoi cells since the computation of Voronoi diagrams is prohibitively expensive in high dimensions. We propose a Monte-Carlo integration based approach that instead computes a weighted integral over the boundaries of Voronoi cells, thus incorporating additional information about the Voronoi cell structure. We demonstrate the scalability of our approach in up to 3072 dimensional spaces and analyze convergence based on the number of Monte Carlo samples and choice of weight functions. 
Experiments comparing our approach to Nearest Neighbors, SVM and Random Forests indicate that while our approach performs similarly to Random Forests for large data sizes, the algorithm exhibits non-trivial data-dependent performance characteristics for smaller datasets and can be analyzed in terms of a geometric confidence measure, thus adding to the repertoire of geometric approaches to classification while having the benefit of not requiring any model changes or retraining as new training samples or classes are added.}\n}", "pdf": "http://proceedings.mlr.press/v97/polianskii19a/polianskii19a.pdf", "supp": "", "pdf_size": 692991, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7087005765214242088&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Division of Robotics, Perception and Learning, EECS, KTH Royal Institute of Technology, Stockholm, Sweden; Division of Robotics, Perception and Learning, EECS, KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": "kth.se; ", "email": "kth.se; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/polianskii19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "KTH Royal Institute of Technology", "aff_unique_dep": "Division of Robotics, Perception and Learning", "aff_unique_url": "https://www.kth.se", "aff_unique_abbr": "KTH", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stockholm", "aff_country_unique_index": "0;0", "aff_country_unique": "Sweden" }, { "title": "Warm-starting Contextual Bandits: Robustly Combining Supervised and Bandit Feedback", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3673", "id": "3673", "author_site": "Chicheng Zhang, Alekh Agarwal, Hal Daum\u00e9 III, John Langford, Sahand Negahban", "author": "Chicheng Zhang; Alekh Agarwal; Hal Daum\u00e9 Iii; John Langford; Sahand Negahban", "abstract": "We investigate the feasibility of learning from both fully-labeled supervised data and contextual bandit data. We specifically consider settings in which the underlying learning signal may be different between these two data sources. Theoretically, we state and prove no-regret algorithms for learning that is robust to divergences between the two sources. Empirically, we evaluate some of these algorithms on a large selection of datasets, showing that our approaches are feasible, and helpful in practice.", "bibtex": "@InProceedings{pmlr-v97-zhang19b,\n title = \t {Warm-starting Contextual Bandits: Robustly Combining Supervised and Bandit Feedback},\n author = {Zhang, Chicheng and Agarwal, Alekh and Iii, Hal Daum{\\'e} and Langford, John and Negahban, Sahand},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7335--7344},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19b/zhang19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19b.html},\n abstract = \t {We investigate the feasibility of learning from both fully-labeled supervised data and contextual bandit data. We specifically consider settings in which the underlying learning signal may be different between these two data sources. Theoretically, we state and prove no-regret algorithms for learning that is robust to divergences between the two sources. 
Empirically, we evaluate some of these algorithms on a large selection of datasets, showing that our approaches are feasible, and helpful in practice.}\n}", "pdf": "http://proceedings.mlr.press/v97/zhang19b/zhang19b.pdf", "supp": "", "pdf_size": 8288611, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13381714542277312288&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "Microsoft Research; Microsoft Research; Microsoft Research + University of Maryland; Microsoft Research; Yale University", "aff_domain": "microsoft.com; ; ; ; ", "email": "microsoft.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/zhang19b.html", "aff_unique_index": "0;0;0+1;0;2", "aff_unique_norm": "Microsoft;University of Maryland;Yale University", "aff_unique_dep": "Microsoft Research;;", "aff_unique_url": "https://www.microsoft.com/en-us/research;https://www/umd.edu;https://www.yale.edu", "aff_unique_abbr": "MSR;UMD;Yale", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0+0;0;0", "aff_country_unique": "United States" }, { "title": "Wasserstein Adversarial Examples via Projected Sinkhorn Iterations", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4229", "id": "4229", "author_site": "Eric Wong, Frank R Schmidt, Zico Kolter", "author": "Eric Wong; Frank Schmidt; Zico Kolter", "abstract": "A rapidly growing area of work has studied the existence of adversarial examples, datapoints which have been perturbed to fool a classifier, but the vast majority of these works have focused primarily on threat models defined by $\\ell_p$ norm-bounded perturbations. In this paper, we propose a new threat model for adversarial attacks based on the Wasserstein distance. In the image classification setting, such distances measure the cost of moving pixel mass, which can naturally represent \u201cstandard\u201d image manipulations such as scaling, rotation, translation, and distortion (and can potentially be applied to other settings as well). To generate Wasserstein adversarial examples, we develop a procedure for approximate projection onto the Wasserstein ball, based upon a modified version of the Sinkhorn iteration. The resulting algorithm can successfully attack image classification models, bringing traditional CIFAR10 models down to 3% accuracy within a Wasserstein ball with radius 0.1 (i.e., moving 10% of the image mass 1 pixel), and we demonstrate that PGD-based adversarial training can improve this adversarial accuracy to 76%. 
In total, this work opens up a new direction of study in adversarial robustness, more formally considering convex metrics that accurately capture the invariances that we typically believe should exist in classifiers, and code for all experiments in the paper is available at https://github.com/locuslab/projected_sinkhorn.", "bibtex": "@InProceedings{pmlr-v97-wong19a,\n title = \t {{W}asserstein Adversarial Examples via Projected {S}inkhorn Iterations},\n author = {Wong, Eric and Schmidt, Frank and Kolter, Zico},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6808--6817},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/wong19a/wong19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/wong19a.html},\n abstract = \t {A rapidly growing area of work has studied the existence of adversarial examples, datapoints which have been perturbed to fool a classifier, but the vast majority of these works have focused primarily on threat models defined by $\\ell_p$ norm-bounded perturbations. In this paper, we propose a new threat model for adversarial attacks based on the Wasserstein distance. In the image classification setting, such distances measure the cost of moving pixel mass, which can naturally represent \u201cstandard\u201d image manipulations such as scaling, rotation, translation, and distortion (and can potentially be applied to other settings as well). To generate Wasserstein adversarial examples, we develop a procedure for approximate projection onto the Wasserstein ball, based upon a modified version of the Sinkhorn iteration. The resulting algorithm can successfully attack image classification models, bringing traditional CIFAR10 models down to 3% accuracy within a Wasserstein ball with radius 0.1 (i.e., moving 10% of the image mass 1 pixel), and we demonstrate that PGD-based adversarial training can improve this adversarial accuracy to 76%. 
In total, this work opens up a new direction of study in adversarial robustness, more formally considering convex metrics that accurately capture the invariances that we typically believe should exist in classifiers, and code for all experiments in the paper is available at https://github.com/locuslab/projected_sinkhorn.}\n}", "pdf": "http://proceedings.mlr.press/v97/wong19a/wong19a.pdf", "supp": "", "pdf_size": 1463597, "gs_citation": 275, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4087808921541648707&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff": "Machine Learning Department, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA + Bosch Center for Artificial Intelligence, Pittsburgh, Pennsylvania, USA; Bosch Center for Artificial Intelligence, Renningen, Germany; Computer Science Department, Carnegie Mellon University, Pittsburgh, Pennsylvania, USA + Bosch Center for Artificial Intelligence, Pittsburgh, Pennsylvania, USA", "aff_domain": "cs.cmu.edu; ; ", "email": "cs.cmu.edu; ; ", "github": "https://github.com/locuslab/projected_sinkhorn", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/wong19a.html", "aff_unique_index": "0+1;1;0+1", "aff_unique_norm": "Carnegie Mellon University;Bosch Center for Artificial Intelligence", "aff_unique_dep": "Machine Learning Department;Artificial Intelligence", "aff_unique_url": "https://www.cmu.edu;https://www.bosch-ai.com", "aff_unique_abbr": "CMU;BCAI", "aff_campus_unique_index": "0+0;1;0+0", "aff_campus_unique": "Pittsburgh;Renningen", "aff_country_unique_index": "0+0;1;0+0", "aff_country_unique": "United States;Germany" }, { "title": "Wasserstein of Wasserstein Loss for Learning Generative Models", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4124", "id": "4124", "author_site": "Yonatan Dukler, Wuchen Li, Alex Lin, Guido Montufar", "author": "Yonatan Dukler; Wuchen Li; Alex Lin; Guido Montufar", "abstract": "The Wasserstein distance serves as a loss function for unsupervised learning which depends on the choice of a ground metric on sample space. We propose to use the Wasserstein distance itself as the ground metric on the sample space of images. This ground metric is known as an effective distance for image retrieval, that correlates with human perception. We derive the Wasserstein ground metric on pixel space and define a Riemannian Wasserstein gradient penalty to be used in the Wasserstein Generative Adversarial Network (WGAN) framework. The new gradient penalty is computed efficiently via convolutions on the $L^2$ gradients with negligible additional computational cost. 
The new formulation is more robust to the natural variability of the data and provides for a more continuous discriminator in sample space.", "bibtex": "@InProceedings{pmlr-v97-dukler19a,\n title = \t {{W}asserstein of {W}asserstein Loss for Learning Generative Models},\n author = {Dukler, Yonatan and Li, Wuchen and Lin, Alex and Montufar, Guido},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1716--1725},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/dukler19a/dukler19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/dukler19a.html},\n abstract = \t {The Wasserstein distance serves as a loss function for unsupervised learning which depends on the choice of a ground metric on sample space. We propose to use the Wasserstein distance itself as the ground metric on the sample space of images. This ground metric is known as an effective distance for image retrieval, that correlates with human perception. We derive the Wasserstein ground metric on pixel space and define a Riemannian Wasserstein gradient penalty to be used in the Wasserstein Generative Adversarial Network (WGAN) framework. The new gradient penalty is computed efficiently via convolutions on the $L^2$ gradients with negligible additional computational cost. The new formulation is more robust to the natural variability of the data and provides for a more continuous discriminator in sample space.}\n}", "pdf": "http://proceedings.mlr.press/v97/dukler19a/dukler19a.pdf", "supp": "", "pdf_size": 988483, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13254249482108639455&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "aff": "Department of Mathematics, University of California, Los Angeles, CA 90095 + Department of Statistics, University of California, Los Angeles, CA 90095; Department of Mathematics, University of California, Los Angeles, CA 90095 + Department of Statistics, University of California, Los Angeles, CA 90095; Department of Mathematics, University of California, Los Angeles, CA 90095 + Department of Statistics, University of California, Los Angeles, CA 90095; Department of Mathematics, University of California, Los Angeles, CA 90095 + Department of Statistics, University of California, Los Angeles, CA 90095 + Max Planck Institute for Mathematics in the Sciences, 04103 Leipzig, Germany", "aff_domain": "math.ucla.edu;math.ucla.edu;math.ucla.edu;math.ucla.edu", "email": "math.ucla.edu;math.ucla.edu;math.ucla.edu;math.ucla.edu", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/dukler19a.html", "aff_unique_index": "0+0;0+0;0+0;0+0+1", "aff_unique_norm": "University of California, Los Angeles;Max Planck Institute for Mathematics in the Sciences", "aff_unique_dep": "Department of Mathematics;Mathematics", "aff_unique_url": "https://www.ucla.edu;https://www.mis.mpg.de", "aff_unique_abbr": "UCLA;MPI MIS", "aff_campus_unique_index": "0+0;0+0;0+0;0+0+1", "aff_campus_unique": "Los Angeles;Leipzig", "aff_country_unique_index": "0+0;0+0;0+0;0+0+1", "aff_country_unique": "United States;Germany" }, { "title": "Weak Detection of Signal in the Spiked Wigner Model", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3665", "id": "3665", "author_site": "Hye Won Chung, Ji 
Oon Lee", "author": "Hye Won Chung; Ji Oon Lee", "abstract": "We consider the problem of detecting the presence of the signal in a rank-one signal-plus-noise data matrix. In case the signal-to-noise ratio is under the threshold below which a reliable detection is impossible, we propose a hypothesis test based on the linear spectral statistics of the data matrix. When the noise is Gaussian, the error of the proposed test is optimal as it matches the error of the likelihood ratio test that minimizes the sum of the Type-I and Type-II errors. The test is data-driven and does not depend on the distribution of the signal or the noise. If the density of the noise is known, it can be further improved by an entrywise transformation to lower the error of the test.", "bibtex": "@InProceedings{pmlr-v97-chung19a,\n title = \t {Weak Detection of Signal in the Spiked Wigner Model},\n author = {Chung, Hye Won and Lee, Ji Oon},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1233--1241},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/chung19a/chung19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/chung19a.html},\n abstract = \t {We consider the problem of detecting the presence of the signal in a rank-one signal-plus-noise data matrix. In case the signal-to-noise ratio is under the threshold below which a reliable detection is impossible, we propose a hypothesis test based on the linear spectral statistics of the data matrix. When the noise is Gaussian, the error of the proposed test is optimal as it matches the error of the likelihood ratio test that minimizes the sum of the Type-I and Type-II errors. The test is data-driven and does not depend on the distribution of the signal or the noise. If the density of the noise is known, it can be further improved by an entrywise transformation to lower the error of the test.}\n}", "pdf": "http://proceedings.mlr.press/v97/chung19a/chung19a.pdf", "supp": "", "pdf_size": 478094, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2763308140386314213&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "School of Electrical Engineering, KAIST, Daejeon, Korea; Department of Mathematical Sciences, KAIST, Daejeon, Korea", "aff_domain": "kaist.ac.kr;kaist.edu", "email": "kaist.ac.kr;kaist.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/chung19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "KAIST", "aff_unique_dep": "School of Electrical Engineering", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Daejeon", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "title": "Weakly-Supervised Temporal Localization via Occurrence Count Learning", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3674", "id": "3674", "author_site": "Julien Schroeter, Kirill Sidorov, David Marshall", "author": "Julien Schroeter; Kirill Sidorov; David Marshall", "abstract": "We propose a novel model for temporal detection and localization which allows the training of deep neural networks using only counts of event occurrences as training labels. 
This powerful weakly-supervised framework alleviates the burden of the imprecise and time consuming process of annotating event locations in temporal data. Unlike existing methods, in which localization is explicitly achieved by design, our model learns localization implicitly as a byproduct of learning to count instances. This unique feature is a direct consequence of the model\u2019s theoretical properties. We validate the effectiveness of our approach in a number of experiments (drum hit and piano onset detection in audio, digit detection in images) and demonstrate performance comparable to that of fully-supervised state-of-the-art methods, despite much weaker training requirements.", "bibtex": "@InProceedings{pmlr-v97-schroeter19a,\n title = \t {Weakly-Supervised Temporal Localization via Occurrence Count Learning},\n author = {Schroeter, Julien and Sidorov, Kirill and Marshall, David},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5649--5659},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/schroeter19a/schroeter19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/schroeter19a.html},\n abstract = \t {We propose a novel model for temporal detection and localization which allows the training of deep neural networks using only counts of event occurrences as training labels. This powerful weakly-supervised framework alleviates the burden of the imprecise and time consuming process of annotating event locations in temporal data. Unlike existing methods, in which localization is explicitly achieved by design, our model learns localization implicitly as a byproduct of learning to count instances. This unique feature is a direct consequence of the model\u2019s theoretical properties. We validate the effectiveness of our approach in a number of experiments (drum hit and piano onset detection in audio, digit detection in images) and demonstrate performance comparable to that of fully-supervised state-of-the-art methods, despite much weaker training requirements.}\n}", "pdf": "http://proceedings.mlr.press/v97/schroeter19a/schroeter19a.pdf", "supp": "", "pdf_size": 646689, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3321291191316445058&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13, "aff": "Cardiff University; Cardiff University; Cardiff University", "aff_domain": "cardiff.ac.uk; ; ", "email": "cardiff.ac.uk; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/schroeter19a.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Cardiff University", "aff_unique_dep": "", "aff_unique_url": "https://www.cardiff.ac.uk", "aff_unique_abbr": "Cardiff", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "title": "What is the Effect of Importance Weighting in Deep Learning?", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4247", "id": "4247", "author_site": "Jonathon Byrd, Zachary Lipton", "author": "Jonathon Byrd; Zachary Lipton", "abstract": "Importance-weighted risk minimization is a key ingredient in many machine learning algorithms for causal inference, domain adaptation, class imbalance, and off-policy reinforcement learning. 
While the effect of importance weighting is well-characterized for low-capacity misspecified models, little is known about how it impacts over-parameterized, deep neural networks. This work is inspired by recent theoretical results showing that on (linearly) separable data, deep linear networks optimized by SGD learn weight-agnostic solutions, prompting us to ask, for realistic deep networks, for which many practical datasets are separable, what is the effect of importance weighting? We present the surprising finding that while importance weighting impacts models early in training, its effect diminishes over successive epochs. Moreover, while L2 regularization and batch normalization (but not dropout), restore some of the impact of importance weighting, they express the effect via (seemingly) the wrong abstraction: why should practitioners tweak the L2 regularization, and by how much, to produce the correct weighting effect? Our experiments confirm these findings across a range of architectures and datasets.", "bibtex": "@InProceedings{pmlr-v97-byrd19a,\n title = \t {What is the Effect of Importance Weighting in Deep Learning?},\n author = {Byrd, Jonathon and Lipton, Zachary},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {872--881},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/byrd19a/byrd19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/byrd19a.html},\n abstract = \t {Importance-weighted risk minimization is a key ingredient in many machine learning algorithms for causal inference, domain adaptation, class imbalance, and off-policy reinforcement learning. While the effect of importance weighting is well-characterized for low-capacity misspecified models, little is known about how it impacts over-parameterized, deep neural networks. This work is inspired by recent theoretical results showing that on (linearly) separable data, deep linear networks optimized by SGD learn weight-agnostic solutions, prompting us to ask, for realistic deep networks, for which many practical datasets are separable, what is the effect of importance weighting? We present the surprising finding that while importance weighting impacts models early in training, its effect diminishes over successive epochs. Moreover, while L2 regularization and batch normalization (but not dropout), restore some of the impact of importance weighting, they express the effect via (seemingly) the wrong abstraction: why should practitioners tweak the L2 regularization, and by how much, to produce the correct weighting effect? 
Our experiments confirm these findings across a range of architectures and datasets.}\n}", "pdf": "http://proceedings.mlr.press/v97/byrd19a/byrd19a.pdf", "supp": "", "pdf_size": 4902320, "gs_citation": 606, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11711054521177496250&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff": "Carnegie Mellon University; Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu", "email": "cmu.edu;cmu.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/byrd19a.html", "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "When Samples Are Strategically Selected", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4103", "id": "4103", "author_site": "Hanrui Zhang, Yu Cheng, Vincent Conitzer", "author": "Hanrui Zhang; Yu Cheng; Vincent Conitzer", "abstract": "In standard classification problems, the assumption is that the entity making the decision (the", "bibtex": "@InProceedings{pmlr-v97-zhang19c,\n title = \t {When Samples Are Strategically Selected},\n author = {Zhang, Hanrui and Cheng, Yu and Conitzer, Vincent},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {7345--7353},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/zhang19c/zhang19c.pdf},\n url = \t {https://proceedings.mlr.press/v97/zhang19c.html},\n abstract = \t {In standard classification problems, the assumption is that the entity making the decision (the", "pdf": "http://proceedings.mlr.press/v97/zhang19c/zhang19c.pdf", "supp": "", "pdf_size": 264697, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=563135821011669781&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "aff": "Department of Computer Science, Duke University, Durham, North Carolina, USA; Department of Computer Science, Duke University, Durham, North Carolina, USA; Department of Computer Science, Duke University, Durham, North Carolina, USA", "aff_domain": "cs.duke.edu;cs.duke.edu;cs.duke.edu", "email": "cs.duke.edu;cs.duke.edu;cs.duke.edu", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/zhang19c.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "Duke University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Durham", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "White-box vs Black-box: Bayes Optimal Strategies for Membership Inference", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4066", "id": "4066", "author_site": "Alexandre Sablayrolles, Douze Matthijs, Cordelia Schmid, Yann Ollivier, Herve Jegou", "author": "Alexandre Sablayrolles; Matthijs Douze; Cordelia Schmid; Yann Ollivier; Herve Jegou", "abstract": "Membership inference determines, given a sample and trained parameters of a machine learning model, whether the sample was part of the 
training set. In this paper, we derive the optimal strategy for membership inference with a few assumptions on the distribution of the parameters. We show that optimal attacks only depend on the loss function, and thus black-box attacks are as good as white-box attacks. As the optimal strategy is not tractable, we provide approximations of it leading to several inference methods, and show that existing membership inference methods are coarser approximations of this optimal strategy. Our membership attacks outperform the state of the art in various settings, ranging from a simple logistic regression to more complex architectures and datasets, such as ResNet-101 and Imagenet.", "bibtex": "@InProceedings{pmlr-v97-sablayrolles19a,\n title = \t {White-box vs Black-box: {B}ayes Optimal Strategies for Membership Inference},\n author = {Sablayrolles, Alexandre and Douze, Matthijs and Schmid, Cordelia and Ollivier, Yann and Jegou, Herve},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5558--5567},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/sablayrolles19a/sablayrolles19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/sablayrolles19a.html},\n abstract = \t {Membership inference determines, given a sample and trained parameters of a machine learning model, whether the sample was part of the training set. In this paper, we derive the optimal strategy for membership inference with a few assumptions on the distribution of the parameters. We show that optimal attacks only depend on the loss function, and thus black-box attacks are as good as white-box attacks. As the optimal strategy is not tractable, we provide approximations of it leading to several inference methods, and show that existing membership inference methods are coarser approximations of this optimal strategy. Our membership attacks outperform the state of the art in various settings, ranging from a simple logistic regression to more complex architectures and datasets, such as ResNet-101 and Imagenet.}\n}", "pdf": "http://proceedings.mlr.press/v97/sablayrolles19a/sablayrolles19a.pdf", "supp": "", "pdf_size": 1695162, "gs_citation": 432, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13361864936685134927&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff": "University Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK+Facebook AI Research; Facebook AI Research; Facebook AI Research; University Grenoble Alpes, Inria, CNRS, Grenoble INP, LJK; Facebook AI Research", "aff_domain": "fb.com; ; ; ; ", "email": "fb.com; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/sablayrolles19a.html", "aff_unique_index": "0+1;1;1;0;1", "aff_unique_norm": "University Grenoble Alpes;Meta", "aff_unique_dep": ";Facebook AI Research", "aff_unique_url": "https://www.univ-grenoble-alpes.fr;https://research.facebook.com", "aff_unique_abbr": "UGA;FAIR", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Grenoble;", "aff_country_unique_index": "0+1;1;1;0;1", "aff_country_unique": "France;United States" }, { "title": "Why do Larger Models Generalize Better? 
A Theoretical Perspective via the XOR Problem", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3884", "id": "3884", "author_site": "Alon Brutzkus, Amir Globerson", "author": "Alon Brutzkus; Amir Globerson", "abstract": "Empirical evidence suggests that neural networks with ReLU activations generalize better with over-parameterization. However, there is currently no theoretical analysis that explains this observation. In this work, we provide theoretical and empirical evidence that, in certain cases, overparameterized convolutional networks generalize better than small networks because of an interplay between weight clustering and feature exploration at initialization. We demonstrate this theoretically for a 3-layer convolutional neural network with max-pooling, in a novel setting which extends the XOR problem. We show that this interplay implies that with overparameterization, gradient descent converges to global minima with better generalization performance compared to global minima of small networks. Empirically, we demonstrate these phenomena for a 3-layer convolutional neural network in the MNIST task.", "bibtex": "@InProceedings{pmlr-v97-brutzkus19b,\n title = \t {Why do Larger Models Generalize Better? {A} Theoretical Perspective via the {XOR} Problem},\n author = {Brutzkus, Alon and Globerson, Amir},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {822--830},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/brutzkus19b/brutzkus19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/brutzkus19b.html},\n abstract = \t {Empirical evidence suggests that neural networks with ReLU activations generalize better with over-parameterization. However, there is currently no theoretical analysis that explains this observation. In this work, we provide theoretical and empirical evidence that, in certain cases, overparameterized convolutional networks generalize better than small networks because of an interplay between weight clustering and feature exploration at initialization. We demonstrate this theoretically for a 3-layer convolutional neural network with max-pooling, in a novel setting which extends the XOR problem. We show that this interplay implies that with overparameterization, gradient descent converges to global minima with better generalization performance compared to global minima of small networks.
Empirically, we demonstrate these phenomena for a 3-layer convolutional neural network in the MNIST task.}\n}", "pdf": "http://proceedings.mlr.press/v97/brutzkus19b/brutzkus19b.pdf", "supp": "", "pdf_size": 737741, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3273411260671371749&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff": "Blavatnik School of Computer Science, Tel Aviv University, Israel; Blavatnik School of Computer Science, Tel Aviv University, Israel", "aff_domain": "mail.tau.ac.il; ", "email": "mail.tau.ac.il; ", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/brutzkus19b.html", "aff_unique_index": "0;0", "aff_unique_norm": "Tel Aviv University", "aff_unique_dep": "Blavatnik School of Computer Science", "aff_unique_url": "https://www.tau.ac.il", "aff_unique_abbr": "TAU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Tel Aviv", "aff_country_unique_index": "0;0", "aff_country_unique": "Israel" }, { "title": "Width Provably Matters in Optimization for Deep Linear Neural Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4001", "id": "4001", "author_site": "Simon Du, Wei Hu", "author": "Simon Du; Wei Hu", "abstract": "We prove that for an $L$-layer fully-connected linear neural network, if the width of every hidden layer is $\\widetilde{\\Omega}\\left(L \\cdot r \\cdot d_{out} \\cdot \\kappa^3 \\right)$, where $r$ and $\\kappa$ are the rank and the condition number of the input data, and $d_{out}$ is the output dimension, then gradient descent with Gaussian random initialization converges to a global minimum at a linear rate. The number of iterations to find an $\\epsilon$-suboptimal solution is $O(\\kappa \\log(\\frac{1}{\\epsilon}))$. Our polynomial upper bound on the total running time for wide deep linear networks and the $\\exp\\left(\\Omega\\left(L\\right)\\right)$ lower bound for narrow deep linear neural networks [Shamir, 2018] together demonstrate that wide layers are necessary for optimizing deep models.", "bibtex": "@InProceedings{pmlr-v97-du19a,\n title = \t {Width Provably Matters in Optimization for Deep Linear Neural Networks},\n author = {Du, Simon and Hu, Wei},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {1655--1664},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/du19a/du19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/du19a.html},\n abstract = \t {We prove that for an $L$-layer fully-connected linear neural network, if the width of every hidden layer is $\\widetilde{\\Omega}\\left(L \\cdot r \\cdot d_{out} \\cdot \\kappa^3 \\right)$, where $r$ and $\\kappa$ are the rank and the condition number of the input data, and $d_{out}$ is the output dimension, then gradient descent with Gaussian random initialization converges to a global minimum at a linear rate. The number of iterations to find an $\\epsilon$-suboptimal solution is $O(\\kappa \\log(\\frac{1}{\\epsilon}))$. 
Our polynomial upper bound on the total running time for wide deep linear networks and the $\\exp\\left(\\Omega\\left(L\\right)\\right)$ lower bound for narrow deep linear neural networks [Shamir, 2018] together demonstrate that wide layers are necessary for optimizing deep models.}\n}", "pdf": "http://proceedings.mlr.press/v97/du19a/du19a.pdf", "supp": "", "pdf_size": 742502, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12283019803057598337&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Carnegie Mellon University, Pittsburgh, PA, USA; Princeton University, Princeton, NJ, USA", "aff_domain": "cs.cmu.edu;cs.princeton.edu", "email": "cs.cmu.edu;cs.princeton.edu", "github": "", "project": "", "author_num": 2, "oa": "https://proceedings.mlr.press/v97/du19a.html", "aff_unique_index": "0;1", "aff_unique_norm": "Carnegie Mellon University;Princeton University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.princeton.edu", "aff_unique_abbr": "CMU;Princeton", "aff_campus_unique_index": "0;1", "aff_campus_unique": "Pittsburgh;Princeton", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "title": "Zeno: Distributed Stochastic Gradient Descent with Suspicion-based Fault-tolerance", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3783", "id": "3783", "author_site": "Cong Xie, Sanmi Koyejo, Indranil Gupta", "author": "Cong Xie; Sanmi Koyejo; Indranil Gupta", "abstract": "We present Zeno, a technique to make distributed machine learning, particularly Stochastic Gradient Descent (SGD), tolerant to an arbitrary number of faulty workers. Zeno generalizes previous results that assumed a majority of non-faulty nodes; we need assume only one non-faulty worker. Our key idea is to suspect workers that are potentially defective. Since this is likely to lead to false positives, we use a ranking-based preference mechanism. We prove the convergence of SGD for non-convex problems under these scenarios. Experimental results show that Zeno outperforms existing approaches.", "bibtex": "@InProceedings{pmlr-v97-xie19b,\n title = \t {Zeno: Distributed Stochastic Gradient Descent with Suspicion-based Fault-tolerance},\n author = {Xie, Cong and Koyejo, Sanmi and Gupta, Indranil},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {6893--6901},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/xie19b/xie19b.pdf},\n url = \t {https://proceedings.mlr.press/v97/xie19b.html},\n abstract = \t {We present Zeno, a technique to make distributed machine learning, particularly Stochastic Gradient Descent (SGD), tolerant to an arbitrary number of faulty workers. Zeno generalizes previous results that assumed a majority of non-faulty nodes; we need assume only one non-faulty worker. Our key idea is to suspect workers that are potentially defective. Since this is likely to lead to false positives, we use a ranking-based preference mechanism. We prove the convergence of SGD for non-convex problems under these scenarios. 
Experimental results show that Zeno outperforms existing approaches.}\n}", "pdf": "http://proceedings.mlr.press/v97/xie19b/xie19b.pdf", "supp": "", "pdf_size": 453169, "gs_citation": 346, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10331500453771682409&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff": "Department of Computer Science, University of Illinois, Urbana-Champaign, USA; Department of Computer Science, University of Illinois, Urbana-Champaign, USA; Department of Computer Science, University of Illinois, Urbana-Champaign, USA", "aff_domain": "illinois.edu; ; ", "email": "illinois.edu; ; ", "github": "", "project": "", "author_num": 3, "oa": "https://proceedings.mlr.press/v97/xie19b.html", "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Illinois, Urbana-Champaign", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "title": "Zero-Shot Knowledge Distillation in Deep Networks", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/3836", "id": "3836", "author_site": "Gaurav Kumar Nayak, Konda Reddy Mopuri, Vaisakh Shaj, Venkatesh Babu Radhakrishnan, Anirban Chakraborty", "author": "Gaurav Kumar Nayak; Konda Reddy Mopuri; Vaisakh Shaj; Venkatesh Babu Radhakrishnan; Anirban Chakraborty", "abstract": "Knowledge distillation deals with the problem of training a smaller model (", "bibtex": "@InProceedings{pmlr-v97-nayak19a,\n title = \t {Zero-Shot Knowledge Distillation in Deep Networks},\n author = {Nayak, Gaurav Kumar and Mopuri, Konda Reddy and Shaj, Vaisakh and Radhakrishnan, Venkatesh Babu and Chakraborty, Anirban},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {4743--4751},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/nayak19a/nayak19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/nayak19a.html},\n abstract = \t {Knowledge distillation deals with the problem of training a smaller model (", "pdf": "http://proceedings.mlr.press/v97/nayak19a/nayak19a.pdf", "supp": "", "pdf_size": 1642413, "gs_citation": 306, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6513271489867205724&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff": "Department of Computational and Data Sciences, Indian Institute of Science, Bangalore, India+School of Informatics, University of Edinburgh, United Kingdom+University of Lincoln, United Kingdom; Department of Computational and Data Sciences, Indian Institute of Science, Bangalore, India; University of Lincoln, United Kingdom; Department of Computational and Data Sciences, Indian Institute of Science, Bangalore, India; Department of Computational and Data Sciences, Indian Institute of Science, Bangalore, India", "aff_domain": "iisc.ac.in; ; ; ; ", "email": "iisc.ac.in; ; ; ; ", "github": "", "project": "", "author_num": 5, "oa": "https://proceedings.mlr.press/v97/nayak19a.html", "aff_unique_index": "0+1+2;0;2;0;0", "aff_unique_norm": "Indian Institute of Science;University of Edinburgh;University of Lincoln", "aff_unique_dep": "Department of Computational and Data Sciences;School of 
Informatics;", "aff_unique_url": "https://www.iisc.ac.in;https://www.ed.ac.uk;https://www.lincoln.ac.uk", "aff_unique_abbr": "IISc;Edinburgh;UoL", "aff_campus_unique_index": "0+1;0;0;0", "aff_campus_unique": "Bangalore;Edinburgh;", "aff_country_unique_index": "0+1+1;0;1;0;0", "aff_country_unique": "India;United Kingdom" }, { "title": "kernelPSI: a Post-Selection Inference Framework for Nonlinear Variable Selection", "status": "Oral", "track": "main", "site": "https://icml.cc/virtual/2019/poster/4164", "id": "4164", "author_site": "Lotfi Slim, Cl\u00e9ment Chatelain, Chloe-Agathe Azencott, Jean-Philippe Vert", "author": "Lotfi Slim; Cl\u00e9ment Chatelain; Chloe-Agathe Azencott; Jean-Philippe Vert", "abstract": "Model selection is an essential task for many applications in scientific discovery. The most common approaches rely on univariate linear measures of association between each feature and the outcome. Such classical selection procedures fail to take into account nonlinear effects and interactions between features. Kernel-based selection procedures have been proposed as a solution. However, current strategies for kernel selection fail to measure the significance of a joint model constructed through the combination of the basis kernels. In the present work, we exploit recent advances in post-selection inference to propose a valid statistical test for the association of a joint model of the selected kernels with the outcome. The kernels are selected via a step-wise procedure which we model as a succession of quadratic constraints in the outcome variable.", "bibtex": "@InProceedings{pmlr-v97-slim19a,\n title = \t {kernel{PSI}: a Post-Selection Inference Framework for Nonlinear Variable Selection},\n author = {Slim, Lotfi and Chatelain, Cl{\\'e}ment and Azencott, Chloe-Agathe and Vert, Jean-Philippe},\n booktitle = \t {Proceedings of the 36th International Conference on Machine Learning},\n pages = \t {5857--5865},\n year = \t {2019},\n editor = \t {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},\n volume = \t {97},\n series = \t {Proceedings of Machine Learning Research},\n month = \t {09--15 Jun},\n publisher = {PMLR},\n pdf = \t {http://proceedings.mlr.press/v97/slim19a/slim19a.pdf},\n url = \t {https://proceedings.mlr.press/v97/slim19a.html},\n abstract = \t {Model selection is an essential task for many applications in scientific discovery. The most common approaches rely on univariate linear measures of association between each feature and the outcome. Such classical selection procedures fail to take into account nonlinear effects and interactions between features. Kernel-based selection procedures have been proposed as a solution. However, current strategies for kernel selection fail to measure the significance of a joint model constructed through the combination of the basis kernels. In the present work, we exploit recent advances in post-selection inference to propose a valid statistical test for the association of a joint model of the selected kernels with the outcome. 
The kernels are selected via a step-wise procedure which we model as a succession of quadratic constraints in the outcome variable.}\n}", "pdf": "http://proceedings.mlr.press/v97/slim19a/slim19a.pdf", "supp": "", "pdf_size": 438496, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13538019059151790540&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff": "Translational Sciences, SANOFI R&D, France+MINES ParisTech, PSL Research University, CBIO - Centre for Computational Biology, F-75006 Paris, France; MINES ParisTech, PSL Research University, CBIO - Centre for Computational Biology, F-75006 Paris, France; Institut Curie, PSL Research University, INSERM, U900, F-75005 Paris, France+MINES ParisTech, PSL Research University, CBIO - Centre for Computational Biology, F-75006 Paris, France; Google Brain, F-75009 Paris, France+MINES ParisTech, PSL Research University, CBIO - Centre for Computational Biology, F-75006 Paris, France", "aff_domain": "mines-paristech.fr; ; ;google.com", "email": "mines-paristech.fr; ; ;google.com", "github": "", "project": "", "author_num": 4, "oa": "https://proceedings.mlr.press/v97/slim19a.html", "aff_unique_index": "0+1;1;2+1;3+1", "aff_unique_norm": "SANOFI R&D;MINES ParisTech;Institut Curie;Google", "aff_unique_dep": "Translational Sciences;CBIO - Centre for Computational Biology;;Google Brain", "aff_unique_url": "https://www.sanofi.com;https://www.minesparistech.fr;https://www.institut-curie.org;https://brain.google.com", "aff_unique_abbr": "Sanofi;MINES ParisTech;Institut Curie;Google Brain", "aff_campus_unique_index": "1;1;1+1;1+1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0+0;0;0+0;0+0", "aff_country_unique": "France" } ]