[ { "id": "B1e-kxSKDH", "title": "Structured Object-Aware Physics Prediction for Video Modeling and Planning", "track": "main", "status": "Poster", "tldr": "We propose a structured object-aware video prediction model, which explicitly reasons about objects and demonstrate that it provides high-quality long term video predictions for planning.", "abstract": "When humans observe a physical system, they can easily locate components, understand their interactions, and anticipate future behavior, even in settings with complicated and previously unseen interactions. For computers, however, learning such models from videos in an unsupervised fashion is an unsolved research problem. In this paper, we present STOVE, a novel state-space model for videos, which explicitly reasons about objects and their positions, velocities, and interactions. It is constructed by combining an image model and a dynamics model in a compositional manner and improves on previous work by reusing the dynamics model for inference, accelerating and regularizing training. STOVE predicts videos with convincing physical behavior over hundreds of timesteps, outperforms previous unsupervised models, and even approaches the performance of supervised baselines. We further demonstrate the strength of our model as a simulator for sample efficient model-based control, in a task with heavily interacting objects.\n", "keywords": "self-supervised learning;probabilistic deep learning;structured models;video prediction;physics prediction;planning;variational autoencoders;model-based reinforcement learning;VAEs;unsupervised;variational;graph neural networks;tractable probabilistic models;attend-infer-repeat;relational learning;AIR;sum-product networks;object-oriented;object-centric;object-aware;MCTS", "primary_area": "", "supplementary_material": "", "author": "Jannik Kossen;Karl Stelzner;Marcel Hussing;Claas Voelcker;Kristian Kersting", "authorids": "kossen@stud.uni-heidelberg.de;stelzner@cs.tu-darmstadt.de;marcel.hussing@stud.tu-darmstadt.de;c.voelcker@stud.tu-darmstadt.de;kersting@cs.tu-darmstadt.de", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nKossen2020Structured,\ntitle={Structured Object-Aware Physics Prediction for Video Modeling and Planning},\nauthor={Jannik Kossen and Karl Stelzner and Marcel Hussing and Claas Voelcker and Kristian Kersting},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1e-kxSKDH}\n}", "github": "https://github.com/ICLR20/STOVE", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1e-kxSKDH", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "779;585;401", "wc_reply_reviewers": "65;0;0", "wc_reply_authors": "842;778;500", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 588.3333333333334, 154.33585311117946 ], "wc_reply_reviewers_avg": [ 21.666666666666668, 30.641293851417057 ], "wc_reply_authors_avg": [ 706.6666666666666, 148.4527609114166 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 73, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=9673300822333166750&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "B1e3OlStPB", "title": "DeepSphere: a graph-based spherical CNN", "track": "main", "status": "Spotlight", "tldr": "A graph-based spherical CNN that strikes an interesting balance of trade-offs for a wide variety of applications.", "abstract": "Designing a convolution for a spherical neural network requires a delicate tradeoff between efficiency and rotation equivariance. DeepSphere, a method based on a graph representation of the discretized sphere, strikes a controllable balance between these two desiderata. This contribution is twofold. First, we study both theoretically and empirically how equivariance is affected by the underlying graph with respect to the number of pixels and neighbors. Second, we evaluate DeepSphere on relevant problems. Experiments show state-of-the-art performance and demonstrates the efficiency and flexibility of this formulation. Perhaps surprisingly, comparison with previous work suggests that anisotropic filters might be an unnecessary price to pay. Our code is available at https://github.com/deepsphere.", "keywords": "spherical cnns;graph neural networks;geometric deep learning", "primary_area": "", "supplementary_material": "", "author": "Micha\u00ebl Defferrard;Martino Milani;Fr\u00e9d\u00e9rick Gusset;Nathana\u00ebl Perraudin", "authorids": "michael.defferrard@epfl.ch;martino.milani@epfl.ch;frederick.gusset@epfl.ch;nathanael.perraudin@sdsc.ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nDefferrard2020DeepSphere:,\ntitle={DeepSphere: a graph-based spherical CNN},\nauthor={Micha\u00ebl Defferrard and Martino Milani and Fr\u00e9d\u00e9rick Gusset and Nathana\u00ebl Perraudin},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1e3OlStPB}\n}", "github": "https://github.com/deepsphere", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1e3OlStPB", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;0;0", "wc_review": "155;184;258", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "96;107;407", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 199.0, 43.36665385600631 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 203.33333333333334, 144.0840803913542 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17982837150918641650&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1e5NySKwH", "title": "Instant Quantization of Neural Networks using Monte Carlo Methods", "track": "main", "status": "Withdraw", "tldr": "Monte Carlo methods for quantizing pre-trained models without any additional training.", "abstract": "Low bit-width integer weights and activations are very important for efficient inference, especially with respect to lower power consumption. We propose to apply Monte Carlo methods and importance sampling to sparsify and quantize pre-trained neural networks without any retraining. 
We obtain sparse, low bit-width integer representations that approximate the full precision weights and activations. The precision, sparsity, and complexity are easily configurable by the amount of sampling performed. Our approach, called Monte Carlo Quantization (MCQ), is linear in both time and space, while the resulting quantized sparse networks show minimal accuracy loss compared to the original full-precision networks. Our method either outperforms or achieves results competitive with methods that do require additional training on a variety of challenging tasks.", "keywords": "monte carlo;importance sampling;network quantization", "primary_area": "", "supplementary_material": "", "author": "Gon\u00e7alo Mordido;Matthijs Van Keirsbilck;Alexander Keller", "authorids": "goncalo.mordido@hpi.de;matthijsv@nvidia.com;akeller@nvidia.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1e5NySKwH", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "519;977;112", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 536.0, 353.3393081255844 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1043696544244402983&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "B1e5TA4FPr", "title": "Pareto Optimality in No-Harm Fairness", "track": "main", "status": "Reject", "tldr": "We propose a method to reduce risk disparity gaps between sensitive groups in classification and regression tasks following the no unnecessary harm principle, ensuring that tradeoffs are minimally costly to any subgroup", "abstract": "Common fairness definitions in machine learning focus on balancing various notions of disparity and utility. In this work we study fairness in the context of risk disparity among sub-populations. We introduce the framework of Pareto-optimal fairness, where the goal of reducing risk disparity gaps is secondary only to the principle of not doing unnecessary harm, a concept that is especially applicable to high-stakes domains such as healthcare. We provide analysis and methodology to obtain maximally-fair no-harm classifiers on finite datasets. We argue that even in domains where fairness at cost is required, no-harm fairness can prove to be the optimal first step. This same methodology can also be applied to any unbalanced classification task, where we want to dynamically equalize the misclassification risks across outcomes without degrading overall performance any more than strictly necessary. 
We test the proposed methodology on real case-studies of predicting income, ICU patient mortality, classifying skin lesions from images, and assessing credit risk, demonstrating how the proposed framework compares favorably to other traditional approaches.", "keywords": "Fairness;Fairness in Machine Learning;No-Harm Fairness", "primary_area": "", "supplementary_material": "", "author": "Natalia Martinez;Martin Bertran;Guillermo Sapiro", "authorids": "natalia.martinez@duke.edu;martin.bertran@duke.edu;guillermo.sapiro@duke.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmartinez2020pareto,\ntitle={Pareto Optimality in No-Harm Fairness},\nauthor={Natalia Martinez and Martin Bertran and Guillermo Sapiro},\nyear={2020},\nurl={https://openreview.net/forum?id=B1e5TA4FPr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1e5TA4FPr", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "458;180;622", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "331;216;384", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 420.0, 182.4353766862849 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 310.3333333333333, 70.12528470926557 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6OssJOxjAmEJ:scholar.google.com/&scioq=Pareto+Optimality+in+No-Harm+Fairness&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1e9Y2NYvS", "title": "On Robustness of Neural Ordinary Differential Equations", "track": "main", "status": "Spotlight", "tldr": "", "abstract": " Neural ordinary differential equations (ODEs) have been attracting increasing attention in various research domains recently. There have been some works studying optimization issues and approximation capabilities of neural ODEs, but their robustness is still yet unclear. In this work, we fill this important gap by exploring robustness properties of neural ODEs both empirically and theoretically. We first present an empirical study on the robustness of the neural ODE-based networks (ODENets) by exposing them to inputs with various types of perturbations and subsequently investigating the changes of the corresponding outputs. In contrast to conventional convolutional neural networks (CNNs), we find that the ODENets are more robust against both random Gaussian perturbations and adversarial attack examples. We then provide an insightful understanding of this phenomenon by exploiting a certain desirable property of the flow of a continuous-time ODE, namely that integral curves are non-intersecting. Our work suggests that, due to their intrinsic robustness, it is promising to use neural ODEs as a basic block for building robust deep network models. To further enhance the robustness of vanilla neural ODEs, we propose the time-invariant steady neural ODE (TisODE), which regularizes the flow on perturbed data via the time-invariant property and the imposition of a steady-state constraint. 
We show that the TisODE method outperforms vanilla neural ODEs and also can work in conjunction with other state-of-the-art architectural methods to build more robust deep networks.", "keywords": "Neural ODE", "primary_area": "", "supplementary_material": "", "author": "Hanshu YAN;Jiawei DU;Vincent TAN;Jiashi FENG", "authorids": "hanshu.yan@u.nus.edu;dujiawei@u.nus.edu;vtan@nus.edu.sg;elefjia@nus.edu.sg", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nYAN2020On,\ntitle={On Robustness of Neural Ordinary Differential Equations},\nauthor={Hanshu YAN and Jiawei DU and Vincent TAN and Jiashi FENG},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1e9Y2NYvS}\n}", "github": "https://github.com/HanshuYAN/TisODE", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1e9Y2NYvS", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;0;0", "wc_review": "463;194;296", "wc_reply_reviewers": "154;0;0", "wc_reply_authors": "1495;266;273", "reply_reviewers": "1;0;0", "reply_authors": "5;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 317.6666666666667, 110.88232000138204 ], "wc_reply_reviewers_avg": [ 51.333333333333336, 72.59629620181887 ], "wc_reply_authors_avg": [ 678.0, 577.7133083690098 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 188, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12991236712487678100&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1eB5xSFvr", "title": "DiffTaichi: Differentiable Programming for Physical Simulation", "track": "main", "status": "Poster", "tldr": "We study the problem of learning and optimizing through physical simulations via differentiable programming, using our proposed DiffSim programming language and compiler.", "abstract": "We present DiffTaichi, a new differentiable programming language tailored for building high-performance differentiable physical simulators. Based on an imperative programming language, DiffTaichi generates gradients of simulation steps using source code transformations that preserve arithmetic intensity and parallelism. A light-weight tape is used to record the whole simulation program structure and replay the gradient kernels in a reversed order, for end-to-end backpropagation.\nWe demonstrate the performance and productivity of our language in gradient-based learning and optimization tasks on 10 different physical simulators. 
For example, a differentiable elastic object simulator written in our language is 4.2x shorter than the hand-engineered CUDA version yet runs as fast, and is 188x faster than the TensorFlow implementation.\nUsing our differentiable programs, neural network controllers are typically optimized within only tens of iterations.", "keywords": "Differentiable programming;robotics;optimal control;physical simulation;machine learning system", "primary_area": "", "supplementary_material": "", "author": "Yuanming Hu;Luke Anderson;Tzu-Mao Li;Qi Sun;Nathan Carr;Jonathan Ragan-Kelley;Fredo Durand", "authorids": "yuanmhu@gmail.com;lukea@mit.edu;tzumao@berkeley.edu;qisu@adobe.com;ncarr@adobe.com;jrk@berkeley.edu;fredo@mit.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nHu2020DiffTaichi:,\ntitle={DiffTaichi: Differentiable Programming for Physical Simulation},\nauthor={Yuanming Hu and Luke Anderson and Tzu-Mao Li and Qi Sun and Nathan Carr and Jonathan Ragan-Kelley and Fredo Durand},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eB5xSFvr}\n}", "github": "https://github.com/yuanming-hu/difftaichi", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eB5xSFvr", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "428;427;276", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "305;450;248", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 377.0, 71.41895173318261 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 334.3333333333333, 85.0346334671285 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 486, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16308007401739546779&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7 }, { "id": "B1eBoJStwr", "title": "Semi-supervised semantic segmentation needs strong, high-dimensional perturbations", "track": "main", "status": "Reject", "tldr": "Why semi-supervised semantic segmentation is a challenging problem (no cluster assumption) and how to get consistency regularisation to work", "abstract": "Consistency regularization describes a class of approaches that have yielded ground breaking results in semi-supervised classification problems. Prior work has established the cluster assumption\\,---\\,under which the data distribution consists of uniform class clusters of samples separated by low density regions\\,---\\,as key to its success. We analyze the problem of semantic segmentation and find that the data distribution does not exhibit low density regions separating classes and offer this as an explanation for why semi-supervised segmentation is a challenging problem. \nWe then identify the conditions that allow consistency regularization to work even without such low-density regions. 
\nThis allows us to generalize the recently proposed CutMix augmentation technique to a powerful masked variant, CowMix, \nleading to a successful application of consistency regularization in the semi-supervised semantic segmentation setting and\nreaching state-of-the-art results in several standard datasets.", "keywords": "computer vision;semantic segmentation;semi-supervised;consistency regularisation", "primary_area": "", "supplementary_material": "", "author": "Geoff French;Timo Aila;Samuli Laine;Michal Mackiewicz;Graham Finlayson", "authorids": "g.french@uea.ac.uk;taila@nvidia.com;slaine@nvidia.com;m.mackiewicz@uea.ac.uk;g.finlayson@uea.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nfrench2020semisupervised,\ntitle={Semi-supervised semantic segmentation needs strong, high-dimensional perturbations},\nauthor={Geoff French and Timo Aila and Samuli Laine and Michal Mackiewicz and Graham Finlayson},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eBoJStwr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eBoJStwr", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "272;238;662", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "367;302;558", "reply_reviewers": "0;0;0", "reply_authors": "1;2;2", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 390.6666666666667, 192.36308262126482 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 409.0, 108.64928286310345 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 123, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=327565071745453507&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1eCk1StPH", "title": "The Generalization-Stability Tradeoff in Neural Network Pruning", "track": "main", "status": "Reject", "tldr": "We demonstrate that pruning methods which introduce greater instability into the loss also confer improved generalization, and explore the mechanisms underlying this effect.", "abstract": "Pruning neural network parameters is often viewed as a means to compress models, but pruning has also been motivated by the desire to prevent overfitting. This motivation is particularly relevant given the perhaps surprising observation that a wide variety of pruning approaches increase test accuracy despite sometimes massive reductions in parameter counts. To better understand this phenomenon, we analyze the behavior of pruning over the course of training, finding that pruning's effect on generalization relies more on the instability it generates (defined as the drops in test accuracy immediately following pruning) than on the final size of the pruned model. We demonstrate that even the pruning of unimportant parameters can lead to such instability, and show similarities between pruning and regularizing by injecting noise, suggesting a mechanism for pruning-based generalization improvements that is compatible with the strong generalization recently observed in over-parameterized networks.", "keywords": "pruning;generalization;stability;dynamics;regularization", "primary_area": "", "supplementary_material": "", "author": "Brian R. Bartoldson;Ari S. 
Morcos;Adrian Barbu;Gordon Erlebacher", "authorids": "bbartoldson@fsu.edu;arimorcos@gmail.com;abarbu@stat.fsu.edu;gerlebacher@fsu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbartoldson2020the,\ntitle={The Generalization-Stability Tradeoff in Neural Network Pruning},\nauthor={Brian R. Bartoldson and Ari S. Morcos and Adrian Barbu and Gordon Erlebacher},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eCk1StPH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eCk1StPH", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "397;199;476", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "778;1346;845", "reply_reviewers": "0;0;0", "reply_authors": "2;3;2", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 357.3333333333333, 116.51132515291759 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 989.6666666666666, 253.44602756580915 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2374906680152964127&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "B1eP504YDr", "title": "Independence-aware Advantage Estimation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Most of existing advantage function estimation methods in reinforcement learning suffer from the problem of high variance, which scales unfavorably with the time horizon. To address this challenge, we propose to identify the independence property between current action and future states in environments, which can be further leveraged to effectively reduce the variance of the advantage estimation. In particular, the recognized independence property can be naturally utilized to construct a novel importance sampling advantage estimator with close-to-zero variance even when the Monte-Carlo return signal yields a large variance. To further remove the risk of the high variance introduced by the new estimator, we combine it with existing Monte-Carlo estimator via a reward decomposition model learned by minimizing the estimation variance. Experiments demonstrate that our method achieves higher sample efficiency compared with existing advantage estimation methods in complex environments. 
", "keywords": "Reinforcement Learning;Advantage Estimation", "primary_area": "", "supplementary_material": "", "author": "Pushi Zhang;Li Zhao;Guoqing Liu;Jiang Bian;Minglie Huang;Tao Qin;Tie-Yan Liu", "authorids": "zpschang@gmail.com;lizo@microsoft.com;lgq1001@mail.ustc.edu.cn;jiang.bian@microsoft.com;aihuang@mails.tsinghua.edu.cn;taoqin@microsoft.com;tie-yan.liu@microsoft.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nzhang2020independenceaware,\ntitle={Independence-aware Advantage Estimation},\nauthor={Pushi Zhang and Li Zhao and Guoqing Liu and Jiang Bian and Minglie Huang and Tao Qin and Tie-Yan Liu},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eP504YDr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eP504YDr", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "294;985;588", "wc_reply_reviewers": "249;19;16", "wc_reply_authors": "1210;310;483", "reply_reviewers": "1;1;1", "reply_authors": "2;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 622.3333333333334, 283.14228853273676 ], "wc_reply_reviewers_avg": [ 94.66666666666667, 109.13701887484781 ], "wc_reply_authors_avg": [ 667.6666666666666, 389.937031953736 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8968843976194939566&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1eQcCEtDB", "title": "Calibration, Entropy Rates, and Memory in Language Models", "track": "main", "status": "Reject", "tldr": "", "abstract": "Building accurate language models that capture meaningful long-term dependencies is a core challenge in natural language processing. Towards this end, we present a calibration-based approach to measure long-term discrepancies between a generative sequence model and the true distribution, and use these discrepancies to improve the model. Empirically, we show that state-of-the-art language models, including LSTMs and Transformers, are \\emph{miscalibrated}: the entropy rates of their generations drift dramatically upward over time. We then provide provable methods to mitigate this phenomenon. 
Furthermore, we show how this calibration-based approach can also be used to measure the amount of memory that language models use for prediction.", "keywords": "information theory;natural language processing;calibration", "primary_area": "", "supplementary_material": "", "author": "Mark Braverman;Xinyi Chen;Sham Kakade;Karthik Narasimhan;Cyril Zhang;Yi Zhang", "authorids": "mbraverm@cs.princeton.edu;xinyic@google.com;sham@cs.washington.edu;karthikn@cs.princeton.edu;cyril.zhang@cs.princeton.edu;y.zhang@cs.princeton.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nbraverman2020calibration,\ntitle={Calibration, Entropy Rates, and Memory in Language Models},\nauthor={Mark Braverman and Xinyi Chen and Sham Kakade and Karthik Narasimhan and Cyril Zhang and Yi Zhang},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eQcCEtDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=B1eQcCEtDB", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "704;755;129", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "398;335;104", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 529.3333333333334, 283.8430708840519 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 279.0, 126.38829059687451 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6805814744150690941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14 }, { "id": "B1eWOJHKvB", "title": "Kernel of CycleGAN as a principal homogeneous space", "track": "main", "status": "Poster", "tldr": "The space of approximate solutions of CycleGAN admits a lot of symmetry, and an identity loss does not fix this.", "abstract": "Unpaired image-to-image translation has attracted significant interest due to the invention of CycleGAN, a method which utilizes a combination of adversarial and cycle consistency losses to avoid the need for paired data. It is known that the CycleGAN problem might admit multiple solutions, and our goal in this paper is to analyze the space of exact solutions and to give perturbation bounds for approximate solutions. We show theoretically that the exact solution space is invariant with respect to automorphisms of the underlying probability spaces, and, furthermore, that the group of automorphisms acts freely and transitively on the space of exact solutions. We examine the case of zero pure CycleGAN loss first in its generality, and, subsequently, expand our analysis to approximate solutions for extended CycleGAN loss where identity loss term is included. In order to demonstrate that these results are applicable, we show that under mild conditions nontrivial smooth automorphisms exist. Furthermore, we provide empirical evidence that neural networks can learn these automorphisms with unexpected and unwanted results. 
We conclude that finding optimal solutions to the CycleGAN loss does not necessarily lead to the envisioned result in image-to-image translation tasks and that underlying hidden symmetries can render the result useless.", "keywords": "Generative models;CycleGAN", "primary_area": "", "supplementary_material": "", "author": "Nikita Moriakov;Jonas Adler;Jonas Teuwen", "authorids": "nikita.moriakov@radboudumc.nl;jonasadl@kth.se;jonas.teuwen@radboudumc.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nMoriakov2020Kernel,\ntitle={Kernel of CycleGAN as a principal homogeneous space},\nauthor={Nikita Moriakov and Jonas Adler and Jonas Teuwen},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eWOJHKvB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eWOJHKvB", "pdf_size": 0, "rating": "3;6;8", "confidence": "0;0;0", "wc_review": "323;318;104", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "421;250;39", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 248.33333333333334, 102.07948972355916 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 236.66666666666666, 156.23557711211475 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16616791058364409013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "B1eWbxStPH", "title": "Directional Message Passing for Molecular Graphs", "track": "main", "status": "Spotlight", "tldr": "Directional message passing incorporates spatial directional information to improve graph neural networks.", "abstract": "Graph neural networks have recently achieved great successes in predicting quantum mechanical properties of molecules. These models represent a molecule as a graph using only the distance between atoms (nodes). They do not, however, consider the spatial direction from one atom to another, despite directional information playing a central role in empirical potentials for molecules, e.g. in angular potentials. To alleviate this limitation we propose directional message passing, in which we embed the messages passed between atoms instead of the atoms themselves. Each message is associated with a direction in coordinate space. These directional message embeddings are rotationally equivariant since the associated directions rotate with the molecule. We propose a message passing scheme analogous to belief propagation, which uses the directional information by transforming messages based on the angle between them. Additionally, we use spherical Bessel functions and spherical harmonics to construct theoretically well-founded, orthogonal representations that achieve better performance than the currently prevalent Gaussian radial basis representations while using fewer than 1/4 of the parameters. We leverage these innovations to construct the directional message passing neural network (DimeNet). DimeNet outperforms previous GNNs on average by 76% on MD17 and by 31% on QM9. 
Our implementation is available online.", "keywords": "GNN;Graph neural network;message passing;graphs;equivariance;molecules", "primary_area": "", "supplementary_material": "", "author": "Johannes Gasteiger;Janek Gro\u00df;Stephan G\u00fcnnemann", "authorids": "j.gasteiger@in.tum.de;grossja@in.tum.de;guennemann@in.tum.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nGasteiger2020Directional,\ntitle={Directional Message Passing for Molecular Graphs},\nauthor={Johannes Gasteiger and Janek Gro\u00df and Stephan G\u00fcnnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eWbxStPH}\n}", "github": "https://www.daml.in.tum.de/dimenet", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eWbxStPH", "pdf_size": 0, "rating": "6;8;8", "confidence": "0;0;0", "wc_review": "283;139;464", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "256;67;204", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 295.3333333333333, 132.96699674062816 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 175.66666666666666, 79.71755696763974 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 584, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18349010234285626260&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1eWu0NtDS", "title": "Neuron ranking - an informed way to compress convolutional neural networks", "track": "main", "status": "Withdraw", "tldr": "We propose CNN neuron ranking with two different methods and show their consistency in producing the result which allows to interpret what network deems important and compress the network by keeping the most relevant nodes.", "abstract": "Convolutional neural networks (CNNs) in recent years have made a dramatic impact in science, technology and industry, yet the theoretical mechanism of CNN architecture design remains surprisingly vague. The CNN neurons, including its distinctive element, convolutional filters, are known to be learnable features, yet their individual role in producing the output is rather unclear. The thesis of this work is that not all neurons are equally important and some of them contain more useful information to perform a given task. Hence, we propose to quantify and rank neuron importance, and directly incorporate neuron importance in the objective function under two formulations: (1) a game theoretical approach based on Shapley value which computes the marginal contribution of each filter; and (2) a probabilistic approach based on what-we-call, the importance switch using variational inference. Using these two methods we confirm the general theory that some of the neurons are inherently more important than the others. Various experiments illustrate that learned ranks can be readily useable for structured network compression and interpretability of learned features. 
", "keywords": "convolutional neural network;compression;shapley value;importance switch;variational inference;interpretability", "primary_area": "", "supplementary_material": "", "author": "Kamil Adamczewski;Mijung Park", "authorids": "kamil.m.adamczewski@gmail.com;mijung.park@tuebingen.mpg.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eWu0NtDS", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "255;961;371", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 529.0, 309.1191787428704 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:r1bZzqANhn0J:scholar.google.com/&scioq=Neuron+ranking+-+an+informed+way+to+compress+convolutional+neural+networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1eX_a4twH", "title": "Superseding Model Scaling by Penalizing Dead Units and Points with Separation Constraints", "track": "main", "status": "Reject", "tldr": "We propose using a set of constraints to penalize dead neurons and points in order to train very deep networks of constant width.", "abstract": "In this article, we study a proposal that enables to train extremely thin (4 or 8 neurons per layer) and relatively deep (more than 100 layers) feedforward networks without resorting to any architectural modification such as Residual or Dense connections, data normalization or model scaling. We accomplish that by alleviating two problems. One of them are neurons whose output is zero for all the dataset, which renders them useless. This problem is known to the academic community as \\emph{dead neurons}. The other is a less studied problem, dead points. Dead points refers to data points that are mapped to zero during the forward pass of the network. As such, the gradient generated by those points is not propagated back past the layer where they die, thus having no effect in the training process. In this work, we characterize both problems and propose a constraint formulation that added to the standard loss function solves them both. As an additional benefit, the proposed method allows to initialize the network weights with constant or even zero values and still allowing the network to converge to reasonable results. 
We show very promising results on a toy dataset, MNIST, and CIFAR-10.", "keywords": "Dead Point;Dead Unit;Model Scaling;Separation Constraints;Dying ReLU;Constant Width;Deep Neural Networks;Backpropagation", "primary_area": "", "supplementary_material": "", "author": "Carles Riera;Camilo Rey-Torres;Eloi Puertas;Oriol Pujol", "authorids": "blauigris@gmail.com;camilorey@gmail.com;epuertas@ub.edu;oriol_pujol@ub.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nriera2020superseding,\ntitle={Superseding Model Scaling by Penalizing Dead Units and Points with Separation Constraints},\nauthor={Carles Riera and Camilo Rey-Torres and Eloi Puertas and Oriol Pujol},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eX_a4twH}\n}", "github": "https://www.dropbox.com/s/kl96825sae12zkc/sep_cons.zip?dl=0", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eX_a4twH", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "278;821;126", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 408.3333333333333, 298.3245808771975 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:97uB-W_74NwJ:scholar.google.com/&scioq=Superseding+Model+Scaling+by+Penalizing+Dead+Units+and+Points+with+Separation+Constraints&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1eXvyHKwS", "title": "THE EFFECT OF ADVERSARIAL TRAINING: A THEORETICAL CHARACTERIZATION", "track": "main", "status": "Reject", "tldr": "We prove that adversarial training with a linear classifier can rapidly converge to a robust solution. In addition, adversarial training is stable to outliers in the dataset.", "abstract": "It has been widely shown that adversarial training (Madry et al., 2018) is empirically effective in defending against adversarial attacks. However, the theoretical understanding of the difference between the solution of adversarial training and that of standard training is limited. In this paper, we characterize the solution of adversarial training for the linear classification problem for a full range of the adversarial radius \u03b5. Specifically, we show that if the data themselves are \u03b5-strongly linearly-separable, adversarial training with radius smaller than \u03b5 converges to the hard margin solution of SVM with a faster rate than standard training. If the data themselves are not \u03b5-strongly linearly-separable, we show that adversarial training with radius \u03b5 is stable to outliers while standard training is not. Moreover, we prove that the classifier returned by adversarial training with a large radius \u03b5 has low confidence in each data point. 
Experiments corroborate our theoretical finding well.", "keywords": "adversarial training;robustness;separable data", "primary_area": "", "supplementary_material": "", "author": "Mingyang Yi;Huishuai Zhang;Wei Chen;Zhi-Ming Ma;Tie-Yan Liu", "authorids": "yimingyang17@mails.ucas.edu.cn;huzhang@microsoft.com;wche@microsoft.com;mazm@amt.ac.cn;tie-yan.liu@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyi2020the,\ntitle={{\\{}THE{\\}} {\\{}EFFECT{\\}} {\\{}OF{\\}} {\\{}ADVERSARIAL{\\}} {\\{}TRAINING{\\}}: A {\\{}THEORETICAL{\\}} {\\{}CHARACTERIZATION{\\}}},\nauthor={Mingyang Yi and Huishuai Zhang and Wei Chen and Zhi-Ming Ma and Tie-Yan Liu},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eXvyHKwS}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eXvyHKwS", "pdf_size": 0, "rating": "1;1;1", "confidence": "0;0;0", "wc_review": "828;397;594", "wc_reply_reviewers": "0;91;0", "wc_reply_authors": "363;677;518", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 1.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 606.3333333333334, 176.17100278485736 ], "wc_reply_reviewers_avg": [ 30.333333333333332, 42.897811391983886 ], "wc_reply_authors_avg": [ 519.3333333333334, 128.19343023554507 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZceMslkvVsYJ:scholar.google.com/&scioq=THE+EFFECT+OF+ADVERSARIAL+TRAINING:+A+THEORETICAL+CHARACTERIZATION&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1eXygBFPH", "title": "Attacking Graph Convolutional Networks via Rewiring", "track": "main", "status": "Reject", "tldr": "Using rewiring operation to conduct adversarial attacks on graph structured data.", "abstract": "Graph Neural Networks (GNNs) have boosted the performance of many graph related tasks such as node classification and graph classification. Recent researches show that graph neural networks are vulnerable to adversarial attacks, which deliberately add carefully created unnoticeable perturbation to the graph structure. The perturbation is usually created by adding/deleting a few edges, which might be noticeable even when the number of edges modified is small. In this paper, we propose a graph rewiring operation which affects the graph in a less noticeable way compared to adding/deleting edges. We then use reinforcement learning to learn the attack strategy based on the proposed rewiring operation. Experiments on real world graphs demonstrate the effectiveness of the proposed framework. 
To understand the proposed framework, we further analyze how its generated perturbation to the graph structure affects the output of the target model.", "keywords": "Graph Neural Networks;Rewiring;Adversarial Attacks", "primary_area": "", "supplementary_material": "", "author": "Yao Ma;Suhang Wang;Tyler Derr;Lingfei Wu;Jiliang Tang", "authorids": "mayao4@msu.edu;szw494@psu.edu;derrtyle@msu.edu;wuli@us.ibm.com;tangjili@msu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nma2020attacking,\ntitle={Attacking Graph Convolutional Networks via Rewiring},\nauthor={Yao Ma and Suhang Wang and Tyler Derr and Lingfei Wu and Jiliang Tang},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eXygBFPH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eXygBFPH", "pdf_size": 0, "rating": "3;3;6;6", "confidence": "0;0;0;0", "wc_review": "477;304;224;290", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1111;686;641;697", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "rating_avg": [ 4.5, 1.5 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 323.75, 93.49431800917101 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 783.75, 190.09915176033795 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=943873232204954325&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "B1eYGkBKDB", "title": "Fully Quantized Transformer for Improved Translation", "track": "main", "status": "Withdraw", "tldr": "We fully quantize the Transformer to 8-bit and improve translation quality compared to the full precision model.", "abstract": "State-of-the-art neural machine translation methods employ massive amounts of parameters. Drastically reducing computational costs of such methods without affecting performance has been up to this point unsolved. In this work, we propose a quantization strategy tailored to the Transformer architecture. We evaluate our method on the WMT14 EN-FR and WMT14 EN-DE translation tasks and achieve state-of-the-art quantization results for the Transformer, obtaining no loss in BLEU scores compared to the non-quantized baseline. 
We further compress the Transformer by showing that, once the model is trained, a good portion of the nodes in the encoder can be removed without causing any loss in BLEU.", "keywords": "Transformer;quantization;machine translation;compression;pruning", "primary_area": "", "supplementary_material": "", "author": "Gabriele Prato;Ella Charlaix;Mehdi Rezagholizadeh", "authorids": "prato.gab@gmail.com;ella.charlaix@huawei.com;mehdi.rezagholizadeh@huawei.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nprato2020fully,\ntitle={Fully Quantized Transformer for Improved Translation},\nauthor={Gabriele Prato and Ella Charlaix and Mehdi Rezagholizadeh},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eYGkBKDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eYGkBKDB", "pdf_size": 0, "rating": "1;3;3", "confidence": "0;0;0", "wc_review": "287;252;406", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "323;475;249", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 315.0, 65.91408549518583 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 349.0, 94.07798183776407 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11666785427644973911&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "B1eY_pVYvB", "title": "Efficient and Information-Preserving Future Frame Prediction and Beyond", "track": "main", "status": "Poster", "tldr": "", "abstract": "Applying resolution-preserving blocks is a common practice to maximize information preservation in video prediction, yet their high memory consumption greatly limits their application scenarios. We propose CrevNet, a Conditionally Reversible Network that uses reversible architectures to build a bijective two-way autoencoder and its complementary recurrent predictor. Our model enjoys the theoretically guaranteed property of no information loss during the feature extraction, much lower memory consumption and computational efficiency. The lightweight nature of our model enables us to incorporate 3D convolutions without concern of memory bottleneck, enhancing the model's ability to capture both short-term and long-term temporal dependencies. Our proposed approach achieves state-of-the-art results on Moving MNIST, Traffic4cast and KITTI datasets. We further demonstrate the transferability of our self-supervised learning method by exploiting its learnt features for object detection on KITTI. 
Our competitive results indicate the potential of using CrevNet as a generative pre-training strategy to guide downstream tasks.", "keywords": "self-supervised learning;generative pre-training;video prediction;reversible architecture", "primary_area": "", "supplementary_material": "", "author": "Wei Yu;Yichao Lu;Steve Easterbrook;Sanja Fidler", "authorids": "gnosis@cs.toronto.edu;yichao@cs.toronto.edu;sme@cs.toronto.edu;fidler@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nYu2020Efficient,\ntitle={Efficient and Information-Preserving Future Frame Prediction and Beyond},\nauthor={Wei Yu and Yichao Lu and Steve Easterbrook and Sanja Fidler},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eY_pVYvB}\n}", "github": "https://drive.google.com/file/d/1koVpH2RhkOl4_Xm_q8Iy1FuX3zQxC9gd/view?usp=sharing", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eY_pVYvB", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "149;309;243", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "452;767;240", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 233.66666666666666, 65.65228268858762 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 486.3333333333333, 216.5122526684242 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 142, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7190900656259459167&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B1eYlgBYPH", "title": "A Deep Recurrent Neural Network via Unfolding Reweighted l1-l1 Minimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep unfolding methods design deep neural networks as learned variations of optimization methods. These networks have been shown to achieve faster convergence and higher accuracy than the original optimization methods. In this line of research, this paper develops a novel deep recurrent neural network (coined reweighted-RNN) by unfolding a reweighted l1-l1 minimization algorithm and applies it to the task of sequential signal reconstruction. To the best of our knowledge, this is the first deep unfolding method that explores reweighted minimization. Due to the underlying reweighted minimization model, our RNN has a different soft-thresholding function (alias, different activation function) for each hidden unit in each layer. Furthermore, it has higher network expressivity than existing deep unfolding RNN models due to the over-parameterizing weights. Moreover, we establish theoretical generalization error bounds for the proposed reweighted-RNN model by means of Rademacher complexity. The bounds reveal that the parameterization of the proposed reweighted-RNN ensures good generalization. We apply the proposed reweighted-RNN to the problem of video-frame reconstruction from low-dimensional measurements, that is, sequential frame reconstruction. 
The experimental results on the moving MNIST dataset demonstrate that the proposed deep reweighted-RNN significantly outperforms existing RNN models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Huynh Van Luong;Duy Hung Le;Nikos Deligiannis", "authorids": "hvanluon@etrovub.be;dle@etrovub.be;ndeligia@etrovub.be", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nluong2020a,\ntitle={A Deep Recurrent Neural Network via Unfolding Reweighted l1-l1 Minimization},\nauthor={Huynh Van Luong and Duy Hung Le and Nikos Deligiannis},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eYlgBYPH}\n}", "github": "https://1drv.ms/u/s!ApHn770BvhH2aWay9xEhAiXydfo?e=aCX1X0", "project": "", "reviewers": "AnonReviewer1;AnonReviewer5;AnonReviewer4", "site": "https://openreview.net/forum?id=B1eYlgBYPH", "pdf_size": 0, "rating": "3;6;8", "confidence": "0;0;0", "wc_review": "255;340;216", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "495;641;543", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 270.3333333333333, 51.77086267604802 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 559.6666666666666, 60.75817274701039 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eJCcDW8dHjIJ:scholar.google.com/&scioq=A+Deep+Recurrent+Neural+Network+via+Unfolding+Reweighted+l1-l1+Minimization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1eZYkHYPS", "title": "Shifted Randomized Singular Value Decomposition", "track": "main", "status": "Reject", "tldr": "A randomized algorithm to estimate the SVD of a shifted data matrix without explicitly constructing the matrix in the memory.", "abstract": "We extend the randomized singular value decomposition (SVD) algorithm (Halko et al., 2011) to estimate the SVD of a shifted data matrix without explicitly constructing the matrix in the memory. With no loss in the accuracy of the original algorithm, the extended algorithm provides for a more efficient way of matrix factorization. The algorithm facilitates the low-rank approximation and principal component analysis (PCA) of off-center data matrices. 
When applied to different types of data matrices, our experimental results confirm the advantages of the extensions made to the original algorithm.", "keywords": "SVD;PCA;Randomized Algorithms", "primary_area": "", "supplementary_material": "", "author": "Ali Basirat", "authorids": "ali.basirat@lingfil.uu.se", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nbasirat2020shifted,\ntitle={Shifted Randomized Singular Value Decomposition},\nauthor={Ali Basirat},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eZYkHYPS}\n}", "github": "https://drive.google.com/file/d/1bjG5kAQ9WoTbQKFX41SnHW9eaik_SujD/view?usp=sharing", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eZYkHYPS", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "104;131;332", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 189.0, 101.7152889196113 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Bcpy3bwlTRIJ:scholar.google.com/&scioq=Shifted+Randomized+Singular+Value+Decomposition&hl=en&as_sdt=0,5", "gs_version_total": 6 }, { "id": "B1eZweHFwr", "title": "Statistical Verification of General Perturbations by Gaussian Smoothing", "track": "main", "status": "Reject", "tldr": "We present a statistical certification method to certify robustness for rotations, translations and other transformations.", "abstract": "We present a novel statistical certification method that generalizes prior work based on smoothing to handle richer perturbations. Concretely, our method produces a provable classifier which can establish statistical robustness against geometric perturbations (e.g., rotations, translations) as well as volume changes and pitch shifts on audio data. The generalization is non-trivial and requires careful handling of operations such as interpolation. 
Our method is agnostic to the choice of classifier and scales to modern architectures such as ResNet-50 on ImageNet.", "keywords": "adversarial robustness;certified network;randomised smoothing;geometric perturbations", "primary_area": "", "supplementary_material": "", "author": "Marc Fischer;Maximilian Baader;Martin Vechev", "authorids": "marcfisc@student.ethz.ch;mbaader@inf.ethz.ch;martin.vechev@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfischer2020statistical,\ntitle={Statistical Verification of General Perturbations by Gaussian Smoothing},\nauthor={Marc Fischer and Maximilian Baader and Martin Vechev},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eZweHFwr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eZweHFwr", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "206;267;350", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "122;209;167", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 274.3333333333333, 59.01600536201081 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 166.0, 35.52463933666322 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5256003582494489364&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1ecVlrtDr", "title": "Symmetric-APL Activations: Training Insights and Robustness to Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "Symmetric Adaptive Piecewise Linear activations are proposed as new activation function with deep explanation on training behavior and robustness to adversarial attacks.", "abstract": "Deep neural networks with learnable activation functions have shown superior performance over deep neural networks with fixed activation functions for many different problems. The adaptability of learnable activation functions adds expressive power to the model which results in better performance. Here, we propose a new learnable activation function based on Adaptive Piecewise Linear units (APL), which 1) gives equal expressive power to both the positive and negative halves on the input space and 2) is able to approximate any zero-centered continuous non-linearity in a closed interval. We investigate how the shape of the Symmetric-APL function changes during training and perform ablation studies to gain insight into the reason behind these changes. We hypothesize that these activation functions go through two distinct stages: 1) adding gradient information and 2) adding expressive power. Finally, we show that the use of Symmetric-APL activations can significantly increase the robustness of deep neural networks to adversarial attacks. 
Our experiments on both black-box and open-box adversarial attacks show that commonly-used architectures, namely Lenet, Network-in-Network, and ResNet-18 can be up to 51% more resistant to adversarial fooling by only using the proposed activation functions instead of ReLUs.", "keywords": "Activation function;Adaptive;Training;Robustness;Adversarial attack", "primary_area": "", "supplementary_material": "", "author": "Mohammadamin Tavakoli;Forest Agostinelli;Pierre Baldi", "authorids": "mohamadt@uci.edu;fagostin@uci.edu;pfbaldi@ics.uci.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntavakoli2020symmetricapl,\ntitle={Symmetric-{\\{}APL{\\}} Activations: Training Insights and Robustness to Adversarial Attacks},\nauthor={Mohammadamin Tavakoli and Forest Agostinelli and Pierre Baldi},\nyear={2020},\nurl={https://openreview.net/forum?id=B1ecVlrtDr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1ecVlrtDr", "pdf_size": 0, "rating": "1;3;6", "confidence": "0;0;0", "wc_review": "473;258;606", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "416;517;441", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 445.6666666666667, 143.37906247271795 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 458.0, 42.949582846247374 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_pqnYdbRaiwJ:scholar.google.com/&scioq=Symmetric-APL+Activations:+Training+Insights+and+Robustness+to+Adversarial+Attacks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1eiJyrtDB", "title": "Improved Generalization Bound of Permutation Invariant Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "We theoretically prove that a permutation invariant property of deep neural networks largely improves its generalization performance.", "abstract": "We theoretically prove that a permutation invariant property of deep neural networks largely improves its generalization performance. Learning problems with data that are invariant to permutations are frequently observed in various applications, for example, point cloud data and graph neural networks. Numerous methodologies have been developed and they achieve great performances, however, understanding a mechanism of the performance is still a developing problem. In this paper, we derive a theoretical generalization bound for invariant deep neural networks with a ReLU activation to clarify their mechanism. Consequently, our bound shows that the main term of their generalization gap is improved by $\\sqrt{n!}$ where $n$ is a number of permuting coordinates of data. Moreover, we prove that an approximation power of invariant deep neural networks can achieve an optimal rate, though the networks are restricted to be invariant. 
To achieve the results, we develop several new proof techniques such as correspondence with a fundamental domain and a scale-sensitive metric entropy.", "keywords": "Deep Neural Network;Invariance;Symmetry;Group;Generalization", "primary_area": "", "supplementary_material": "", "author": "Akiyoshi Sannai;Masaaki Imaizumi", "authorids": "akiyoshi.sannai@riken.jp;imaizumi@ism.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsannai2020improved,\ntitle={Improved Generalization Bound of Permutation Invariant Deep Neural Networks},\nauthor={Akiyoshi Sannai and Masaaki Imaizumi},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eiJyrtDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eiJyrtDB", "pdf_size": 0, "rating": "1;3;6", "confidence": "0;0;0", "wc_review": "188;477;397", "wc_reply_reviewers": "96;0;0", "wc_reply_authors": "272;116;164", "reply_reviewers": "1;0;0", "reply_authors": "2;2;2", "rating_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 354.0, 121.83869117265938 ], "wc_reply_reviewers_avg": [ 32.0, 45.254833995939045 ], "wc_reply_authors_avg": [ 184.0, 65.23802572120036 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8353791521378005488&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1eibJrtwr", "title": "Abstractive Dialog Summarization with Semantic Scaffolds", "track": "main", "status": "Reject", "tldr": "We propose a novel end-to-end model (SPNet) to incorporate semantic scaffolds for improving abstractive dialog summarization.", "abstract": "The demand for abstractive dialog summary is growing in real-world applications. For example, customer service center or hospitals would like to summarize customer service interaction and doctor-patient interaction. However, few researchers explored abstractive summarization on dialogs due to the lack of suitable datasets. We propose an abstractive dialog summarization dataset based on MultiWOZ. If we directly apply previous state-of-the-art document summarization methods on dialogs, there are two significant drawbacks: the informative entities such as restaurant names are difficult to preserve, and the contents from different dialog domains are sometimes mismatched. To address these two drawbacks, we propose Scaffold Pointer Network (SPNet) to utilize the existing annotation on speaker role, semantic slot and dialog domain. SPNet incorporates these semantic scaffolds for dialog summarization. Since ROUGE cannot capture the two drawbacks mentioned, we also propose a new evaluation metric that considers critical informative entities in the text. 
On MultiWOZ, our proposed SPNet outperforms state-of-the-art abstractive summarization methods on all the automatic and human evaluation metrics.", "keywords": "Abstractive Summarization;Dialog;Multi-task Learning", "primary_area": "", "supplementary_material": "", "author": "Lin Yuan;Zhou Yu", "authorids": "yuanlinzju@gmail.com;joyu@ucdavis.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nyuan2020abstractive,\ntitle={Abstractive Dialog Summarization with Semantic Scaffolds},\nauthor={Lin Yuan and Zhou Yu},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eibJrtwr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1eibJrtwr", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "138;403;274", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 271.6666666666667, 108.19837747807907 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13110243739500243556&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1eksh4KvH", "title": "CurricularFace: Adaptive Curriculum Learning Loss for Deep Face Recognition", "track": "main", "status": "Withdraw", "tldr": "A novel Adaptive Curriculum Learning loss for deep face recognition", "abstract": "As an emerging topic in face recognition, designing margin-based loss functions can increase the feature margin between different classes for enhanced discriminability. More recently, absorbing the idea of mining-based strategies is adopted to emphasize the misclassified samples and achieve promising results. However, during the entire training process, the prior methods either do not explicitly emphasize the sample based on its importance that renders the hard samples not fully exploited or explicitly emphasize the effects of semi-hard/hard samples even at the early training stage that may lead to convergence issues. In this work, we propose a novel Adaptive Curriculum Learning loss (CurricularFace) that embeds the idea of curriculum learning into the loss function to achieve a novel training strategy for deep face recognition, which mainly addresses easy samples in the early training stage and hard ones in the later stage. Specifically, our CurricularFace adaptively adjusts the relative importance of easy and hard samples during different training stages. In each stage, different samples are assigned with different importance according to their corresponding difficultness. Extensive experimental results on popular benchmarks demonstrate the superiority of our CurricularFace over the state-of-the-art competitors. 
Code will be available upon publication.", "keywords": "CurricularFace;Adaptive Curriculum Learning;Face Recognition", "primary_area": "", "supplementary_material": "", "author": "Yuge Huang;Yuhan Wang;Ying Tai;Xiaoming Liu;Pengcheng Shen;Shaoxin Li;Jilin Li;Feiyue Huang", "authorids": "huangyg@zju.edu.cn;wang_yuhan@zju.edu.cn;yingtai@tencent.com;liuxm@cse.msu.edu;quantshen@tencent.com;darwinli@tencent.com;jerolinli@tencent.com;garyhuang@tencent.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eksh4KvH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "217;279;316", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "364;366;202", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 270.6666666666667, 40.84387618997764 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 310.6666666666667, 76.84327484490032 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0, "gs_citation": 686, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17543857641780685133&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "B1elCp4KwH", "title": "Learning Hierarchical Discrete Linguistic Units from Visually-Grounded Speech", "track": "main", "status": "Talk", "tldr": "Vector quantization layers incorporated into a self-supervised neural model of speech audio learn hierarchical and discrete linguistic units (phone-like, word-like) when trained with a visual-grounding objective. ", "abstract": "In this paper, we present a method for learning discrete linguistic units by incorporating vector quantization layers into neural models of visually grounded speech. We show that our method is capable of capturing both word-level and sub-word units, depending on how it is configured. What differentiates this paper from prior work on speech unit learning is the choice of training objective. Rather than using a reconstruction-based loss, we use a discriminative, multimodal grounding objective which forces the learned units to be useful for semantic image retrieval. We evaluate the sub-word units on the ZeroSpeech 2019 challenge, achieving a 27.3% reduction in ABX error rate over the top-performing submission, while keeping the bitrate approximately the same. We also present experiments demonstrating the noise robustness of these units. Finally, we show that a model with multiple quantizers can simultaneously learn phone-like detectors at a lower layer and word-like detectors at a higher layer. 
We show that these detectors are highly accurate, discovering 279 words with an F1 score of greater than 0.5.", "keywords": "visually-grounded speech;self-supervised learning;discrete representation learning;vision and language;vision and speech;hierarchical representation learning", "primary_area": "", "supplementary_material": "", "author": "David Harwath*;Wei-Ning Hsu*;James Glass", "authorids": "dharwath@csail.mit.edu;wnhsu@mit.edu;glass@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nHarwath*2020Learning,\ntitle={Learning Hierarchical Discrete Linguistic Units from Visually-Grounded Speech},\nauthor={David Harwath* and Wei-Ning Hsu* and James Glass},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1elCp4KwH}\n}", "github": "https://github.com/wnhsu/ResDAVEnet-VQ", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1elCp4KwH", "pdf_size": 0, "rating": "6;8;8", "confidence": "0;0;0", "wc_review": "124;585;331", "wc_reply_reviewers": "0;21;0", "wc_reply_authors": "194;1467;722", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 346.6666666666667, 188.5282177523802 ], "wc_reply_reviewers_avg": [ 7.0, 9.899494936611665 ], "wc_reply_authors_avg": [ 794.3333333333334, 522.2108982223774 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11078660580062138123&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "B1elqkrKPH", "title": "Learning robust visual representations using data augmentation invariance", "track": "main", "status": "Reject", "tldr": "We propose data augmentation invariance: a simple, yet effective and efficient way of learning robust features by adding a layer-wise invariance objective in the loss function.", "abstract": "Deep convolutional neural networks trained for image object categorization have shown remarkable similarities with representations found across the primate ventral visual stream. Yet, artificial and biological networks still exhibit important differences. Here we investigate one such property: increasing invariance to identity-preserving image transformations found along the ventral stream. Despite theoretical evidence that invariance should emerge naturally from the optimization process, we present empirical evidence that the activations of convolutional neural networks trained for object categorization are not robust to identity-preserving image transformations commonly used in data augmentation. As a solution, we propose data augmentation invariance, an unsupervised learning objective which improves the robustness of the learned representations by promoting the similarity between the activations of augmented image samples. 
Our results show that this approach is a simple, yet effective and efficient (10 % increase in training time) way of increasing the invariance of the models while obtaining similar categorization performance.", "keywords": "deep neural networks;visual cortex;invariance;data augmentation", "primary_area": "", "supplementary_material": "", "author": "Alex Hernandez-Garcia;Peter K\u00f6nig;Tim C. Kietzmann", "authorids": "alexhg15@gmail.com;pkoenig@uos.de;t.kietzmann@donders.ru.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhernandez-garcia2020learning,\ntitle={Learning robust visual representations using data augmentation invariance},\nauthor={Alex Hernandez-Garcia and Peter K{\\\"o}nig and Tim C. Kietzmann},\nyear={2020},\nurl={https://openreview.net/forum?id=B1elqkrKPH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=B1elqkrKPH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "254;171;684", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "547;751;614", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 369.6666666666667, 224.83524832394252 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 637.3333333333334, 84.90124982720938 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=46160622919673818&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "B1em8TVtPr", "title": "Discourse-Based Evaluation of Language Understanding", "track": "main", "status": "Reject", "tldr": "Semantics is not all you need", "abstract": "New models for natural language understanding have made unusual progress recently, leading to claims of universal text representations. However, current benchmarks are predominantly targeting semantic phenomena; we make the case that discourse and pragmatics need to take center stage in the evaluation of natural language understanding.\nWe introduce DiscEval, a new benchmark for the evaluation of natural language understanding, that unites 11 discourse-focused evaluation datasets. 
\nDiscEval can be used as supplementary training data in a multi-task learning setup, and is publicly available, alongside the code for gathering and preprocessing the datasets.\nUsing our evaluation suite, we show that natural language inference, a widely used pretraining task, does not result in genuinely universal representations, which opens a new challenge for multi-task learning.", "keywords": "Natural Language Understanding;Pragmatics;Discourse;Semantics;Evaluation;BERT;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Damien Sileo;Tim Van-De-Cruys;Camille Pradel;Philippe Muller", "authorids": "damien.sileo@irit.fr;tim.vandecruys@irit.fr;camille.pradel@synapse-fr.com;philippe.muller@irit.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsileo2020discoursebased,\ntitle={Discourse-Based Evaluation of Language Understanding},\nauthor={Damien Sileo and Tim Van-De-Cruys and Camille Pradel and Philippe Muller},\nyear={2020},\nurl={https://openreview.net/forum?id=B1em8TVtPr}\n}", "github": "https://github.com/disceval/DiscEval", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1em8TVtPr", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "293;661;358", "wc_reply_reviewers": "0;309;0", "wc_reply_authors": "131;399;252", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 437.3333333333333, 160.36694034480908 ], "wc_reply_reviewers_avg": [ 103.0, 145.6639969244288 ], "wc_reply_authors_avg": [ 260.6666666666667, 109.58203421283172 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2599848560701200713&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "B1em9h4KDS", "title": "Generative Imputation and Stochastic Prediction", "track": "main", "status": "Reject", "tldr": "A method to generate imputations and measure uncertainties over target class assignments based on incomplete feature vectors", "abstract": "In many machine learning applications, we are faced with incomplete datasets. In the literature, missing data imputation techniques have been mostly concerned with filling missing values. However, the existence of missing values is synonymous with uncertainties not only over the distribution of missing values but also over target class assignments that require careful consideration. In this paper, we propose a simple and effective method for imputing missing features and estimating the distribution of target assignments given incomplete data. In order to make imputations, we train a simple and effective generator network to generate imputations that a discriminator network is tasked to distinguish. Following this, a predictor network is trained using the imputed samples from the generator network to capture the classification uncertainties and make predictions accordingly. The proposed method is evaluated on CIFAR-10 image dataset as well as three real-world tabular classification datasets, under different missingness rates and structures. 
Our experimental results show the effectiveness of the proposed method in generating imputations as well as providing estimates for the class uncertainties in a classification task when faced with missing values.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohammad Kachuee;Kimmo K\u00e4rkk\u00e4inen;Orpaz Goldstein;Sajad Darabi;Majid Sarrafzadeh", "authorids": "mkachuee@ucla.edu;kimmo@cs.ucla.edu;orpgol@cs.ucla.edu;sajad.darabi@cs.ucla.edu;majid@cs.ucla.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkachuee2020generative,\ntitle={Generative Imputation and Stochastic Prediction},\nauthor={Mohammad Kachuee and Kimmo K{\\\"a}rkk{\\\"a}inen and Orpaz Goldstein and Sajad Darabi and Majid Sarrafzadeh},\nyear={2020},\nurl={https://openreview.net/forum?id=B1em9h4KDS}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1em9h4KDS", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "350;535;431", "wc_reply_reviewers": "24;0;0", "wc_reply_authors": "1066;1036;807", "reply_reviewers": "1;0;0", "reply_authors": "3;2;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 438.6666666666667, 75.72024534796196 ], "wc_reply_reviewers_avg": [ 8.0, 11.313708498984761 ], "wc_reply_authors_avg": [ 969.6666666666666, 115.67291049429949 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7755081327323779852&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "B1eoyAVFwH", "title": "Feature Partitioning for Efficient Multi-Task Architectures", "track": "main", "status": "Reject", "tldr": "automatic search for multi-task architectures that reduce per-task feature use", "abstract": "Multi-task learning promises to use less data, parameters, and time than training separate single-task models. But realizing these benefits in practice is challenging. In particular, it is difficult to define a suitable architecture that has enough capacity to support many tasks while not requiring excessive compute for each individual task. There are difficult trade-offs when deciding how to allocate parameters and layers across a large set of tasks. To address this, we propose a method for automatically searching over multi-task architectures that accounts for resource constraints. We define a parameterization of feature sharing strategies for effective coverage and sampling of architectures. We also present a method for quick evaluation of such architectures with feature distillation. Together these contributions allow us to quickly optimize for parameter-efficient multi-task models. 
We benchmark on Visual Decathlon, demonstrating that we can automatically search for and identify architectures that effectively make trade-offs between task resource requirements while maintaining a high level of final performance.", "keywords": "multi-task learning;neural architecture search;multi-task architecture search", "primary_area": "", "supplementary_material": "", "author": "Alejandro Newell;Lu Jiang;Chong Wang;Li-Jia Li;Jia Deng", "authorids": "anewell@cs.princeton.edu;lujiang@google.com;chong.wang@bytedance.com;lijiali@cs.stanford.edu;jiadeng@princeton.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nnewell2020feature,\ntitle={Feature Partitioning for Efficient Multi-Task Architectures},\nauthor={Alejandro Newell and Lu Jiang and Chong Wang and Li-Jia Li and Jia Deng},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eoyAVFwH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1eoyAVFwH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "213;555;491", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "213;445;389", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 419.6666666666667, 148.4527609114166 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 349.0, 98.84668262853674 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2965526537438711372&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "B1erJJrYPH", "title": "Optimizing Loss Landscape Connectivity via Neuron Alignment", "track": "main", "status": "Reject", "tldr": "We investigate the effect of weight symmetry on the loss landscape of deep networks. ", "abstract": "The loss landscapes of deep neural networks are poorly understood due to their high nonconvexity. Empirically, the local optima of these loss functions can be connected by a simple curve in model space, along which the loss remains fairly constant. Yet, current path finding algorithms do not consider the influence of symmetry in the loss surface caused by weight permutations of the networks corresponding to the minima. We propose a framework to investigate the effect of symmetry on the landscape connectivity by directly optimizing the weight permutations of the networks being connected. Through utilizing an existing neuron alignment technique, we derive an initialization for the weight permutations. Empirically, this initialization is critical for efficiently learning a simple, planar, low-loss curve between networks that successfully generalizes. Additionally, we introduce a proximal alternating minimization scheme to address if an optimal permutation can be learned, with some provable convergence guarantees. We find that the learned parameterized curve is still a low-loss curve after permuting the weights of the endpoint models, for a subset of permutations. 
We also show that there is a small but steady gain in the performance of the ensembles constructed from the learned curve, when considering weight space symmetry.", "keywords": "deep learning;optimization;non-convex optimization", "primary_area": "", "supplementary_material": "", "author": "N. Joseph Tatro;Pin-Yu Chen;Payel Das;Igor Melnyk;Prasanna Sattigeri;Rongjie Lai", "authorids": "tatron@rpi.edu;pin-yu.chen@ibm.com;daspa@us.ibm.com;igor.melnyk@ibm.com;psattig@us.ibm.com;lair@rpi.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ntatro2020optimizing,\ntitle={Optimizing Loss Landscape Connectivity via Neuron Alignment},\nauthor={N. Joseph Tatro and Pin-Yu Chen and Payel Das and Igor Melnyk and Prasanna Sattigeri and Rongjie Lai},\nyear={2020},\nurl={https://openreview.net/forum?id=B1erJJrYPH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1erJJrYPH", "pdf_size": 0, "rating": "1;3;6", "confidence": "0;0;0", "wc_review": "476;330;616", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "201;124;149", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 474.0, 116.76757540801584 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 158.0, 32.072833779799794 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IMIU7z-mBxEJ:scholar.google.com/&scioq=Optimizing+Loss+Landscape+Connectivity+via+Neuron+Alignment&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1esx6EYvr", "title": "A critical analysis of self-supervision, or what we can learn from a single image", "track": "main", "status": "Poster", "tldr": "We evaluate self-supervised feature learning methods and find that with sufficient data augmentation early layers can be learned using just one image. This is informative about self-supervision and the role of augmentations.", "abstract": "We look critically at popular self-supervision techniques for learning deep convolutional neural networks without manual labels. We show that three different and representative methods, BiGAN, RotNet and DeepCluster, can learn the first few layers of a convolutional network from a single image as well as using millions of images and manual labels, provided that strong data augmentation is used. 
However, for deeper layers the gap with manual supervision cannot be closed even if millions of unlabelled images are used for training.\nWe conclude that:\n(1) the weights of the early layers of deep networks contain limited information about the statistics of natural images, that\n(2) such low-level statistics can be learned through self-supervision just as well as through strong supervision, and that\n(3) the low-level statistics can be captured via synthetic transformations instead of using a large image dataset.", "keywords": "self-supervision;feature representation learning;CNN", "primary_area": "", "supplementary_material": "", "author": "Asano YM.;Rupprecht C.;Vedaldi A.", "authorids": "yuki@robots.ox.ac.uk;chrisr@robots.ox.ac.uk;vedaldi@robots.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nYM.2020A,\ntitle={A critical analysis of self-supervision, or what we can learn from a single image},\nauthor={Asano YM. and Rupprecht C. and Vedaldi A.},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1esx6EYvr}\n}", "github": "[![github](/images/github_icon.svg) yukimasano/linear-probes](https://github.com/yukimasano/linear-probes) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1esx6EYvr)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1esx6EYvr", "pdf_size": 0, "rating": "1;6;6", "confidence": "0;0;0", "wc_review": "325;525;536", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "416;537;740", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 2.357022603955158 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 462.0, 96.9776606578374 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 564.3333333333334, 133.6770569527754 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 171, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1196793253523325509&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "B1esygHFwS", "title": "Detecting Change in Seasonal Pattern via Autoencoder and Temporal Regularization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Change-point detection problem consists of discovering abrupt property changes in the generation process of time-series. Most state-of-the-art models are optimizing the power of a kernel two-sample test, with only a few assumptions on the distribution of the data. Unfortunately, because they presume the samples are distributed i.i.d, they are not able to use information about the seasonality of a time-series. In this paper, we present a novel approach - ATR-CSPD allowing the detection of changes in the seasonal pattern of a time-series. Our method uses an autoencoder together with a temporal regularization, to learn the pattern of each seasonal cycle. Using low dimensional representation of the seasonal patterns, it is possible to accurately and efficiently estimate the existence of a change point using a clustering algorithm. 
Through experiments on artificial and real-world data sets, we demonstrate the usefulness of the proposed method for several applications.", "keywords": "Autoencoder;Change Point Detection;Timeseries", "primary_area": "", "supplementary_material": "", "author": "Raphael Fettaya;Dor Bank;Rachel Lemberg;Linoy Barel", "authorids": "raphaelfettaya@gmail.com;doban@microsoft.com;rlemberg@microsoft.com;t-libare@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nfettaya2020detecting,\ntitle={Detecting Change in Seasonal Pattern via Autoencoder and Temporal Regularization},\nauthor={Raphael Fettaya and Dor Bank and Rachel Lemberg and Linoy Barel},\nyear={2020},\nurl={https://openreview.net/forum?id=B1esygHFwS}\n}", "github": "https://anonymous.4open.science/r/3655aebd-63f0-4dd1-a5f8-be9dbb5ed060/", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=B1esygHFwS", "pdf_size": 0, "rating": "1;1;3;3", "confidence": "0;0;0;0", "wc_review": "1073;122;358;542", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 2.0, 1.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 523.75, 350.31583963617743 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:d_VdqwHutFYJ:scholar.google.com/&scioq=Detecting+Change+in+Seasonal+Pattern+via+Autoencoder+and+Temporal+Regularization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1evfa4tPB", "title": "Neural Network Branching for Neural Network Verification", "track": "main", "status": "Talk", "tldr": "We propose a novel learning to branch framework using graph neural networks to improve branch and bound based neural network verification methods. ", "abstract": "Formal verification of neural networks is essential for their deployment in safety-critical areas. Many available formal verification methods have been shown to be instances of a unified Branch and Bound (BaB) formulation. We propose a novel framework for designing an effective branching strategy for BaB. Specifically, we learn a graph neural network (GNN) to imitate the strong branching heuristic behaviour. Our framework differs from previous methods for learning to branch in two main aspects. Firstly, our framework directly treats the neural network we want to verify as a graph input for the GNN. Secondly, we develop an intuitive forward and backward embedding update schedule. Empirically, our framework achieves roughly $50\\%$ reduction in both the number of branches and the time required for verification on various convolutional networks when compared to the best available hand-designed branching strategy. In addition, we show that our GNN model enjoys both horizontal and vertical transferability. Horizontally, the model trained on easy properties performs well on properties of increased difficulty levels. 
Vertically, the model trained on small neural networks achieves similar performance on large neural networks.", "keywords": "Neural Network Verification;Branch and Bound;Graph Neural Network;Learning to branch", "primary_area": "", "supplementary_material": "", "author": "Jingyue Lu;M. Pawan Kumar", "authorids": "jingyue.lu@spc.ox.ac.uk;pawan@robots.ox.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nLu2020Neural,\ntitle={Neural Network Branching for Neural Network Verification },\nauthor={Jingyue Lu and M. Pawan Kumar},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1evfa4tPB}\n}", "github": "[![github](/images/github_icon.svg) oval-group/GNN_branching](https://github.com/oval-group/GNN_branching)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=B1evfa4tPB", "pdf_size": 0, "rating": "6;8;8", "confidence": "0;0;0", "wc_review": "889;454;920", "wc_reply_reviewers": "0;0;69", "wc_reply_authors": "751;602;1239", "reply_reviewers": "0;0;1", "reply_authors": "1;1;3", "rating_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 754.3333333333334, 212.74449986362097 ], "wc_reply_reviewers_avg": [ 23.0, 32.526911934581186 ], "wc_reply_authors_avg": [ 864.0, 272.05269097486735 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3408814607972511538&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "B1eyA3VFwS", "title": "Enforcing Physical Constraints in Neural Neural Networks through Differentiable PDE Layer", "track": "main", "status": "Reject", "tldr": "A novel way of enforcing hard linear constraints within a convolutional neural network using a differentiable PDE layer.", "abstract": "Recent studies at the intersection of physics and deep learning have illustrated successes in the application of deep neural networks to partially or fully replace costly physics simulations. Enforcing physical constraints to solutions generated\nby neural networks remains a challenge, yet it is essential to the accuracy and trustworthiness of such model predictions. Many systems in the physical sciences are governed by Partial Differential Equations (PDEs). Enforcing these as hard\nconstraints, we show, are inefficient in conventional frameworks due to the high dimensionality of the generated fields. To this end, we propose the use of a novel differentiable spectral projection layer for neural networks that efficiently enforces\nspatial PDE constraints using spectral methods, yet is fully differentiable, allowing for its use as a layer in neural networks that supports end-to-end training. We show that its computational cost is cheaper than a regular convolution layer. We apply it to\nan important class of physical systems \u2013 incompressible turbulent flows, where the divergence-free PDE constraint is required. We train a 3D Conditional Generative Adversarial Network (CGAN) for turbulent flow super-resolution efficiently, whilst\nguaranteeing the spatial PDE constraint of zero divergence. 
Furthermore, our empirical results show that the model produces realistic flow fields with more accurate flow statistics when trained with hard constraints imposed via the proposed\nnovel differentiable spectral projection layer, as compared to soft constrained and unconstrained counterparts.", "keywords": "PDE;Hard Constraints;Turbulence;Super-Resolution;Spectral Methods", "primary_area": "", "supplementary_material": "", "author": "Chiyu \"Max\" Jiang;Karthik Kashinath;Prabhat;Philip Marcus", "authorids": "chiyu.jiang@berkeley.edu;kkashinath@lbl.gov;prabhat@lbl.gov;pmarcus@me.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\njiang2020enforcing,\ntitle={Enforcing Physical Constraints in Neural Neural Networks through Differentiable {\\{}PDE{\\}} Layer},\nauthor={Chiyu ''Max'' Jiang and Karthik Kashinath and Prabhat and Philip Marcus},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eyA3VFwS}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eyA3VFwS", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "414;212;715", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 447.0, 206.670430073261 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9673223462348548782&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "B1eyO1BFPr", "title": "Don't Use Large Mini-batches, Use Local SGD", "track": "main", "status": "Poster", "tldr": "", "abstract": "Mini-batch stochastic gradient methods (SGD) are state of the art for distributed training of deep neural networks. \nDrastic increases in the mini-batch sizes have lead to key efficiency and scalability gains in recent years. \nHowever, progress faces a major roadblock, as models trained with large batches often do not generalize well, i.e. they do not show good accuracy on new data.\nAs a remedy, we propose a \\emph{post-local} SGD and show that it significantly improves the generalization performance compared to large-batch training on standard benchmarks while enjoying the same efficiency (time-to-accuracy) and scalability. We further provide an extensive study of the communication efficiency vs. performance trade-offs associated with a host of \\emph{local SGD} variants. \n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tao Lin;Sebastian U. Stich;Kumar Kshitij Patel;Martin Jaggi", "authorids": "tao.lin@epfl.ch;sebastian.stich@epfl.ch;kumarkshitijpatel@gmail.com;martin.jaggi@epfl.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nLin2020Don't,\ntitle={Don't Use Large Mini-batches, Use Local SGD},\nauthor={Tao Lin and Sebastian U. 
Stich and Kumar Kshitij Patel and Martin Jaggi},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1eyO1BFPr}\n}", "github": "[![github](/images/github_icon.svg) epfml/LocalSGD-Code](https://github.com/epfml/LocalSGD-Code) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1eyO1BFPr)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1eyO1BFPr", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "284;352;397", "wc_reply_reviewers": "29;0;0", "wc_reply_authors": "399;310;511", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 344.3333333333333, 46.449494674921446 ], "wc_reply_reviewers_avg": [ 9.666666666666666, 13.67073110293992 ], "wc_reply_authors_avg": [ 406.6666666666667, 82.23678549705653 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 516, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3406394348267726989&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "B1eySTVtvB", "title": "Combiner: Inductively Learning Tree Structured Attention in Transformers", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transformers employ dense attention mechanisms over text which can fail to capture or utilize the strong intrinsic structures present in natural language. This paper presents the Combiner model, a new Transformer architecture that learns tree-structured attention patterns inductively from language. Instead of dense or pre-specified structures, Combiner automatically learns tree-structured attention connections using a novel sparse residual attention mechanism. It first employs a sparsity-inducing gate that learns to prune attention connections in each network layer, so as to determine the nodes to be combined. Then the learned connections are propagated through layers using hierarchical attention blocks, which combine the sub-tree nodes in a bottom-up manner. Our experiments demonstrate the robust modeling performance of Combiner and usefulness of structures it learns in various information retrieval and unsupervised sentence parsing tasks. By leveraging search session structures, Combiner outperforms other pre-trained Transformers in generative query suggestion. 
Moreover, the learned tree structures align well with linguistic structures and improve the current state-of-the-art unsupervised constituency parsing by 8 average sentence-level F1.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiawei Wu;Chenyan Xiong;Tobias Schnabel;Yizhe Zhang;William Yang Wang;Paul Bennett", "authorids": "jiawei_wu@cs.ucsb.edu;chenyan.xiong@microsoft.com;tobias.schnabel@microsoft.com;yizhe.zhang@microsoft.com;william@cs.ucsb.edu;paul.n.bennett@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=B1eySTVtvB", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4076295307996107422&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1g5qyHYPS", "title": "Pruning Depthwise Separable Convolutions for Extra Efficiency Gain of Lightweight Models", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep convolutional neural networks are good at accuracy while bad at efficiency. To improve the inference speed, two kinds of directions are developed, lightweight model designing and network weight pruning. Lightweight models have been proposed to improve the speed with good enough accuracy. It is, however, not trivial if we can further speed up these \u201ccompact\u201d models by weight pruning. In this paper, we present a technique to gradually prune the depthwise separable convolution networks, such as MobileNet, for improving the speed of this kind of \u201cdense\u201d network. When pruning depthwise separable convolutions, we need to consider more structural constraints to ensure the speedup of inference. Instead of pruning the model with the desired ratio in one stage, the proposed multi-stage gradual pruning approach can stably prune the filters with a finer pruning ratio. 
Our method achieves 1.68 times speedup with neglectable accuracy drop for MobileNetV2.", "keywords": "Deep Learning;Network Pruning;Lightweight CNN", "primary_area": "", "supplementary_material": "", "author": "Cheng-Hao Tu;Jia-Hong Lee;Yi-Ming Chan;Chu-Song Chen", "authorids": "andytu28@iis.sinica.edu.tw;honghenry.lee@iis.sinica.edu.tw;yiming@iis.sinica.edu.tw;song@iis.sinica.edu.tw", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1g5qyHYPS", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "137;289;534", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 320.0, 163.55019616823046 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8600895004223954891&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1g5sA4twr", "title": "Deep Double Descent: Where Bigger Models and More Data Hurt", "track": "main", "status": "Poster", "tldr": "We demonstrate, and characterize, realistic settings where bigger models are worse, and more data hurts.", "abstract": "We show that a variety of modern deep learning tasks exhibit a \"double-descent\" phenomenon where, as we increase model size, performance first gets worse and then gets better. Moreover, we show that double descent occurs not just as a function of model size, but also as a function of the number of training epochs. We unify the above phenomena by defining a new complexity measure we call the effective model complexity, and conjecture a generalized double descent with respect to this measure. 
Furthermore, our notion of model complexity allows us to identify certain regimes where increasing (even quadrupling) the number of train samples actually hurts test performance.", "keywords": "deep learning;double descent;optimization;SGD;complexity", "primary_area": "", "supplementary_material": "", "author": "Preetum Nakkiran;Gal Kaplun;Yamini Bansal;Tristan Yang;Boaz Barak;Ilya Sutskever", "authorids": "preetum@cs.harvard.edu;galkaplun@g.harvard.edu;ybansal@g.harvard.edu;tristanyang@college.harvard.edu;b@boazbarak.org;ilyasu@openai.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nNakkiran2020Deep,\ntitle={Deep Double Descent: Where Bigger Models and More Data Hurt},\nauthor={Preetum Nakkiran and Gal Kaplun and Yamini Bansal and Tristan Yang and Boaz Barak and Ilya Sutskever},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1g5sA4twr}\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=B1g5sA4twr)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1g5sA4twr", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;0;0", "wc_review": "370;710;96", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "148;287;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 392.0, 251.14670347561136 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 145.0, 117.1864611065061 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 1228, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9967079231665217897&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "B1g79grKPr", "title": "Goal-Conditioned Video Prediction", "track": "main", "status": "Reject", "tldr": "We propose a new class of visual generative models: goal-conditioned predictors. We show experimentally that conditioning on the goal allows to reduce uncertainty and produce predictions over much longer horizons.", "abstract": "Many processes can be concisely represented as a sequence of events leading from a starting state to an end state. Given raw ingredients, and a finished cake, an experienced chef can surmise the recipe. Building upon this intuition, we propose a new class of visual generative models: goal-conditioned predictors (GCP). Prior work on video generation largely focuses on prediction models that only observe frames from the beginning of the video. GCP instead treats videos as start-goal transformations, making video generation easier by conditioning on the more informative context provided by the first and final frames. Not only do existing forward prediction approaches synthesize better and longer videos when modified to become goal-conditioned, but GCP models can also utilize structures that are not linear in time, to accomplish hierarchical prediction. 
To this end, we study both auto-regressive GCP models and novel tree-structured GCP models that generate frames recursively, splitting the video iteratively into finer and finer segments delineated by subgoals. In experiments across simulated and real datasets, our GCP methods generate high-quality sequences over long horizons. Tree-structured GCPs are also substantially easier to parallelize than auto-regressive GCPs, making training and inference very efficient, and allowing the model to train on sequences that are thousands of frames in length.Finally, we demonstrate the utility of GCP approaches for imitation learning in the setting without access to expert actions. Videos are on the supplementary website: https://sites.google.com/view/video-gcp", "keywords": "predictive models;video prediction;latent variable models", "primary_area": "", "supplementary_material": "", "author": "Oleh Rybkin;Karl Pertsch;Frederik Ebert;Dinesh Jayaraman;Chelsea Finn;Sergey Levine", "authorids": "oleh@seas.upenn.edu;pertsch@usc.edu;febert@berkeley.edu;dineshjayaraman@berkeley.edu;cbfinn@cs.stanford.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nrybkin2020goalconditioned,\ntitle={Goal-Conditioned Video Prediction},\nauthor={Oleh Rybkin and Karl Pertsch and Frederik Ebert and Dinesh Jayaraman and Chelsea Finn and Sergey Levine},\nyear={2020},\nurl={https://openreview.net/forum?id=B1g79grKPr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1g79grKPr", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "375;305;306", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "584;314;236", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 328.6666666666667, 32.76515764582181 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 378.0, 149.10399055692642 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jQHh3mWxkmAJ:scholar.google.com/&scioq=Goal-Conditioned+Video+Prediction&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1g8VkHFPH", "title": "Rethinking the Hyperparameters for Fine-tuning", "track": "main", "status": "Poster", "tldr": "This paper re-examines several common practices of setting hyper-parameters for fine-tuning and identify optimal hyperparameter depends on source-target domain similarity.", "abstract": "Fine-tuning from pre-trained ImageNet models has become the de-facto standard for various computer vision tasks. Current practices for fine-tuning typically involve selecting an ad-hoc choice of hyperparameters and keeping them fixed to values normally used for training from scratch. This paper re-examines several common practices of setting hyperparameters for fine-tuning. Our findings are based on extensive empirical evaluation for fine-tuning on various transfer learning benchmarks. (1) While prior works have thoroughly investigated learning rate and batch size, momentum for fine-tuning is a relatively unexplored parameter. 
We find that the value of momentum also affects fine-tuning performance and connect it with previous theoretical findings. (2) Optimal hyperparameters for fine-tuning, in particular, the effective learning rate, are not only dataset dependent but also sensitive to the similarity between the source domain and target domain. This is in contrast to hyperparameters for training from scratch. (3) Reference-based regularization that keeps models close to the initial model does not necessarily apply for \"dissimilar\" datasets. Our findings challenge common practices of fine-tuning and encourages deep learning practitioners to rethink the hyperparameters for fine-tuning.", "keywords": "fine-tuning;hyperparameter search;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Hao Li;Pratik Chaudhari;Hao Yang;Michael Lam;Avinash Ravichandran;Rahul Bhotika;Stefano Soatto", "authorids": "hao.li.ict@gmail.com;pratikac@seas.upenn.edu;lancelot365@gmail.com;michlam@amazon.com;avinash.a.ravichandran@gmail.com;bhotikar@amazon.com;soatto@ucla.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nLi2020Rethinking,\ntitle={Rethinking the Hyperparameters for Fine-tuning},\nauthor={Hao Li and Pratik Chaudhari and Hao Yang and Michael Lam and Avinash Ravichandran and Rahul Bhotika and Stefano Soatto},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1g8VkHFPH}\n}", "github": "[![github](/images/github_icon.svg) richardaecn/cvpr18-inaturalist-transfer](https://github.com/richardaecn/cvpr18-inaturalist-transfer)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1g8VkHFPH", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "414;215;456", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "510;265;1494", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 361.6666666666667, 105.1168661802451 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 756.3333333333334, 531.1122501150037 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 184, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14029720773108023404&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B1gF56VYPH", "title": "Deep 3D Pan via local adaptive \"t-shaped\" convolutions with global and local adaptive dilations", "track": "main", "status": "Poster", "tldr": "Novel architecture for stereoscopic view synthesis at arbitrary camera shifts utilizing adaptive t-shaped kernels with adaptive dilations.", "abstract": "Recent advances in deep learning have shown promising results in many low-level vision tasks. However, solving the single-image-based view synthesis is still an open problem. In particular, the generation of new images at parallel camera views given a single input image is of great interest, as it enables 3D visualization of the 2D input scenery. 
We propose a novel network architecture to perform stereoscopic view synthesis at arbitrary camera positions along the X-axis, or \u201cDeep 3D Pan\u201d, with \u201ct-shaped\u201d adaptive kernels equipped with globally and locally adaptive dilations. Our proposed network architecture, the monster-net, is devised with a novel t-shaped adaptive kernel with globally and locally adaptive dilation, which can efficiently incorporate global camera shift into and handle local 3D geometries of the target image\u2019s pixels for the synthesis of naturally looking 3D panned views when a 2-D input image is given. Extensive experiments were performed on the KITTI, CityScapes, and our VICLAB_STEREO indoors dataset to prove the efficacy of our method. Our monster-net significantly outperforms the state-of-the-art method (SOTA) by a large margin in all metrics of RMSE, PSNR, and SSIM. Our proposed monster-net is capable of reconstructing more reliable image structures in synthesized images with coherent geometry. Moreover, the disparity information that can be extracted from the \u201ct-shaped\u201d kernel is much more reliable than that of the SOTA for the unsupervised monocular depth estimation task, confirming the effectiveness of our method.", "keywords": "Deep learning;Stereoscopic view synthesis;Monocular depth;Deep 3D Pan", "primary_area": "", "supplementary_material": "", "author": "Juan Luis Gonzalez Bello;Munchurl Kim", "authorids": "juanluisgb@kaist.ac.kr;mkimee@kaist.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nBello2020Deep,\ntitle={Deep 3D Pan via local adaptive \"t-shaped\" convolutions with global and local adaptive dilations},\nauthor={Juan Luis Gonzalez Bello and Munchurl Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gF56VYPH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gF56VYPH", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "367;239;295", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "275;0;206", "reply_reviewers": "0;0;0", "reply_authors": "1;0;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 300.3333333333333, 52.39168721170267 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 160.33333333333334, 116.81989937030801 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11744291359883831287&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "B1gHokBKwS", "title": "Learning to Guide Random Search", "track": "main", "status": "Poster", "tldr": "We improve the sample-efficiency of the random search for functions defined on low-dimensional manifolds. Our method jointly learns the underlying manifold and optimizes the function.", "abstract": "We are interested in derivative-free optimization of high-dimensional functions. The sample complexity of existing methods is high and depends on problem dimensionality, unlike the dimensionality-independent rates of first-order methods. 
The recent success of deep learning suggests that many datasets lie on low-dimensional manifolds that can be represented by deep nonlinear models. We therefore consider derivative-free optimization of a high-dimensional function that lies on a latent low-dimensional manifold. We develop an online learning approach that learns this manifold while performing the optimization. In other words, we jointly learn the manifold and optimize the function. Our analysis suggests that the presented method significantly reduces sample complexity. We empirically evaluate the method on continuous optimization benchmarks and high-dimensional continuous control problems. Our method achieves significantly lower sample complexity than Augmented Random Search, Bayesian optimization, covariance matrix adaptation (CMA-ES), and other derivative-free optimization algorithms.", "keywords": "Random search;Derivative-free optimization;Learning continuous control", "primary_area": "", "supplementary_material": "", "author": "Ozan Sener;Vladlen Koltun", "authorids": "ozansener@gmail.com;vkoltun@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nSener2020Learning,\ntitle={Learning to Guide Random Search},\nauthor={Ozan Sener and Vladlen Koltun},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gHokBKwS}\n}", "github": "https://github.com/intel-isl/LMRS", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=B1gHokBKwS", "pdf_size": 0, "rating": "6;6;6;8", "confidence": "0;0;0;0", "wc_review": "854;588;311;533", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "802;255;200;653", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "rating_avg": [ 6.5, 0.8660254037844386 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 571.5, 193.2750630578089 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 477.5, 256.2289015704513 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10046802470639742746&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B1gKVeBtDH", "title": "Faster and Just As Accurate: A Simple Decomposition for Transformer Models", "track": "main", "status": "Withdraw", "tldr": "Inference in large Transformers is expensive due to the self-attention in multiple layers. We show a simple decomposition technique can yield a faster, low memory-footprint model that is just as accurate of the original models.", "abstract": "Large pre-trained Transformers such as BERT have been tremendously effective for many NLP tasks. However, inference in these large-capacity models is prohibitively slow and expensive. Transformers are essentially a stack of self-attention layers which encode each input position using the entire input sequence as its context. However, we find that it may not be necessary to apply this expensive sequence-wide self-attention over at all layers. Based on this observation, we propose a decomposition to a pre-trained Transformer that allows the lower layers to process segments of the input independently enabling parallelism and caching. 
We show that the information loss due to this decomposition can be recovered in the upper layers with auxiliary supervision during fine-tuning. We evaluate de-composition with pre-trained BERT models on five different paired-input tasks in question answering, sentence similarity, and natural language inference. Results show that decomposition enables faster inference (up to 4x), significant memory reduction (up to 70%) while retaining most (up to 99%) of the original performance. We will release the code at.", "keywords": "Faster Inference;Transformers;Pre-trained Representations", "primary_area": "", "supplementary_material": "", "author": "Qingqing Cao;Harsh Trivedi;Aruna Balasubramanian;Niranjan Balasubramanian", "authorids": "qicao@cs.stonybrook.edu;hjtrivedi@cs.stonybrook.edu;arunab@cs.stonybrook.edu;niranjan@cs.stonybrook.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gKVeBtDH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "186;319;263", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "233;702;484", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 256.0, 54.5221667458903 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 473.0, 191.62637257608012 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7004729919793776023&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1gL904FwH", "title": "SIMULTANEOUS ATTRIBUTED NETWORK EMBEDDING AND CLUSTERING", "track": "main", "status": "Withdraw", "tldr": "This paper propose a novel matrix decomposition framework for simultaneous attributed network data embedding and clustering.", "abstract": "To deal simultaneously with both, the attributed network embedding and clustering, we propose a new model. It exploits both content and structure information, capitalising on their simultaneous use. The proposed model relies on the approximation of the relaxed continuous embedding solution by the true discrete clustering one. Thereby, we show that incorporating an embedding representation provides simpler and more interpretable solutions. 
Experimental results demonstrate that the proposed algorithm performs better, in terms of clustering and embedding, than the state-of-the-art algorithms, including deep learning methods devoted to similar tasks for attributed network datasets with different properties.", "keywords": "Attributed network;Embedding;clustering;matrix decomposition;spectral rotation", "primary_area": "", "supplementary_material": "", "author": "Lazhar labiod;Mohamed Nadif", "authorids": "lazhar.labiod@parisdescartes.fr;mohamed.nadif@parisdescartes.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gL904FwH", "pdf_size": 0, "rating": "1;1;1", "confidence": "0;0;0", "wc_review": "392;172;169", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 244.33333333333334, 104.42328390843788 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_VB8Ncal3jUJ:scholar.google.com/&scioq=SIMULTANEOUS+ATTRIBUTED+NETWORK+EMBEDDING+AND+CLUSTERING&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1gNKxrYPB", "title": "Attributed Graph Learning with 2-D Graph Convolution", "track": "main", "status": "Reject", "tldr": "We propose a novel 2-D graph convolution framework to jointly model node relations and attribute relations for attributed graph learning.", "abstract": "Graph convolutional neural networks have demonstrated promising performance in attributed graph learning, thanks to the use of graph convolution that effectively combines graph structures and node features for learning node representations. However, one intrinsic limitation of the commonly adopted 1-D graph convolution is that it only exploits graph connectivity for feature smoothing, which may lead to inferior performance on sparse and noisy real-world attributed networks. To address this problem, we propose to explore relational information among node attributes to complement node relations for representation learning. In particular, we propose to use 2-D graph convolution to jointly model the two kinds of relations and develop a computationally efficient dimensionwise separable 2-D graph convolution (DSGC). Theoretically, we show that DSGC can reduce intra-class variance of node features on both the node dimension and the attribute dimension to facilitate learning. Empirically, we demonstrate that by incorporating attribute relations, DSGC achieves significant performance gain over state-of-the-art methods on node classification and clustering on several real-world attributed networks. 
\n", "keywords": "2-D Graph Convolution;Attributed Graph;Representation learning", "primary_area": "", "supplementary_material": "", "author": "Qimai Li;Xiaotong Zhang;Han Liu;Xiao-Ming Wu", "authorids": "csqmli@comp.polyu.edu.hk;zxt.dut@hotmail.com;liu.han.dut@gmail.com;xiao-ming.wu@polyu.edu.hk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2020attributed,\ntitle={Attributed Graph Learning with 2-D Graph Convolution},\nauthor={Qimai Li and Xiaotong Zhang and Han Liu and Xiao-Ming Wu},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gNKxrYPB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gNKxrYPB", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "528;108;305", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "515;34;273", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 313.6666666666667, 171.57376126772868 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 274.0, 196.368700832558 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4876580570566637484&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1gNfkrYvS", "title": "Capsule Networks without Routing Procedures", "track": "main", "status": "Withdraw", "tldr": "Routing procedures are not necessary for CapsNets", "abstract": "We propose Pure CapsNets (P-CapsNets) without routing procedures. Specifically, we make three modifications to CapsNets. First, we remove routing procedures from CapsNets based on the observation that the coupling coefficients can be learned implicitly. Second, we replace the convolutional layers in CapsNets to improve efficiency. Third, we package the capsules into rank-3 tensors to further improve efficiency. The experiment shows that P-CapsNets achieve better performance than CapsNets with varied routine procedures by using significantly fewer parameters on MNIST&CIFAR10. The high efficiency of P-CapsNets is even comparable to some deep compressing models. For example, we achieve more than 99% percent accuracy on MNIST by using only 3888 parameters. We visualize the capsules as well as the corresponding correlation matrix to show a possible way of initializing CapsNets in the future. We also explore the adversarial robustness of P-CapsNets compared to CNNs. 
", "keywords": "CapsNets;routing procedures", "primary_area": "", "supplementary_material": "", "author": "Zhenhua Chen;Xiwen Li;Chuhua Wang;David Crandall", "authorids": "chen478@iu.edu;xiwenli@wustl.edu;cw234@iu.edu;djcran@indiana.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "https://github.com/chenzhenhua986/CAFFE-CapsNet", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gNfkrYvS", "pdf_size": 0, "rating": "1;3;3", "confidence": "0;0;0", "wc_review": "299;308;271", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "228;238;100", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 292.6666666666667, 15.755069730795297 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 188.66666666666666, 62.829575484444874 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "B1gOe6NKPB", "title": "MDE: Multiple Distance Embeddings for Link Prediction in Knowledge Graphs", "track": "main", "status": "Withdraw", "tldr": "A novel method of modelling Knowledge Graphs based on Distance Embeddings and Neural Networks", "abstract": "Over the past decade, knowledge graphs became popular for capturing structured domain knowledge. \nRelational learning models enable the prediction of missing links inside knowledge graphs. More specifically, latent distance approaches model the relationships among entities via a distance between latent representations.\nTranslating embedding models (e.g., TransE) are among the most popular latent distance approaches which use one distance function to learn multiple relation patterns. \nHowever, they are mostly inefficient in capturing symmetric relations since the representation vector norm for all the symmetric relations becomes equal to zero. They also lose information when learning relations with reflexive patterns since they become symmetric and transitive.\nWe propose the Multiple Distance Embedding model (MDE) that addresses these limitations and a framework which enables collaborative combinations of latent distance-based terms (MDE).\nOur solution is based on two principles: 1) using limit-based loss instead of margin ranking loss and 2) by learning independent embedding vectors for each of terms we can collectively train and predict using contradicting distance terms.\nWe further demonstrate that MDE allows modeling relations with (anti)symmetry, inversion, and composition patterns. 
We propose MDE as a neural network model which allows us to map non-linear relations between the embedding vectors and the expected output of the score function.\nOur empirical results show that MDE outperforms the state-of-the-art embedding models on several benchmark datasets.", "keywords": "Representation Learning;Knowledge Graph embedding;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Afshin Sadeghi;Damien Graux;Hamed Shariat Yazdi;Jens Lehmann", "authorids": "sadeghi@cs.uni-bonn.de;dam.graux@gmail.com;shariat@cs.uni-bonn.de;jens.lehmann@cs.uni-bonn.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "https://drive.google.com/open?id=1eE5KvWtg6IJDlBKW-D7vR7lURCQNLich", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=B1gOe6NKPB", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9398581189907500448&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "B1gR3ANFPS", "title": "Non-linear System Identification from Partial Observations via Iterative Smoothing and Learning", "track": "main", "status": "Reject", "tldr": "This work presents a scalable algorithm for non-linear offline system identification from partial observations.", "abstract": "System identification is the process of building a mathematical model of an unknown system from measurements of its inputs and outputs. It is a key step for model-based control, estimator design, and output prediction. This work presents an algorithm for non-linear offline system identification from partial observations, i.e. situations in which the system's full-state is not directly observable. The algorithm presented, called SISL, iteratively infers the system's full state through non-linear optimization and then updates the model parameters. We test our algorithm on a simulated system of coupled Lorenz attractors, showing our algorithm's ability to identify high-dimensional systems that prove intractable for particle-based approaches. We also use SISL to identify the dynamics of an aerobatic helicopter. By augmenting the state with unobserved fluid states, we learn a model that predicts the acceleration of the helicopter better than state-of-the-art approaches.", "keywords": "System Identification;Dynamical Systems;Partial Observations;Non-linear Programming;Expectation Maximization;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Kunal Menda;Jean de Becdeli\u00e8vre;Jayesh K Gupta;Ilan Kroo;Mykel J. 
Kochenderfer;Zachary Manchester", "authorids": "kmenda@stanford.edu;jeandb@stanford.edu;jkg@cs.stanford.edu;kroo@stanford.edu;mykel@stanford.edu;zacmanchester@stanford.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nmenda2020nonlinear,\ntitle={Non-linear System Identification from Partial Observations via Iterative Smoothing and Learning},\nauthor={Kunal Menda and Jean de Becdeli{\\`e}vre and Jayesh K Gupta and Ilan Kroo and Mykel J. Kochenderfer and Zachary Manchester},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gR3ANFPS}\n}", "github": "https://drive.google.com/drive/folders/1M4aOCo5HW9MjibSNJqKnMOZAmFCKovBc?usp=sharing", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gR3ANFPS", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "522;278;263", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1271;110;374", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 354.3333333333333, 118.71628176267801 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 585.0, 496.9044173681695 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:d8pYlcXoAqQJ:scholar.google.com/&scioq=Non-linear+System+Identification+from+Partial+Observations+via+Iterative+Smoothing+and+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1gUn24tPr", "title": "Classification Attention for Chinese NER", "track": "main", "status": "Reject", "tldr": "Classification Attention for Chinese NER", "abstract": "The character-based model, such as BERT, has achieved remarkable success in Chinese named entity recognition (NER). However, such model would likely miss the overall information of the entity words. In this paper, we propose to combine priori entity information with BERT. Instead of relying on additional lexicons or pre-trained word embeddings, our model has generated entity classification embeddings directly on the pre-trained BERT, having the merit of increasing model practicability and avoiding OOV problem. 
Experiments show that our model has achieved state-of-the-art results on 3 Chinese NER datasets.", "keywords": "Chinese NER;NER;tagging;deeplearning;nlp", "primary_area": "", "supplementary_material": "", "author": "Yuchen Ge;FanYang;PeiYang", "authorids": "geyc2@lenovo.com;yangfan24@lenovo.com;yangpei4@lenovo.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nge2020classification,\ntitle={Classification Attention for Chinese {\\{}NER{\\}}},\nauthor={Yuchen Ge and FanYang and PeiYang},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gUn24tPr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gUn24tPr", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "595;90;438", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 374.3333333333333, 211.0234320848964 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6NuZNq6BaxgJ:scholar.google.com/&scioq=Classification+Attention+for+Chinese+NER&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gX8JrYPr", "title": "Connecting the Dots Between MLE and RL for Sequence Prediction", "track": "main", "status": "Reject", "tldr": "An entropy regularized policy optimization formalism subsumes a set of sequence prediction learning algorithms. A new interpolation algorithm with improved results on text generation and game imitation learning.", "abstract": "Sequence prediction models can be learned from example sequences with a variety of training algorithms. Maximum likelihood learning is simple and efficient, yet can suffer from compounding error at test time. \nReinforcement learning such as policy gradient addresses the issue but can have prohibitively poor exploration efficiency. A rich set of other algorithms, such as data noising, RAML, and softmax policy gradient, have also been developed from different perspectives. \nIn this paper, we present a formalism of entropy regularized policy optimization, and show that the apparently distinct algorithms, including MLE, can be reformulated as special instances of the formulation. The difference between them is characterized by the reward function and two weight hyperparameters.\nThe unifying interpretation enables us to systematically compare the algorithms side-by-side, and gain new insights into the trade-offs of the algorithm design.\nThe new perspective also leads to an improved approach that dynamically interpolates among the family of algorithms, and learns the model in a scheduled way. 
Experiments on machine translation, text summarization, and game imitation learning demonstrate superiority of the proposed approach.", "keywords": "Sequence generation;sequence prediction;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Bowen Tan;Zhiting Hu;Zichao Yang;Ruslan Salakhutdinov;Eric Xing", "authorids": "bwkevintan@gmail.com;zhitinghu@gmail.com;yangtze2301@gmail.com;rsalakhu@cs.cmu.edu;epxing@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntan2020connecting,\ntitle={Connecting the Dots Between {\\{}MLE{\\}} and {\\{}RL{\\}} for Sequence Prediction},\nauthor={Bowen Tan and Zhiting Hu and Zichao Yang and Ruslan Salakhutdinov and Eric Xing},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gX8JrYPr}\n}", "github": "https://drive.google.com/file/d/13diaxzuxTSB-DReqEhkYPMmZ4BQ6vsEo/view", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gX8JrYPr", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "282;180;218", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "320;165;161", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 226.66666666666666, 42.089850980438925 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 215.33333333333334, 74.02852303147904 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9258702747622648930&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1gX8kBtPr", "title": "Universal Approximation with Certified Networks", "track": "main", "status": "Poster", "tldr": "We prove that for a large class of functions f there exists an interval certified robust network approximating f up to arbitrary precision.", "abstract": "Training neural networks to be certifiably robust is critical to ensure their safety against adversarial attacks. However, it is currently very difficult to train a neural network that is both accurate and certifiably robust. In this work we take a step towards addressing this challenge. We prove that for every continuous function $f$, there exists a network $n$ such that:\n(i) $n$ approximates $f$ arbitrarily close, and (ii) simple interval bound propagation of a region $B$ through $n$ yields a result that is arbitrarily close to the optimal output of $f$ on $B$. Our result can be seen as a Universal Approximation Theorem for interval-certified ReLU networks. 
To the best of our knowledge, this is the first work to prove the existence of accurate, interval-certified networks.", "keywords": "adversarial robustness;universal approximation;certified network;interval bound propagation", "primary_area": "", "supplementary_material": "", "author": "Maximilian Baader;Matthew Mirman;Martin Vechev", "authorids": "mbaader@inf.ethz.ch;matthew.mirman@inf.ethz.ch;martin.vechev@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nBaader2020Universal,\ntitle={Universal Approximation with Certified Networks},\nauthor={Maximilian Baader and Matthew Mirman and Martin Vechev},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gX8kBtPr}\n}", "github": "https://github.com/eth-sri/UniversalCertificationTheory", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gX8kBtPr", "pdf_size": 0, "rating": "3;6;8", "confidence": "0;0;0", "wc_review": "514;162;457", "wc_reply_reviewers": "130;0;0", "wc_reply_authors": "1354;115;133", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 377.6666666666667, 154.2645635120249 ], "wc_reply_reviewers_avg": [ 43.333333333333336, 61.282587702834114 ], "wc_reply_authors_avg": [ 534.0, 579.8741242718113 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8301791316229019028&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "B1gXR3NtwS", "title": "Deep Bayesian Structure Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Bayesian neural networks (BNNs) introduce uncertainty estimation to deep networks by performing Bayesian inference on network weights. However, such models bring the challenges of inference, and further BNNs with weight uncertainty rarely achieve superior performance to standard models. In this paper, we investigate a new line of Bayesian deep learning by performing Bayesian reasoning on the structure of deep neural networks. Drawing inspiration from the neural architecture search, we define the network structure as random weights on the redundant operations between computational nodes, and apply stochastic variational inference techniques to learn the structure distributions of networks. Empirically, the proposed method substantially surpasses the advanced deep neural networks across a range of classification and segmentation tasks. More importantly, our approach also preserves benefits of Bayesian principles, producing improved uncertainty estimation than the strong baselines including MC dropout and variational BNNs algorithms (e.g. noisy EK-FAC). 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhijie Deng;Yucen Luo;Jun Zhu;Bo Zhang", "authorids": "dzj17@mails.tsinghua.edu.cn;luoyc15@mails.tsinghua.edu.cn;dcszj@tsinghua.edu.cn;dcszb@tsinghua.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndeng2020deep,\ntitle={Deep Bayesian Structure Networks},\nauthor={Zhijie Deng and Yucen Luo and Jun Zhu and Bo Zhang},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gXR3NtwS}\n}", "github": "https://github.com/anonymousest/DBSN", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gXR3NtwS", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "537;755;381", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1086;1659;1066", "reply_reviewers": "0;0;0", "reply_authors": "2;3;2", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 557.6666666666666, 153.38260078060426 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1270.3333333333333, 274.9500964821718 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "B1gXWCVtvr", "title": "Adapting Behaviour for Learning Progress", "track": "main", "status": "Reject", "tldr": "Don\u2019t tune exploration by hand: automagically adapt behaviour modulation for learning progress instead!", "abstract": "Determining what experience to generate to best facilitate learning (i.e. exploration) is one of the distinguishing features and open challenges in reinforcement learning. The advent of distributed agents that interact with parallel instances of the environment has enabled larger scale and greater flexibility, but has not removed the need to tune or tailor exploration to the task, because the ideal data for the learning algorithm necessarily depends on its process of learning. We propose to dynamically adapt the data generation by using a non-stationary multi-armed bandit to optimize a proxy of the learning progress. The data distribution is controlled via modulating multiple parameters of the policy (such as stochasticity, consistency or optimism) without significant overhead. The adaptation speed of the bandit can be increased by exploiting the factored modulation structure. 
We demonstrate on a suite of Atari 2600 games how this unified approach produces results comparable to per-task tuning at a fraction of the cost.", "keywords": "adaptation;behaviour;reinforcement learning;modulated behaviour;exploration;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Tom Schaul;Diana Borsa;David Ding;David Szepesvari;Georg Ostrovski;Will Dabney;Simon Osindero", "authorids": "schaul@google.com;borsa@google.com;fding@google.com;dsz@google.com;ostrovski@google.com;wdabney@google.com;osindero@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nschaul2020adapting,\ntitle={Adapting Behaviour for Learning Progress},\nauthor={Tom Schaul and Diana Borsa and David Ding and David Szepesvari and Georg Ostrovski and Will Dabney and Simon Osindero},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gXWCVtvr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gXWCVtvr", "pdf_size": 0, "rating": "3;3;3;6", "confidence": "0;0;0;0", "wc_review": "437;474;402;104", "wc_reply_reviewers": "0;0;0;14", "wc_reply_authors": "410;672;822;177", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;1", "rating_avg": [ 3.75, 1.299038105676658 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 354.25, 146.70783039769896 ], "wc_reply_reviewers_avg": [ 3.5, 6.06217782649107 ], "wc_reply_authors_avg": [ 520.25, 247.01050078893408 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8896508186467212309&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1gXYR4YDH", "title": "DSReg: Using Distant Supervision as a Regularizer", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we aim at tackling a general issue in NLP tasks where some of the negative examples are highly similar to the positive examples, i.e., hard-negative examples). We propose the distant supervision as a regularizer (DSReg) approach to tackle this issue. We convert the original task to a multi-task learning problem, in which we first utilize the idea of distant supervision to retrieve hard-negative examples. The obtained hard-negative examples are then used as a regularizer, and we jointly optimize the original target objective of distinguishing positive examples from negative examples along with the auxiliary task objective of distinguishing soften positive examples (comprised of positive examples and hard-negative examples) from easy-negative examples. In the neural context, this can be done by feeding the final token representations to different output layers. Using this unbelievably simple strategy, we improve the performance of a range of different NLP tasks, including text classification, sequence labeling and reading comprehension. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxian Meng;Muyu Li;Xiaoya Li;Wei Wu;Fei Wu;Jiwei Li", "authorids": "yuxian_meng@shannonai.com;muyu_li@shannonai.com;xiaoya_li@shannonai.com;wei_wu@shannonai.com;wufei@zju.edu.cn;jiwei_li@shannonai.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nmeng2020dsreg,\ntitle={{\\{}DSR{\\}}eg: Using Distant Supervision as a Regularizer},\nauthor={Yuxian Meng and Muyu Li and Xiaoya Li and Wei Wu and Fei Wu and Jiwei Li},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gXYR4YDH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gXYR4YDH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "631;268;202", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "358;530;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 367.0, 188.61071019430472 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 296.0, 220.76835522027758 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4670065548239922208&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4 }, { "id": "B1gZV1HYvS", "title": "Multi-Agent Interactions Modeling with Correlated Policies", "track": "main", "status": "Poster", "tldr": "Modeling complex multi-agent interactions under multi-agent imitation learning framework with explicit modeling of correlated policies by approximating opponents\u2019 policies. ", "abstract": "In multi-agent systems, complex interacting behaviors arise due to the high correlations among agents. However, previous work on modeling multi-agent interactions from demonstrations is primarily constrained by assuming the independence among policies and their reward structures. \nIn this paper, we cast the multi-agent interactions modeling problem into a multi-agent imitation learning framework with explicit modeling of correlated policies by approximating opponents\u2019 policies, which can recover agents' policies that can regenerate similar interactions. Consequently, we develop a Decentralized Adversarial Imitation Learning algorithm with Correlated policies (CoDAIL), which allows for decentralized training and execution. Various experiments demonstrate that CoDAIL can better regenerate complex interactions close to the demonstrators and outperforms state-of-the-art multi-agent imitation learning methods. 
Our code is available at \\url{https://github.com/apexrl/CoDAIL}.", "keywords": "Multi-agent reinforcement learning;Imitation learning", "primary_area": "", "supplementary_material": "", "author": "Minghuan Liu;Ming Zhou;Weinan Zhang;Yuzheng Zhuang;Jun Wang;Wulong Liu;Yong Yu", "authorids": "minghuanliu@sjtu.edu.cn;mingak@sjtu.edu.cn;wnzhang@sjtu.edu.cn;zhuangyuzheng@huawei.com;w.j@huawei.com;liuwulong@huawei.com;yyu@apex.sjtu.edu.cn", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nLiu2020Multi-Agent,\ntitle={Multi-Agent Interactions Modeling with Correlated Policies},\nauthor={Minghuan Liu and Ming Zhou and Weinan Zhang and Yuzheng Zhuang and Jun Wang and Wulong Liu and Yong Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gZV1HYvS}\n}", "github": "https://github.com/apexrl/CoDAIL", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gZV1HYvS", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;0;0", "wc_review": "276;473;359", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "363;1036;472", "reply_reviewers": "0;0;0", "reply_authors": "1;3;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 369.3333333333333, 80.75614871671107 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 623.6666666666666, 294.9399185521613 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1707555896923900607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1g_BT4FvS", "title": "Samples Are Useful? Not Always: denoising policy gradient updates using variance explained", "track": "main", "status": "Reject", "tldr": "SAUNA uses the fraction of variance explained (Vex) as a metric to filter the transitions used for policy gradient updates: such filtering improves the sampling prior for a better exploration of the environment and yields a better performance.", "abstract": "Policy gradient algorithms in reinforcement learning optimize the policy directly and rely on efficiently sampling an environment. However, while most sampling procedures are based solely on sampling the agent's policy, other measures directly accessible through these algorithms could be used to improve sampling before each policy update. Following this line of thoughts, we propose the use of SAUNA, a method where transitions are rejected from the gradient updates if they do not meet a particular criterion, and kept otherwise. This criterion, the fraction of variance explained Vex, is a measure of the discrepancy between a model and actual samples. In this work, Vex is used to evaluate the impact each transition will have on learning: this criterion refines sampling and improves the policy gradient algorithm. In this paper: (a) We introduce and explore Vex, the criterion used for denoising policy gradient updates. (b) We conduct experiments across a variety of benchmark environments, including standard continuous control problems. Our results show better performance with SAUNA. 
(c) We investigate why Vex provides a reliable assessment for the selection of samples that will positively impact learning. (d) We show how this criterion can work as a dynamic tool to adjust the ratio between exploration and exploitation.", "keywords": "reinforcement learning;policy gradient;sampling", "primary_area": "", "supplementary_material": "", "author": "Yannis Flet-Berliac;Philippe Preux", "authorids": "yannis.flet-berliac@inria.fr;philippe.preux@inria.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nflet-berliac2020samples,\ntitle={Samples Are Useful? Not Always: denoising policy gradient updates using variance explained},\nauthor={Yannis Flet-Berliac and Philippe Preux},\nyear={2020},\nurl={https://openreview.net/forum?id=B1g_BT4FvS}\n}", "github": "https://github.com/iclr2020-submission/denoising-gradient-updates", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1g_BT4FvS", "pdf_size": 0, "rating": "3;6;6", "confidence": "0;0;0", "wc_review": "408;394;408", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "390;822;487", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 403.3333333333333, 6.599663291074444 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 566.3333333333334, 185.06995674308914 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "B1gcblSKwB", "title": "Meta-Learning with Network Pruning for Overfitting Reduction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Meta-Learning has achieved great success in few-shot learning. However, the existing meta-learning models have been evidenced to overfit on meta-training tasks when using deeper and wider convolutional neural networks. This means that we cannot improve the meta-generalization performance by merely deepening or widening the networks. To remedy such a deficiency of meta-overfitting, we propose in this paper a sparsity constrained meta-learning approach to learn from meta-training tasks a subnetwork from which first-order optimization methods can quickly converge towards the optimal network in meta-testing tasks. Our theoretical analysis shows the benefit of sparsity for improving the generalization gap of the learned meta-initialization network. We have implemented our approach on top of the widely applied Reptile algorithm assembled with varying network pruning routines including Dense-Sparse-Dense (DSD) and Iterative Hard Thresholding (IHT). 
Extensive experimental results on benchmark datasets with different over-parameterized deep networks demonstrate that our method can not only effectively ease meta-overfitting but also in many cases improve the meta-generalization performance when applied to few-shot classification tasks.", "keywords": "Meta-Learning;Few-shot Learning;Network Pruning;Generalization Analysis", "primary_area": "", "supplementary_material": "", "author": "Hongduan Tian;Bo Liu;Xiao-Tong Yuan;Qingshan Liu", "authorids": "hongduan_tian@nuist.edu.cn;kfliubo@gmail.com;xtyuan1980@gmail.com;qsliu@nuist.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ntian2020metalearning,\ntitle={Meta-Learning with Network Pruning for Overfitting Reduction},\nauthor={Hongduan Tian and Bo Liu and Xiao-Tong Yuan and Qingshan Liu},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gcblSKwB}\n}", "github": "https://drive.google.com/open?id=1VOY1sCA1j5G1LE2AbDrPoZM-1ZwwVOHA", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=B1gcblSKwB", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "1842;194;536", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "883;178;529", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 857.3333333333334, 710.1254975159125 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 530.0, 287.8159133890967 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16843083299375496395&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1gd0nEFwS", "title": "Universal Source-Free Domain Adaptation", "track": "main", "status": "Withdraw", "tldr": "A novel unsupervised domain adaptation paradigm - performing adaptation without accessing the source data ('source-free') and without any assumption about the source-target category-gap ('universal').", "abstract": "There is a strong incentive to develop versatile learning techniques that can transfer the knowledge of class-separability from a labeled source domain to an unlabeled target domain in the presence of a domain-shift. Existing domain adaptation (DA) approaches are not equipped for practical DA scenarios as a result of their reliance on the knowledge of source-target label-set relationship (e.g. Closed-set, Open-set or Partial DA). Furthermore, almost all the prior unsupervised DA works require coexistence of source and target samples even during deployment, making them unsuitable for incremental, real-time adaptation. Devoid of such highly impractical assumptions, we propose a novel two-stage learning process. Initially, in the procurement-stage, the objective is to equip the model for future source-free deployment, assuming no prior knowledge of the upcoming category-gap and domain-shift. To achieve this, we enhance the model\u2019s ability to reject out-of-source distribution samples by leveraging the available source data, in a novel generative classifier framework. 
Subsequently, in the deployment-stage, the objective is to design a unified adaptation algorithm capable of operating across a wide range of category-gaps, with no access to the previously seen source samples. To achieve this, in contrast to the usage of complex adversarial training regimes, we define a simple yet effective source-free adaptation objective by utilizing a novel instance-level weighing mechanism, named as Source Similarity Metric (SSM). A thorough evaluation shows the practical usability of the proposed learning framework with superior DA performance even over state-of-the-art source-dependent approaches.", "keywords": "unsupervised domain adaptation;knowledge transfer;source-free adaptation", "primary_area": "", "supplementary_material": "", "author": "Jogendra Nath Kundu;Naveen Venkat;Rahul M V;R. Venkatesh Babu", "authorids": "jogendrak@iisc.ac.in;nav.naveenvenkat@gmail.com;rmvenkat@andrew.cmu.edu;venky@iisc.ac.in", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gd0nEFwS", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "400;319;174", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "918;645;64", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 297.6666666666667, 93.48915563969021 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 542.3333333333334, 356.12201404699607 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 410, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13396021133130094693&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "B1gd7REFDB", "title": "Context-Aware Object Detection With Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "A deep neural network that leverages conditional random field to enforce context semantics constrains in object detection", "abstract": "Although the state-of-the-art object detection methods are successful in detecting and classifying objects by leveraging deep convolutional neural networks (CNNs), these methods overlook the semantic context which implies the probabilities that different classes of objects occur jointly. In this work, we propose a context-aware CNN (or conCNN for short) that for the first time effectively enforces the semantics context constraints in the CNN-based object detector by leveraging the popular conditional random field (CRF) model in CNN. In particular, conCNN features a context-aware module that naturally models the mean-field inference method for CRF using a stack of common CNN operations. It can be seamlessly plugged into any existing region-based object detection paradigm. 
Our experiments using COCO datasets showcase that conCNN improves the average precision (AP) of object detection by 2 percentage points, while only introducing negligible extra training overheads.", "keywords": "Object Detection;CNN;Context;CRF", "primary_area": "", "supplementary_material": "", "author": "Yizhou Yan;Lei Cao;Samuel Madden;Elke Rundensteiner", "authorids": "yyan2@wpi.edu;lcao@csail.mit.edu;madden@csail.mit.edu;rundenst@cs.wpi.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyan2020contextaware,\ntitle={Context-Aware Object Detection With Convolutional Neural Networks},\nauthor={Yizhou Yan and Lei Cao and Samuel Madden and Elke Rundensteiner},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gd7REFDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gd7REFDB", "pdf_size": 0, "rating": "3;3;3", "confidence": "0;0;0", "wc_review": "323;333;283", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "267;189;292", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 313.0, 21.602468994692867 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 249.33333333333334, 43.86595744107522 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8EfYfQK731EJ:scholar.google.com/&scioq=Context-Aware+Object+Detection+With+Convolutional+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gdkxHFDH", "title": "Training individually fair ML models with sensitive subspace robustness", "track": "main", "status": "Spotlight", "tldr": "Algorithm for training individually fair classifier using adversarial robustness", "abstract": "We consider training machine learning models that are fair in the sense that their performance is invariant under certain sensitive perturbations to the inputs. For example, the performance of a resume screening system should be invariant under changes to the gender and/or ethnicity of the applicant. We formalize this notion of algorithmic fairness as a variant of individual fairness and develop a distributionally robust optimization approach to enforce it during training. We also demonstrate the effectiveness of the approach on two ML tasks that are susceptible to gender and racial biases. 
", "keywords": "fairness;adversarial robustness", "primary_area": "", "supplementary_material": "", "author": "Mikhail Yurochkin;Amanda Bower;Yuekai Sun", "authorids": "mikhail.yurochkin@ibm.com;amandarg@umich.edu;yuekai@umich.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nYurochkin2020Training,\ntitle={Training individually fair ML models with sensitive subspace robustness},\nauthor={Mikhail Yurochkin and Amanda Bower and Yuekai Sun},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gdkxHFDH}\n}", "github": "https://github.com/IBM/sensitive-subspace-robustness", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gdkxHFDH", "pdf_size": 0, "rating": "6;6;8", "confidence": "0;0;0", "wc_review": "699;186;211", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "524;497;238", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 365.3333333333333, 236.1586096014475 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 419.6666666666667, 128.92978278461842 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 154, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18102623998603329338&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1gi-TVKwB", "title": "Learning an off-policy predictive state representation for deep reinforcement learning for vision-based steering in autonomous driving", "track": "main", "status": "Withdraw", "tldr": "An algorithm to learn a predictive state representation with general value functions and off-policy learning is applied to the problem of vision-based steering in autonomous driving.", "abstract": "An algorithm is introduced for learning a predictive state representation with off-policy temporal difference (TD) learning that is then used to learn to steer a vehicle with reinforcement learning. There are three components being learned simultaneously: (1) the off-policy predictions as a compact representation of state, (2) the behavior policy distribution for estimating the off-policy predictions, and (3) the deterministic policy gradient for learning to act. A behavior policy discriminator is learned and used for estimating the important sampling ratios needed to learn the predictive representation off-policy with general value functions (GVFs). A linear deterministic policy gradient method is used to train the agent with only the predictive representations while the predictions are being learned. All three components are combined, demonstrated and evaluated on the problem of steering the vehicle from images in the TORCS racing simulator environment.\nSteering from only images is a challenging problem where evaluation is completed on a held-out set of tracks that were never seen during training in order to measure the generalization of the predictions and controller. 
Experiments show the proposed method is able to steer smoothly and navigate many but not all of the tracks available in TORCS with performance that exceeds DDPG using only images as input and approaches the performance of an ideal non-vision based kinematics model.", "keywords": "Predictive representations;general value functions;reinforcement learning;off-policy learning;behavior estimation", "primary_area": "", "supplementary_material": "", "author": "Daniel Graves", "authorids": "dgraves@ualberta.ca", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=B1gi-TVKwB", "pdf_size": 0, "rating": "1;3;3", "confidence": "0;0;0", "wc_review": "525;420;719", "wc_reply_reviewers": "276;0;0", "wc_reply_authors": "758;563;569", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 554.6666666666666, 123.85565074804711 ], "wc_reply_reviewers_avg": [ 92.0, 130.10764773832474 ], "wc_reply_authors_avg": [ 630.0, 90.54280755532159 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qLTquNVdmpwJ:scholar.google.com/&scioq=Learning+an+off-policy+predictive+state+representation+for+deep+reinforcement+learning+for+vision-based+steering+in+autonomous+driving&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gi0TEFDB", "title": "Understanding Top-k Sparsification in Distributed Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Distributed stochastic gradient descent (SGD) algorithms are widely deployed in training large-scale deep learning models, while the communication overhead among workers becomes the new system bottleneck. Recently proposed gradient sparsification techniques, especially Top-$k$ sparsification with error compensation (TopK-SGD), can significantly reduce the communication traffic without obvious impact on the model accuracy. Some theoretical studies have been carried out to analyze the convergence property of TopK-SGD. However, existing studies do not dive into the details of Top-$k$ operator in gradient sparsification and use relaxed bounds (e.g., exact bound of Random-$k$) for analysis; hence the derived results cannot well describe the real convergence performance of TopK-SGD. To this end, we first study the gradient distributions of TopK-SGD during training process through extensive experiments. We then theoretically derive a tighter bound for the Top-$k$ operator. 
Finally, we exploit the property of gradient distribution to propose an approximate top-$k$ selection algorithm, which is computing-efficient for GPUs, to improve the scaling efficiency of TopK-SGD by significantly reducing the computing overhead.", "keywords": "Distributed Deep Learning;SGD;Gradient Sparsification;Communication-efficient SGD;Top-k", "primary_area": "", "supplementary_material": "", "author": "Shaohuai Shi;Xiaowen Chu;Ka Chun Cheung;Simon See", "authorids": "csshshi@comp.hkbu.edu.hk;chxw@comp.hkbu.edu.hk;chcheung@nvidia.com;ssee@nvidia.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshi2020understanding,\ntitle={Understanding Top-k Sparsification in Distributed Deep Learning},\nauthor={Shaohuai Shi and Xiaowen Chu and Ka Chun Cheung and Simon See},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gi0TEFDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gi0TEFDB", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "313;718;288", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "422;1076;211", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 439.6666666666667, 197.07584552371935 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 569.6666666666666, 368.2484064987775 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11182575544128886596&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "B1gikpEtwH", "title": "Anomaly Detection and Localization in Images using Guided Attention", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Anomaly detection and localization is a popular computer vision problem which involves detecting anomalous images and localizing anomalies within them. However, this task is challenging due to small sample size and pixel coverage of the anomaly in real-world scenarios. Previous works have a drawback of using anomalous images to compute a threshold during training to detect and localize anomalies. To tackle these issues, we propose AVAGA - the first end-to-end trainable convolutional adversarial variational autoencoder (CAVAE) framework using guided attention which localizes the anomaly with the help of attention maps. AVAGA detects an image as anomalous from the large pixel-wise difference between the input and reconstructed image. In an unsupervised setting, we propose a guided attention loss, where we encourage AVAGA to focus on all non-anomalous regions in the image without using any anomalous images during training. Furthermore, we also propose a selective gradient backpropagation technique for guided attention, which enhances the performance of anomaly localization while using only 2% anomalous images in a weakly supervised setting. 
AVAGA outperforms the state-of-the-art (SoTA) methods by 10% and 18% on localization and 8% and 15% on classification accuracy in unsupervised and weakly supervised settings respectively on Mvtec Anomaly Detection (MvAD) dataset and by 11% and 22% on localization and 10% and 19% on classification accuracy in unsupervised and weakly supervised settings respectively on the modified ShanghaiTech Campus (STC) dataset", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shashanka Venkataramanan;Rajat Vikram Singh;Kuan-Chuan Peng", "authorids": "shashankv@knights.ucf.edu;singh.rajat@siemens.com;kp388@cornell.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gikpEtwH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "234;134;498", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "449;442;697", "reply_reviewers": "0;0;0", "reply_authors": "1;1;3", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 288.6666666666667, 153.5476762731657 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 529.3333333333334, 118.59267356047853 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-iSuQ7wS1zYJ:scholar.google.com/&scioq=Anomaly+Detection+and+Localization+in+Images+using+Guided+Attention&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gjs6EtDr", "title": "Efficient Content-Based Sparse Attention with Routing Transformers", "track": "main", "status": "Reject", "tldr": "We propose a content-based sparse attention model and show improvements on language modeling and image generation.", "abstract": "Self-attention has recently been adopted for a wide range of sequence modeling\nproblems. Despite its effectiveness, self-attention suffers quadratic compute and\nmemory requirements with respect to sequence length. Successful approaches to\nreduce this complexity focused on attention to local sliding windows or a small\nset of locations independent of content. Our work proposes to learn dynamic\nsparse attention patterns that avoid allocating computation and memory to attend\nto content unrelated to the query of interest. This work builds upon two lines of\nresearch: it combines the modeling flexibility of prior work on content-based sparse\nattention with the efficiency gains from approaches based on local, temporal sparse\nattention. Our model, the Routing Transformer, endows self-attention with a sparse\nrouting module based on online k-means while reducing the overall complexity of\nattention to O(n^{1.5}d) from O(n^2d) for sequence length n and hidden dimension\nd. 
We show that our model outperforms comparable sparse attention models on\nlanguage modeling on Wikitext-103 (15.8 vs 18.3 perplexity) as well as on\nimage generation on ImageNet-64 (3.43 vs 3.44 bits/dim) while using fewer self-attention layers.\nCode will be open-sourced on acceptance.", "keywords": "Sparse attention;autoregressive;generative models", "primary_area": "", "supplementary_material": "", "author": "Aurko Roy*;Mohammad Taghi Saffar*;David Grangier;Ashish Vaswani", "authorids": "aurkor@google.com;msaffar@google.com;grangier@google.com;avaswani@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nroy*2020efficient,\ntitle={Efficient Content-Based Sparse Attention with Routing Transformers},\nauthor={Aurko Roy* and Mohammad Taghi Saffar* and David Grangier and Ashish Vaswani},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gjs6EtDr}\n}", "github": "http://open-sourced-on-acceptance.com", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gjs6EtDr", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "1268;342;126", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "887;0;0", "reply_reviewers": "0;0;0", "reply_authors": "2;0;0", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 578.6666666666666, 495.344548998192 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 295.6666666666667, 418.13580994164505 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.9428090415820634 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 671, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11653633172486276299&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "B1gkpR4FDB", "title": "Statistical Adaptive Stochastic Optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "We investigate statistical methods for automatically scheduling the learning rate (step size) in stochastic optimization. First, we consider a broad family of stochastic optimization methods with constant hyperparameters (including the learning rate and various forms of momentum) and derive a general necessary condition for the resulting dynamics to be stationary. Based on this condition, we develop a simple online statistical test to detect (non-)stationarity and use it to automatically drop the learning rate by a constant factor whenever stationarity is detected. Unlike in prior work, our stationarity condition and our statistical test applies to different algorithms without modification. Finally, we propose a smoothed stochastic line-search method that can be used to warm up the optimization process before the statistical test can be applied effectively. This removes the expensive trial and error for setting a good initial learning rate. 
The combined method is highly autonomous and it attains state-of-the-art training and testing performance in our experiments on several deep learning tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Pengchuan Zhang;Hunter Lang;Qiang Liu;Lin Xiao", "authorids": "penzhan@microsoft.com;hjl@mit.edu;lqiang@cs.utexas.edu;lin.xiao@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2020statistical,\ntitle={Statistical Adaptive Stochastic Optimization},\nauthor={Pengchuan Zhang and Hunter Lang and Qiang Liu and Lin Xiao},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gkpR4FDB}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gkpR4FDB", "pdf_size": 0, "rating": "3;6;8", "confidence": "0;0;0", "wc_review": "315;447;586", "wc_reply_reviewers": "447;144;0", "wc_reply_authors": "559;751;573", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 449.3333333333333, 110.64758871701132 ], "wc_reply_reviewers_avg": [ 197.0, 186.2954642496698 ], "wc_reply_authors_avg": [ 627.6666666666666, 87.39692341394074 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vVX_x07aUHIJ:scholar.google.com/&scioq=Statistical+Adaptive+Stochastic+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gm-a4tDH", "title": "Modeling treatment events in disease progression", "track": "main", "status": "Reject", "tldr": "A novel matrix completion based algorithm to model disease progression with events", "abstract": "Ability to quantify and predict progression of a disease is fundamental for selecting an appropriate treatment. Many clinical metrics cannot be acquired frequently either because of their cost (e.g. MRI, gait analysis) or because they are inconvenient or harmful to a patient (e.g. biopsy, x-ray). In such scenarios, in order to estimate individual trajectories of disease progression, it is advantageous to leverage similarities between patients, i.e. the covariance of trajectories, and find a latent representation of progression. Most existing methods for estimating trajectories do not account for events in-between observations, which dramatically decreases their adequacy for clinical practice. In this study, we develop a machine learning framework named Coordinatewise-Soft-Impute (CSI) for analyzing disease progression from sparse observations in the presence of confounding events. CSI is guaranteed to converge to the global minimum of the corresponding optimization problem. 
Experimental results also demonstrate the effectiveness of CSI using both simulated and real datasets.", "keywords": "disease progression;treatment events;matrix completion", "primary_area": "", "supplementary_material": "", "author": "Guanyang Wang;Yumeng Zhang;Yong Deng;Xuxin Huang;Lukasz Kidzinski", "authorids": "guanyang@stanford.edu;zym3008@gmail.com;yongdeng@stanford.edu;xxhuang@stanford.edu;lukasz.kidzinski@stanford.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nwang2020modeling,\ntitle={Modeling treatment events in disease progression},\nauthor={Guanyang Wang and Yumeng Zhang and Yong Deng and Xuxin Huang and Lukasz Kidzinski},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gm-a4tDH}\n}", "github": "https://www.dropbox.com/sh/y7h9utzsord2k79/AABpL0qWjOse-6dgj3-k0vina?dl=0", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gm-a4tDH", "pdf_size": 0, "rating": "1;1;1", "confidence": "0;0;0", "wc_review": "134;328;328", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 263.3333333333333, 91.45247703346016 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2N8smJhJtioJ:scholar.google.com/&scioq=Modeling+treatment+events+in+disease+progression&hl=en&as_sdt=0,33", "gs_version_total": 5 }, { "id": "B1gn-pEKwH", "title": "INFERENCE, PREDICTION, AND ENTROPY RATE OF CONTINUOUS-TIME, DISCRETE-EVENT PROCESSES", "track": "main", "status": "Reject", "tldr": "A new method for inferring a model of, estimating the entropy rate of, and predicting continuous-time, discrete-event processes.", "abstract": "The inference of models, prediction of future symbols, and entropy rate estimation of discrete-time, discrete-event processes is well-worn ground. However, many time series are better conceptualized as continuous-time, discrete-event processes. Here, we provide new methods for inferring models, predicting future symbols, and estimating the entropy rate of continuous-time, discrete-event processes. The methods rely on an extension of Bayesian structural inference that takes advantage of neural network\u2019s universal approximation power. Based on experiments with simple synthetic data, these new methods seem to be competitive with state-of-the-art methods for prediction and entropy rate estimation as long as the correct model is inferred.", "keywords": "continuous-time prediction", "primary_area": "", "supplementary_material": "", "author": "Sarah Marzen;James P. Crutchfield", "authorids": "smarzen@cmc.edu;chaos@cse.ucdavis.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmarzen2020inference,\ntitle={{\\{}INFERENCE{\\}}, {\\{}PREDICTION{\\}}, {\\{}AND{\\}} {\\{}ENTROPY{\\}} {\\{}RATE{\\}} {\\{}OF{\\}} {\\{}CONTINUOUS{\\}}-{\\{}TIME{\\}}, {\\{}DISCRETE{\\}}-{\\{}EVENT{\\}} {\\{}PROCESSES{\\}}},\nauthor={Sarah Marzen and James P. 
Crutchfield},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gn-pEKwH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gn-pEKwH", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "609;646;401", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 552.0, 107.8362956831635 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OhjoJ0L0qpsJ:scholar.google.com/&scioq=INFERENCE,+PREDICTION,+AND+ENTROPY+RATE+OF+CONTINUOUS-TIME,+DISCRETE-EVENT+PROCESSES&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gqipNYwH", "title": "Option Discovery using Deep Skill Chaining", "track": "main", "status": "Poster", "tldr": "We present a new hierarchical reinforcement learning algorithm which can solve high-dimensional goal-oriented tasks more reliably than non-hierarchical agents and other state-of-the-art skill discovery techniques.", "abstract": "Autonomously discovering temporally extended actions, or skills, is a longstanding goal of hierarchical reinforcement learning. We propose a new algorithm that combines skill chaining with deep neural networks to autonomously discover skills in high-dimensional, continuous domains. The resulting algorithm, deep skill chaining, constructs skills with the property that executing one enables the agent to execute another. 
We demonstrate that deep skill chaining significantly outperforms both non-hierarchical agents and other state-of-the-art skill discovery techniques in challenging continuous control tasks.", "keywords": "Hierarchical Reinforcement Learning;Reinforcement Learning;Skill Discovery;Deep Learning;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Akhil Bagaria;George Konidaris", "authorids": "akhil_bagaria@brown.edu;gdk@cs.brown.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nBagaria2020Option,\ntitle={Option Discovery using Deep Skill Chaining},\nauthor={Akhil Bagaria and George Konidaris},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gqipNYwH}\n}", "github": "https://github.com/deep-skill-chaining/deep-skill-chaining", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gqipNYwH", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "487;528;743", "wc_reply_reviewers": "103;0;148", "wc_reply_authors": "980;666;1465", "reply_reviewers": "1;0;1", "reply_authors": "2;1;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 586.0, 112.27050666433578 ], "wc_reply_reviewers_avg": [ 83.66666666666667, 61.94800687314771 ], "wc_reply_authors_avg": [ 1037.0, 328.6710614986763 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 150, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13264170387120464821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "B1grSREtDH", "title": "Bayesian Residual Policy Optimization: Scalable Bayesian Reinforcement Learning with Clairvoyant Experts", "track": "main", "status": "Reject", "tldr": "We propose a scalable Bayesian Reinforcement Learning algorithm that learns a Bayesian correction over an ensemble of clairvoyant experts to solve problems with complex latent rewards and dynamics.", "abstract": "Informed and robust decision making in the face of uncertainty is critical for robots that perform physical tasks alongside people. We formulate this as a Bayesian Reinforcement Learning problem over latent Markov Decision Processes (MDPs). While Bayes-optimality is theoretically the gold standard, existing algorithms do not scale well to continuous state and action spaces. We propose a scalable solution that builds on the following insight: in the absence of uncertainty, each latent MDP is easier to solve. We split the challenge into two simpler components. First, we obtain an ensemble of clairvoyant experts and fuse their advice to compute a baseline policy. Second, we train a Bayesian residual policy to improve upon the ensemble's recommendation and learn to reduce uncertainty. Our algorithm, Bayesian Residual Policy Optimization (BRPO), imports the scalability of policy gradient methods as well as the initialization from prior models. 
BRPO significantly improves the ensemble of experts and drastically outperforms existing adaptive RL methods.", "keywords": "Bayesian Residual Reinforcement Learning;Residual Reinforcement Learning;Bayes Policy Optimization", "primary_area": "", "supplementary_material": "", "author": "Gilwoo Lee;Brian Hou;Sanjiban Choudhury;Siddhartha S. Srinivasa", "authorids": "gilwoo@cs.uw.edu;bhou@cs.uw.edu;sanjibac@cs.uw.edu;siddh@cs.uw.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlee2020bayesian,\ntitle={Bayesian Residual Policy Optimization: Scalable Bayesian Reinforcement Learning with Clairvoyant Experts},\nauthor={Gilwoo Lee and Brian Hou and Sanjiban Choudhury and Siddhartha S. Srinivasa},\nyear={2020},\nurl={https://openreview.net/forum?id=B1grSREtDH}\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1grSREtDH", "pdf_size": 0, "rating": "3;3;6", "confidence": "0;0;0", "wc_review": "479;102;260", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "710;128;408", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 280.3333333333333, 154.57971693883027 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 415.3333333333333, 237.65708255570436 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2389416385153745085&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1grayHYDH", "title": "Incorporating Perceptual Prior to Improve Model's Adversarial Robustness", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep Neural Networks trained using human-annotated data are able to achieve human-like accuracy on many computer vision tasks such as classification, object recognition and segmentation. However, they are still far from being as robust as the human visual system. In this paper, we demonstrate that even models that are trained to be robust to random perturbations do not necessarily learn robust representations. We propose to address this by imposing a perception based prior on the learned representations to ensure that perceptually similar images have similar representations. We demonstrate that, although this training method does not use adversarial samples during training, it significantly improves the network\u2019s robustness to single-step and multi-step adversarial attacks, thus validating our hypothesis that the network indeed learns more robust representations. Our proposed method provides a means of achieving adversarial robustness at no additional computational cost when compared to normal training. ", "keywords": "Representation learining;adversarial defense;robust neural networks", "primary_area": "", "supplementary_material": "", "author": "B.S. Vivek;Arya Baburaj;Ashutosh B Sathe;R. 
Venkatesh Babu", "authorids": "svivek@iisc.ac.in;aryababuraj@iisc.ac.in;satheab16.mech@coep.ac.in;venky@iisc.ac.in", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1grayHYDH", "pdf_size": 0, "rating": "1;1;1", "confidence": "0;0;0", "wc_review": "546;190;331", "wc_reply_reviewers": "0;103;0", "wc_reply_authors": "221;179;118", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 1.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 355.6666666666667, 146.37926386236848 ], "wc_reply_reviewers_avg": [ 34.333333333333336, 48.554665641476255 ], "wc_reply_authors_avg": [ 172.66666666666666, 42.28737662970148 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oopbbnFszZUJ:scholar.google.com/&scioq=Incorporating+Perceptual+Prior+to+Improve+Model%27s+Adversarial+Robustness&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gskyStwr", "title": "Frequency-based Search-control in Dyna", "track": "main", "status": "Poster", "tldr": "Acquire states from high frequency region for search-control in Dyna.", "abstract": "Model-based reinforcement learning has been empirically demonstrated as a successful strategy to improve sample efficiency. In particular, Dyna is an elegant model-based architecture integrating learning and planning that provides huge flexibility of using a model. One of the most important components in Dyna is called search-control, which refers to the process of generating state or state-action pairs from which we query the model to acquire simulated experiences. Search-control is critical in improving learning efficiency. In this work, we propose a simple and novel search-control strategy by searching high frequency regions of the value function. Our main intuition is built on Shannon sampling theorem from signal processing, which indicates that a high frequency signal requires more samples to reconstruct. We empirically show that a high frequency function is more difficult to approximate. This suggests a search-control strategy: we should use states from high frequency regions of the value function to query the model to acquire more samples. We develop a simple strategy to locally measure the frequency of a function by gradient and hessian norms, and provide theoretical justification for this approach. 
We then apply our strategy to search-control in Dyna, and conduct experiments to show its property and effectiveness on benchmark domains.", "keywords": "Model-based reinforcement learning;search-control;Dyna;frequency of a signal", "primary_area": "", "supplementary_material": "", "author": "Yangchen Pan;Jincheng Mei;Amir-massoud Farahmand", "authorids": "pan6@ualberta.ca;jmei2@ualberta.ca;farahmand@vectorinstitute.ai", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nPan2020Frequency-based,\ntitle={Frequency-based Search-control in Dyna},\nauthor={Yangchen Pan and Jincheng Mei and Amir-massoud Farahmand},\nbooktitle={International Conference on Learning Representations},\nyear={2020},\nurl={https://openreview.net/forum?id=B1gskyStwr}\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer5", "site": "https://openreview.net/forum?id=B1gskyStwr", "pdf_size": 0, "rating": "6;6;6", "confidence": "0;0;0", "wc_review": "285;426;510", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "516;421;816", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 407.0, 92.83318372220141 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 584.3333333333334, 168.34158395621947 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2849858529546206580&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "B1gtK0NKwr", "title": "Correctness Verification of Neural Network", "track": "main", "status": "Withdraw", "tldr": "We present the first verification that a neural network for perception tasks produces a correct output within a specified tolerance for every input of interest. ", "abstract": "We present the first verification that a neural network for perception tasks produces\na correct output within a specified tolerance for every input of interest. We define\ncorrectness relative to a specification which identifies 1) a state space consisting of\nall relevant states of the world and 2) an observation process that produces neural\nnetwork inputs from the states of the world. Tiling the state and input spaces with\na finite number of tiles, obtaining ground truth bounds from the state tiles and\nnetwork output bounds from the input tiles, then comparing the ground truth and\nnetwork output bounds delivers an upper bound on the network output error for\nany input of interest. 
Results from two case studies highlight the ability of our\ntechnique to deliver tight error bounds for all inputs of interest and show how the\nerror bounds vary over the state and input spaces.", "keywords": "Neural network verification;safety;reliability", "primary_area": "", "supplementary_material": "", "author": "Yichen Yang;Martin Rinard", "authorids": "yicheny@csail.mit.edu;rinard@csail.mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "https://anonymous.4open.science/r/5f526d25-cdbf-46db-b737-b235676481b7/", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gtK0NKwr", "pdf_size": 0, "rating": "1;1;3", "confidence": "0;0;0", "wc_review": "284;258;670", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "93;0;143", "reply_reviewers": "0;0;0", "reply_authors": "1;0;1", "rating_avg": [ 1.6666666666666667, 0.9428090415820634 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 404.0, 188.3896670910235 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 78.66666666666667, 59.252754272598075 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15343493030980332407&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "B1guLAVFDB", "title": "Span Recovery for Deep Neural Networks with Applications to Input Obfuscation", "track": "main", "status": "Poster", "tldr": "We provably recover the span of a deep multi-layered neural network with latent structure and empirically apply efficient span recovery algorithms to attack networks by obfuscating inputs.", "abstract": "The tremendous success of deep neural networks has motivated the need to better understand the fundamental properties of these networks, but many of the theoretical results proposed have only been for shallow networks. In this paper, we study an important primitive for understanding the meaningful input space of a deep network: span recovery. For $k