[ { "id": "B14ejsA5YQ", "title": "Neural Causal Discovery with Learnable Input Noise", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning causal relations from observational time series with nonlinear interactions and complex causal structures is a key component of human intelligence, and has a wide range of applications. Although neural nets have demonstrated their effectiveness in a variety of fields, their application in learning causal relations has been scarce. This is due to both a lack of theoretical results connecting risk minimization and causality (enabling function approximators like neural nets to apply), and a lack of scalability in prior causal measures to allow for expressive function approximators like neural nets to apply. In this work, we propose a novel causal measure and algorithm using risk minimization to infer causal relations from time series. We demonstrate the effectiveness and scalability of our algorithms to learn nonlinear causal models in synthetic datasets as comparing to other methods, and its effectiveness in inferring causal relations in a video game environment and real-world heart-rate vs. breath-rate and rat brain EEG datasets.", "keywords": "neural causal learning;learnable noise", "primary_area": "", "supplementary_material": "", "author": "Tailin Wu;Thomas Breuel;Jan Kautz", "authorids": "tailin@mit.edu;tbreuel@nvidia.com;jkautz@nvidia.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwu2019neural,\ntitle={Neural Causal Discovery with Learnable Input Noise},\nauthor={Tailin Wu and Thomas Breuel and Jan Kautz},\nyear={2019},\nurl={https://openreview.net/forum?id=B14ejsA5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B14ejsA5YQ", "pdf_size": 0, "rating": "4;4;8", "confidence": "4;5;4", "wc_review": "496;222;328", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "438;274;149", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 348.6666666666667, 112.81055900146147 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 287.0, 118.34131428485432 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:L9MZ2ybIiJUJ:scholar.google.com/&scioq=Neural+Causal+Discovery+with+Learnable+Input+Noise&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B14rPj0qY7", "title": "RETHINKING SELF-DRIVING : MULTI -TASK KNOWLEDGE FOR BETTER GENERALIZATION AND ACCIDENT EXPLANATION ABILITY", "track": "main", "status": "Reject", "tldr": "we proposed a new self-driving model which is composed of perception module for see and think and driving module for behave to acquire better generalization and accident explanation ability.", "abstract": "Current end-to-end deep learning driving models have two problems: (1) Poor\ngeneralization ability of unobserved driving environment when diversity of train-\ning driving dataset is limited (2) Lack of accident explanation ability when driving\nmodels don\u2019t work as expected. 
To tackle these two problems, rooted on the be-\nlieve that knowledge of associated easy task is benificial for addressing difficult\ntask, we proposed a new driving model which is composed of perception module\nfor see and think and driving module for behave, and trained it with multi-task\nperception-related basic knowledge and driving knowledge stepwisely. Specifi-\ncally segmentation map and depth map (pixel level understanding of images) were\nconsidered as what & where and how far knowledge for tackling easier driving-\nrelated perception problems before generating final control commands for difficult\ndriving task. The results of experiments demonstrated the effectiveness of multi-\ntask perception knowledge for better generalization and accident explanation abil-\nity. With our method the average sucess rate of finishing most difficult navigation\ntasks in untrained city of CoRL test surpassed current benchmark method for 15\npercent in trained weather and 20 percent in untrained weathers.", "keywords": "Autonomous car;convolution network;image segmentation;depth estimation;generalization ability;explanation ability;multi-task learning", "primary_area": "", "supplementary_material": "", "author": "Zhihao LI;Toshiyuki MOTOYOSHI;Kazuma SASAKI;Tetsuya OGATA;Shigeki SUGANO", "authorids": "mr.zhihao.li@gmail.com;motoyoshi@idr.ias.sci.waseda.ac.jp;ssk.sasaki@suou.waseda.jp;ogata@waseda.jp;sugano@waseda.jp", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019rethinking,\ntitle={{RETHINKING} {SELF}-{DRIVING} : {MULTI} -{TASK} {KNOWLEDGE} {FOR} {BETTER} {GENERALIZATION} {AND} {ACCIDENT} {EXPLANATION} {ABILITY}},\nauthor={Zhihao LI and Toshiyuki MOTOYOSHI and Kazuma SASAKI and Tetsuya OGATA and Shigeki SUGANO},\nyear={2019},\nurl={https://openreview.net/forum?id=B14rPj0qY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B14rPj0qY7", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;4", "wc_review": "1048;380;270", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 566.0, 343.771241767933 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15840970540454571465&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1EiIsCctm", "title": "Improving Gaussian mixture latent variable model convergence with Optimal Transport", "track": "main", "status": "Withdraw", "tldr": "This paper shows that the Wasserstein distance objective enables the training of latent variable models with discrete latents in a case where the Variational Autoencoder objective fails to do so.", "abstract": "Generative models with both discrete and continuous latent variables are highly motivated by the structure of many real-world data sets. They present, however, subtleties in training often manifesting in the discrete latent variable not being leveraged. 
In this paper, we show why such models struggle to train using traditional log-likelihood maximization, and that they are amenable to training using the Optimal Transport framework of Wasserstein Autoencoders. We find our discrete latent variable to be fully leveraged by the model when trained, without any modifications to the objective function or significant fine tuning. Our model generates comparable samples to other approaches while using relatively simple neural networks, since the discrete latent variable carries much of the descriptive burden. Furthermore, the discrete latent provides significant control over generation.", "keywords": "optimal transport;wasserstein autoencoder;variational autoencoder;latent variable modeling;generative modeling;discrete latent variables", "primary_area": "", "supplementary_material": "", "author": "Benoit Gaujac;Ilya Feige;David Barber", "authorids": "benoit.gaujac.16@ucl.ac.uk;ilya@asidatascience.com;david.barber@ucl.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1EiIsCctm", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;3", "wc_review": "197;204;302", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "401;421;823", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 234.33333333333334, 47.932823363072984 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 548.3333333333334, 194.3902146599863 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13459543000721681687&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1EjKsRqtQ", "title": "Hierarchical Attention: What Really Counts in Various NLP Tasks", "track": "main", "status": "Reject", "tldr": "The paper proposed a novel hierarchical model to replace the original attention model in various NLP tasks.", "abstract": "Attention mechanisms in sequence to sequence models have shown great ability and wonderful performance in various natural language processing (NLP) tasks, such as sentence embedding, text generation, machine translation, machine reading comprehension, etc. Unfortunately, existing attention mechanisms only learn either high-level or low-level features. In this paper, we think that the lack of hierarchical mechanisms is a bottleneck in improving the performance of the attention mechanisms, and propose a novel Hierarchical Attention Mechanism (Ham) based on the weighted sum of different layers of a multi-level attention. \nHam achieves a state-of-the-art BLEU score of 0.26 on Chinese poem generation task and a nearly 6.5% averaged improvement compared with the existing machine reading comprehension models such as BIDAF and Match-LSTM. Furthermore, our experiments and theorems reveal that Ham has greater generalization and representation ability than existing attention mechanisms. 
", "keywords": "attention;hierarchical;machine reading comprehension;poem generation", "primary_area": "", "supplementary_material": "", "author": "Zehao Dou;Zhihua Zhang", "authorids": "zehaodou@pku.edu.cn;zhzhang@math.pku.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndou2019hierarchical,\ntitle={Hierarchical Attention: What Really Counts in Various {NLP} Tasks},\nauthor={Zehao Dou and Zhihua Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=B1EjKsRqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1EjKsRqtQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;3;4", "wc_review": "357;234;220", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 270.3333333333333, 61.548535500223096 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6835025580639923184&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Convolutional Neural Networks on Non-uniform Geometrical Signals Using Euclidean Spectral Transformation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/709", "id": "B1G5ViAqFm", "author_site": "Chiyu Jiang, Dequan Wang, Jingwei Huang, Philip Marcus, Matthias Niessner", "tldr": "We use non-Euclidean Fourier Transformation of shapes defined by a simplicial complex for deep learning, achieving significantly better results than point-based sampling techiques used in current 3D learning literature.", "abstract": "Convolutional Neural Networks (CNN) have been successful in processing data signals that are uniformly sampled in the spatial domain (e.g., images). However, most data signals do not natively exist on a grid, and in the process of being sampled onto a uniform physical grid suffer significant aliasing error and information loss. Moreover, signals can exist in different topological structures as, for example, points, lines, surfaces and volumes. It has been challenging to analyze signals with mixed topologies (for example, point cloud with surface mesh). To this end, we develop mathematical formulations for Non-Uniform Fourier Transforms (NUFT) to directly, and optimally, sample nonuniform data signals of different topologies defined on a simplex mesh into the spectral domain with no spatial sampling error. The spectral transform is performed in the Euclidean space, which removes the translation ambiguity from works on the graph spectrum. 
Our representation has four distinct advantages: (1) the process causes no spatial sampling error during initial sampling, (2) the generality of this approach provides a unified framework for using CNNs to analyze signals of mixed topologies, (3) it allows us to leverage state-of-the-art backbone CNN architectures for effective learning without having to design a particular architecture for a particular data structure in an ad-hoc fashion, and (4) the representation allows weighted meshes where each element has a different weight (i.e., texture) indicating local properties. We achieve good results on-par with state-of-the-art for 3D shape retrieval task, and new state-of-the-art for point cloud to surface reconstruction task.", "keywords": "Non-uniform Fourier Transform;3D Learning;CNN;surface reconstruction", "primary_area": "", "supplementary_material": "", "author": "Chiyu Max Jiang;Dequan Wang;Jingwei Huang;Philip Marcus;Matthias Niessner", "authorids": "chiyu.jiang@berkeley.edu;dqw@berkeley.edu;jingweih@stanford.edu;pmarcus@me.berkeley.edu;niessner@tum.de", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\njiang2018convolutional,\ntitle={Convolutional Neural Networks on Non-uniform Geometrical Signals Using Euclidean Spectral Transformation},\nauthor={Chiyu Max Jiang and Dequan Wang and Jingwei Huang and Philip Marcus and Matthias Niessner},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1G5ViAqFm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=B1G5ViAqFm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;3;4", "wc_review": "202;772;321", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "168;707;214", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 431.6666666666667, 245.5067322000673 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 363.0, 243.96857721162917 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15609737605726839540&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=B1G5ViAqFm", "pdf": "https://openreview.net/pdf?id=B1G5ViAqFm", "email": ";;;;", "author_num": 5 }, { "title": "Augmented Cyclic Adversarial Learning for Low Resource Domain Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/780", "id": "B1G9doA9F7", "author_site": "Ehsan Hosseini-Asl, Yingbo Zhou, Caiming Xiong, richard socher", "tldr": "A new cyclic adversarial learning augmented with auxiliary task model which improves domain adaptation performance in low resource supervised and unsupervised situations ", "abstract": "Training a model to perform a task typically requires a large amount of data from the domains in which the task will be applied.\nHowever, it is often the case that data are abundant in some domains but scarce in others. 
Domain adaptation deals with the challenge of adapting a model trained from a data-rich source domain to perform well in a data-poor target domain. In general, this requires learning plausible mappings between domains. CycleGAN is a powerful framework that efficiently learns to map inputs from one domain to another using adversarial training and a cycle-consistency constraint. However, the conventional approach of enforcing cycle-consistency via reconstruction may be overly restrictive in cases where one or more domains have limited training data. In this paper, we propose an augmented cyclic adversarial learning model that enforces the cycle-consistency constraint via an external task specific model, which encourages the preservation of task-relevant content as opposed to exact reconstruction. We explore digit classification in a low-resource setting in supervised, semi-supervised and unsupervised situations, as well as in the high-resource unsupervised setting. In the low-resource supervised setting, the results show that our approach improves absolute performance by 14% and 4% when adapting SVHN to MNIST and vice versa, respectively, which outperforms unsupervised domain adaptation methods that require a high-resource unlabeled target domain. Moreover, using only a small amount of unsupervised target data, our approach can still outperform many high-resource unsupervised models. Our model also outperforms existing methods on USPS to MNIST and synthetic digit to SVHN for high-resource unsupervised adaptation. In speech domains, we similarly adopt a speech recognition model from each domain as the task specific model. Our approach improves the absolute performance of speech recognition by 2% for female speakers in the TIMIT dataset, where the majority of training samples are from male voices.", "keywords": "Domain adaptation;generative adversarial network;cyclic adversarial learning;speech", "primary_area": "", "supplementary_material": "", "author": "Ehsan Hosseini-Asl;Yingbo Zhou;Caiming Xiong;Richard Socher", "authorids": "ehosseiniasl@salesforce.com;yingbo.zhou@salesforce.com;cxiong@salesforce.com;rsocher@salesforce.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nhosseini-asl2018augmented,\ntitle={Augmented Cyclic Adversarial Learning for Low Resource Domain Adaptation},\nauthor={Ehsan Hosseini-Asl and Yingbo Zhou and Caiming Xiong and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1G9doA9F7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=B1G9doA9F7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;4;2", "wc_review": "528;658;284", "wc_reply_reviewers": "193;0;0", "wc_reply_authors": "961;273;223", "reply_reviewers": "2;0;0", "reply_authors": "5;2;1", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 490.0, 155.03117965966288 ], "wc_reply_reviewers_avg": [ 64.33333333333333, 90.98107251266912 ], "wc_reply_authors_avg": [ 485.6666666666667, 336.730686982276 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.6666666666666665, 1.699673171197595 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], 
"corr_rating_confidence": -0.944911182523068, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11441136156642140785&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=B1G9doA9F7", "pdf": "https://openreview.net/pdf?id=B1G9doA9F7", "email": ";;;", "author_num": 4 }, { "title": "Variance Networks: When Expectation Does Not Meet Your Expectations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/884", "id": "B1GAUs0cKQ", "author_site": "Kirill Neklyudov, Dmitry Molchanov, Arsenii Ashukha, Dmitry P. Vetrov", "tldr": "It is possible to learn a zero-centered Gaussian distribution over the weights of a neural network by learning only variances, and it works surprisingly well.", "abstract": "Ordinary stochastic neural networks mostly rely on the expected values of their weights to make predictions, whereas the induced noise is mostly used to capture the uncertainty, prevent overfitting and slightly boost the performance through test-time averaging. In this paper, we introduce variance layers, a different kind of stochastic layers. Each weight of a variance layer follows a zero-mean distribution and is only parameterized by its variance. It means that each object is represented by a zero-mean distribution in the space of the activations. We show that such layers can learn surprisingly well, can serve as an efficient exploration tool in reinforcement learning tasks and provide a decent defense against adversarial attacks. We also show that a number of conventional Bayesian neural networks naturally converge to such zero-mean posteriors. We observe that in these cases such zero-mean parameterization leads to a much better training objective than more flexible conventional parameterizations where the mean is being learned.", "keywords": "deep learning;variational inference;variational dropout", "primary_area": "", "supplementary_material": "", "author": "Kirill Neklyudov;Dmitry Molchanov;Arsenii Ashukha;Dmitry Vetrov", "authorids": "k.necludov@gmail.com;dmolch111@gmail.com;ars.ashuha@gmail.com;vetrovd@yandex.ru", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nneklyudov2018variance,\ntitle={Variance Networks: When Expectation Does Not Meet Your Expectations},\nauthor={Kirill Neklyudov and Dmitry Molchanov and Arsenii Ashukha and Dmitry Vetrov},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1GAUs0cKQ},\n}", "github": "[![github](/images/github_icon.svg) da-molchanov/variance-networks](https://github.com/da-molchanov/variance-networks) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1GAUs0cKQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;3", "wc_review": "233;364;249", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "262;626;459", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 282.0, 58.349521563305615 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 449.0, 148.77051679236268 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], 
"authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3938870273847182783&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=B1GAUs0cKQ", "pdf": "https://openreview.net/pdf?id=B1GAUs0cKQ", "email": ";;;", "author_num": 4 }, { "id": "B1GHJ3R9tQ", "title": "HyperGAN: Exploring the Manifold of Neural Networks", "track": "main", "status": "Reject", "tldr": "We use a GAN to generate parameters of a neural network in one forward pass.", "abstract": "We introduce HyperGAN, a generative network that learns to generate all the weight parameters of deep neural networks. HyperGAN first transforms low dimensional noise into a latent space, which can be sampled from to obtain diverse, performant sets of parameters for a target architecture. We utilize an architecture that bears resemblance to generative adversarial networks, but we evaluate the likelihood of samples with a classification loss. This is equivalent to minimizing the KL-divergence between the generated network parameter distribution and an unknown true parameter distribution. We apply HyperGAN to classification, showing that HyperGAN can learn to generate parameters which solve the MNIST and CIFAR-10 datasets with competitive performance to fully supervised learning while learning a rich distribution of effective parameters. We also show that HyperGAN can also provide better uncertainty than standard ensembles. This is evaluated by the ability of HyperGAN-generated ensembles to detect out of distribution data as well as adversarial examples. We see that in addition to being highly accurate on inlier data, HyperGAN can provide reasonable uncertainty estimates. ", "keywords": "hypernetworks;generative adversarial networks;anomaly detection", "primary_area": "", "supplementary_material": "", "author": "Neale Ratzlaff;Li Fuxin", "authorids": "ratzlafn@oregonstate.edu;lif@oregonstate.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nratzlaff2019hypergan,\ntitle={Hyper{GAN}: Exploring the Manifold of Neural Networks},\nauthor={Neale Ratzlaff and Li Fuxin},\nyear={2019},\nurl={https://openreview.net/forum?id=B1GHJ3R9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1GHJ3R9tQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;5", "wc_review": "454;754;1061", "wc_reply_reviewers": "0;160;204", "wc_reply_authors": "284;425;445", "reply_reviewers": "0;1;1", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 756.3333333333334, 247.81220488282833 ], "wc_reply_reviewers_avg": [ 121.33333333333333, 87.65589287409921 ], "wc_reply_authors_avg": [ 384.6666666666667, 71.64883499091633 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gd1YoaS2EYAJ:scholar.google.com/&scioq=HyperGAN:+Exploring+the+Manifold+of+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1GHb2RqYX", "title": "PolyCNN: Learning Seed Convolutional Filters", "track": "main", "status": "Withdraw", "tldr": "PolyCNN 
only needs to learn one seed convolutional filter at each layer. This is an efficient variant of the traditional CNN, with on-par performance.", "abstract": "In this work, we propose the polynomial convolutional neural network (PolyCNN), as a new design of a weight-learning efficient variant of the traditional CNN. The biggest advantage of the PolyCNN is that at each convolutional layer, only one convolutional filter is needed for learning the weights, which we call the seed filter, and all the other convolutional filters are the polynomial transformations of the seed filter, which is termed early fan-out. Alternatively, we can also perform late fan-out on the seed filter response to create the number of response maps needed to be input into the next layer. Both early and late fan-out allow the PolyCNN to learn only one convolutional filter at each layer, which can dramatically reduce the model complexity by saving 10x to 50x parameters during learning. While being efficient during both training and testing, the PolyCNN does not suffer a loss in performance, since the non-linear polynomial expansion translates to richer representational power within the convolutional layers. By allowing direct control over model complexity, PolyCNN provides a flexible trade-off between performance and efficiency. We have verified the on-par performance between the proposed PolyCNN and the standard CNN on several visual datasets, such as MNIST, CIFAR-10, SVHN, and ImageNet.", "keywords": "Efficient CNN;Seed convolutional filter", "primary_area": "", "supplementary_material": "", "author": "Felix Juefei-Xu;Vishnu Naresh Boddeti;Marios Savvides", "authorids": "juefei.xu@gmail.com;vishnu@msu.edu;msavvide@ri.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1GHb2RqYX", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;2", "wc_review": "170;460;153", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 261.0, 140.88529613365145 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13430266183713186886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1GIB3A9YX", "title": "Explicit Recall for Efficient Exploration", "track": "main", "status": "Reject", "tldr": "We advocate the use of explicit memory for efficient exploration in reinforcement learning", "abstract": "In this paper, we advocate the use of explicit memory for efficient exploration in reinforcement learning. This memory records structured trajectories that have led to interesting states in the past, and can be used by the agent to revisit those states more effectively. In high-dimensional decision making problems, where deep reinforcement learning is considered crucial, our approach provides a simple, transparent and effective way that can be naturally combined with complex, deep learning models. 
We show how such explicit memory may be used to enhance existing exploration algorithms such as intrinsically motivated ones and count-based ones, and demonstrate our method's advantages in various simulated environments.", "keywords": "Exploration;goal-directed;deep reinforcement learning;explicit memory", "primary_area": "", "supplementary_material": "", "author": "Honghua Dong;Jiayuan Mao;Xinyue Cui;Lihong Li", "authorids": "dhh19951@gmail.com;maojiayuan@gmail.com;rogar2233cxy@gmail.com;lihongli.cs@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndong2019explicit,\ntitle={Explicit Recall for Efficient Exploration},\nauthor={Honghua Dong and Jiayuan Mao and Xinyue Cui and Lihong Li},\nyear={2019},\nurl={https://openreview.net/forum?id=B1GIB3A9YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1GIB3A9YX", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;4;3", "wc_review": "287;721;86", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "335;409;34", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 1.699673171197595 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 364.6666666666667, 264.9909851716134 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 259.3333333333333, 162.17343254128346 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9707253433941508, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10902578508329069573&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1GIQhCcYm", "title": "Unsupervised one-to-many image translation", "track": "main", "status": "Reject", "tldr": "We train an image to image translation network that take as input the source image and a sample from a prior distribution to generate a sample from the target distribution", "abstract": "We perform completely unsupervised one-sided image to image translation between a source domain $X$ and a target domain $Y$ such that we preserve relevant underlying shared semantics (e.g., class, size, shape, etc). \nIn particular, we are interested in a more difficult case than those typically addressed in the literature, where the source and target are ``far\" enough that reconstruction-style or pixel-wise approaches fail.\nWe argue that transferring (i.e., \\emph{translating}) said relevant information should involve both discarding source domain-specific information while incorporate target domain-specific information, the latter of which we model with a noisy prior distribution. \nIn order to avoid the degenerate case where the generated samples are only explained by the prior distribution, we propose to minimize an estimate of the mutual information between the generated sample and the sample from the prior distribution. We discover that the architectural choices are an important factor to consider in order to preserve the shared semantic between $X$ and $Y$. 
\nWe show state of the art results on the MNIST to SVHN task for unsupervised image to image translation.", "keywords": "Image-to-image;Translation;Unsupervised;Generation;Adversarial;Learning", "primary_area": "", "supplementary_material": "", "author": "Samuel Lavoie-Marchildon;Sebastien Lachapelle;Miko\u0142aj Bi\u0144kowski;Aaron Courville;Yoshua Bengio;R Devon Hjelm", "authorids": "samuel.lavoie-marchildon@umontreal.ca;sebastien.lachapelle@umontreal.ca;mikbinkowski@gmail.com;aaron.courville@gmail.com;yoshua.umontreal@gmail.com;devon.hjelm@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nlavoie-marchildon2019unsupervised,\ntitle={Unsupervised one-to-many image translation},\nauthor={Samuel Lavoie-Marchildon and Sebastien Lachapelle and Miko\u0142aj Bi\u0144kowski and Aaron Courville and Yoshua Bengio and R Devon Hjelm},\nyear={2019},\nurl={https://openreview.net/forum?id=B1GIQhCcYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1GIQhCcYm", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "wc_review": "461;249;100", "wc_reply_reviewers": "0;42;0", "wc_reply_authors": "242;175;140", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 270.0, 148.12382207689168 ], "wc_reply_reviewers_avg": [ 14.0, 19.79898987322333 ], "wc_reply_authors_avg": [ 185.66666666666666, 42.3188951756646 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3852471272841883360&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Initialized Equilibrium Propagation for Backprop-Free Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/968", "id": "B1GMDsR5tm", "author_site": "Peter OConnor, Efstratios Gavves, Max Welling", "tldr": "We train a feedforward network without backprop by using an energy-based model to provide local targets", "abstract": "Deep neural networks are almost universally trained with reverse-mode automatic differentiation (a.k.a. backpropagation). Biological networks, on the other hand, appear to lack any mechanism for sending gradients back to their input neurons, and thus cannot be learning in this way. In response to this, Scellier & Bengio (2017) proposed Equilibrium Propagation - a method for gradient-based training of neural networks which uses only local learning rules and, crucially, does not rely on neurons having a mechanism for back-propagating an error gradient. Equilibrium propagation, however, has a major practical limitation: inference involves doing an iterative optimization of neural activations to find a fixed-point, and the number of steps required to closely approximate this fixed point scales poorly with the depth of the network. In response to this problem, we propose Initialized Equilibrium Propagation, which trains a feedforward network to initialize the iterative inference procedure for Equilibrium propagation. 
This feed-forward network learns to approximate the state of the fixed-point using a local learning rule. After training, we can simply use this initializing network for inference, resulting in a learned feedforward network. Our experiments show that this network appears to work as well or better than the original version of Equilibrium propagation. This shows how we might go about training deep networks without using backpropagation.", "keywords": "credit assignment;energy-based models;biologically plausible learning", "primary_area": "", "supplementary_material": "", "author": "Peter O'Connor;Efstratios Gavves;Max Welling", "authorids": "peter.ed.oconnor@gmail.com;egavves@uva.nl;m.welling@uva.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\no'connor2018initialized,\ntitle={Initialized Equilibrium Propagation for Backprop-Free Training},\nauthor={Peter O'Connor and Efstratios Gavves and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1GMDsR5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;5;5", "wc_review": "855;892;430", "wc_reply_reviewers": "151;461;0", "wc_reply_authors": "991;1159;577", "reply_reviewers": "1;3;0", "reply_authors": "2;3;1", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 725.6666666666666, 209.612870682016 ], "wc_reply_reviewers_avg": [ 204.0, 191.89754210689273 ], "wc_reply_authors_avg": [ 909.0, 244.57309745759036 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15931102343166725964&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=B1GMDsR5tm", "pdf": "https://openreview.net/pdf?id=B1GMDsR5tm", "email": ";;", "author_num": 3 }, { "id": "B1GRtj05t7", "title": "NA", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "NA", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "NA;NA", "authorids": "v-ziclin@microsoft.com;lizo@microsoft.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1GRtj05t7", "pdf_size": 0, "rating": "2;4;6", "confidence": "5;3;2", "wc_review": "160;165;218", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 1.632993161855452 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 181.0, 26.242459234352765 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "B1GSBsRcFX", 
"title": "Stop memorizing: A data-dependent regularization framework for intrinsic pattern learning", "track": "main", "status": "Reject", "tldr": "we propose a new framework for data-dependent DNN regularization that can prevent DNNs from overfitting random data or random labels.", "abstract": "Deep neural networks (DNNs) typically have enough capacity to fit random data by brute force even when conventional data-dependent regularizations focusing on the geometry of the features are imposed. We find out that the reason for this is the inconsistency between the enforced geometry and the standard softmax cross entropy loss. To resolve this, we propose a new framework for data-dependent DNN regularization, the Geometrically-Regularized-Self-Validating neural Networks (GRSVNet). During training, the geometry enforced on one batch of features is simultaneously validated on a separate batch using a validation loss consistent with the geometry. We study a particular case of GRSVNet, the Orthogonal-Low-rank Embedding (OLE)-GRSVNet, which is capable of producing highly discriminative features residing in orthogonal low-rank subspaces. Numerical experiments show that OLE-GRSVNet outperforms DNNs with conventional regularization when trained on real data. More importantly, unlike conventional DNNs, OLE-GRSVNet refuses to memorize random data or random labels, suggesting it only learns intrinsic patterns by reducing the memorizing capacity of the baseline DNN.", "keywords": "deep neural networks;memorizing;data-dependent regularization", "primary_area": "", "supplementary_material": "", "author": "Wei Zhu;Qiang Qiu;Bao Wang;Jianfeng Lu;Guillermo Sapiro;Ingrid Daubechies", "authorids": "zhu@math.duke.edu;qiang.qiu@duke.edu;wangbao@math.ucla.edu;jianfeng@math.duke.edu;guillermo.sapiro@duke.edu;ingrid@math.duke.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nzhu2019stop,\ntitle={Stop memorizing: A data-dependent regularization framework for intrinsic pattern learning},\nauthor={Wei Zhu and Qiang Qiu and Bao Wang and Jianfeng Lu and Guillermo Sapiro and Ingrid Daubechies},\nyear={2019},\nurl={https://openreview.net/forum?id=B1GSBsRcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1GSBsRcFX", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;3;4", "wc_review": "365;99;284", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 249.33333333333334, 111.3263470861932 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2383279999121467116&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "B1M9FjC5FQ", "title": "Gradient Acceleration in Activation Functions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Dropout has been one of standard approaches to train deep neural networks, and it is known to regularize large models to avoid overfitting. 
The effect of dropout has been explained by avoiding co-adaptation.\nIn this paper, however, we propose a new explanation of why dropout works and propose a new technique to design better activation functions. First, we show that dropout can be explained as an optimization technique to push the input towards the saturation area of nonlinear activation function by accelerating gradient information flowing even in the saturation area in backpropagation. Based on this explanation, we propose a new technique for activation functions, {\\em gradient acceleration in activation function (GAAF)}, that accelerates gradients to flow even in the saturation area. Then, input to the activation function can climb onto the saturation area which makes the network more robust because the model converges on a flat region. \nExperiment results support our explanation of dropout and confirm that the proposed GAAF technique improves performances with expected properties.", "keywords": "Gradient Acceleration;Saturation Areas;Dropout;Coadaptation", "primary_area": "", "supplementary_material": "", "author": "Sangchul Hahn;Heeyoul Choi", "authorids": "s.hahn@handong.edu;hchoi@handong.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhahn2019gradient,\ntitle={Gradient Acceleration in Activation Functions},\nauthor={Sangchul Hahn and Heeyoul Choi},\nyear={2019},\nurl={https://openreview.net/forum?id=B1M9FjC5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1M9FjC5FQ", "pdf_size": 0, "rating": "2;3;5", "confidence": "5;4;3", "wc_review": "173;310;112", "wc_reply_reviewers": "20;0;0", "wc_reply_authors": "201;146;139", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 198.33333333333334, 82.79425637620575 ], "wc_reply_reviewers_avg": [ 6.666666666666667, 9.428090415820632 ], "wc_reply_authors_avg": [ 162.0, 27.72483844257107 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17524918709685192983&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1MAJhR5YX", "title": "Empirical Bounds on Linear Regions of Deep Rectifier Networks", "track": "main", "status": "Reject", "tldr": "We provide improved upper bounds for the number of linear regions used in network expressivity, and an highly efficient algorithm (w.r.t. exact counting) to obtain probabilistic lower bounds on the actual number of linear regions.", "abstract": "One form of characterizing the expressiveness of a piecewise linear neural network is by the number of linear regions, or pieces, of the function modeled. We have observed substantial progress in this topic through lower and upper bounds on the maximum number of linear regions and a counting procedure. However, these bounds only account for the dimensions of the network and the exact counting may take a prohibitive amount of time, therefore making it infeasible to benchmark the expressiveness of networks. 
In this work, we approximate the number of linear regions of specific rectifier networks with an algorithm for probabilistic lower bounds of mixed-integer linear sets. In addition, we present a tighter upper bound that leverages network coefficients. We test both on trained networks. The algorithm for probabilistic lower bounds is several orders of magnitude faster than exact counting and the values reach similar orders of magnitude, hence making our approach a viable method to compare the expressiveness of such networks. The refined upper bound is particularly stronger on networks with narrow layers. ", "keywords": "linear regions;approximate model counting;mixed-integer linear programming", "primary_area": "", "supplementary_material": "", "author": "Thiago Serra;Srikumar Ramalingam", "authorids": "tserra@gmail.com;srikumar.ramalingam@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nserra2019empirical,\ntitle={Empirical Bounds on Linear Regions of Deep Rectifier Networks},\nauthor={Thiago Serra and Srikumar Ramalingam},\nyear={2019},\nurl={https://openreview.net/forum?id=B1MAJhR5YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1MAJhR5YX", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "wc_review": "156;282;400", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "191;477;688", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 279.3333333333333, 99.63042819451405 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 452.0, 203.66803054644257 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14957069135265611655&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "B1MB5oRqtQ", "title": "On-Policy Trust Region Policy Optimisation with Replay Buffers", "track": "main", "status": "Reject", "tldr": "We investigate the theoretical and practical evidence of on-policy reinforcement learning improvement by reusing the data from several consecutive policies.", "abstract": "Building upon the recent success of deep reinforcement learning methods, we investigate the possibility of on-policy reinforcement learning improvement by reusing the data from several consecutive policies. On-policy methods bring many benefits, such as ability to evaluate each resulting policy. However, they usually discard all the information about the policies which existed before. In this work, we propose adaptation of the replay buffer concept, borrowed from the off-policy learning setting, to the on-policy algorithms. To achieve this, the proposed algorithm generalises the Q-, value and advantage functions for data from multiple policies. The method uses trust region optimisation, while avoiding some of the common problems of the algorithms such as TRPO or ACKTR: it uses hyperparameters to replace the trust region selection heuristics, as well as the trainable covariance matrix instead of the fixed one. 
In many cases, the method improves the results not only compared to state-of-the-art trust region on-policy learning algorithms such as ACKTR and TRPO, but also with respect to their off-policy counterpart DDPG. ", "keywords": "reinforcement learning;on-policy learning;trust region policy optimisation;replay buffer", "primary_area": "", "supplementary_material": "", "author": "Dmitry Kangin;Nicolas Pugeault", "authorids": "d.kangin@exeter.ac.uk;n.pugeault@exeter.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkangin2019onpolicy,\ntitle={On-Policy Trust Region Policy Optimisation with Replay Buffers},\nauthor={Dmitry Kangin and Nicolas Pugeault},\nyear={2019},\nurl={https://openreview.net/forum?id=B1MB5oRqtQ},\n}", "github": "[![github](/images/github_icon.svg) dkangin/baselines](https://github.com/dkangin/baselines) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1MB5oRqtQ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1MB5oRqtQ", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;5", "wc_review": "373;431;121", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "747;769;677", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 308.3333333333333, 134.56431754204218 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 731.0, 39.22584182228174 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5926318716272949146&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "B1MIBs05F7", "title": "On the Ineffectiveness of Variance Reduced Optimization for Deep Learning", "track": "main", "status": "Reject", "tldr": "The SVRG method fails on modern deep learning problems", "abstract": "The application of stochastic variance reduction to optimization has shown remarkable recent theoretical and practical success. The applicability of these techniques to the hard non-convex optimization problems encountered during training of modern deep neural networks is an open problem. 
We show that naive application of the SVRG technique and related approaches fail, and explore why.", "keywords": "machine learning;optimization;variance reduction", "primary_area": "", "supplementary_material": "", "author": "Aaron Defazio", "authorids": "aaron.defazio@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ndefazio2019on,\ntitle={On the Ineffectiveness of Variance Reduced Optimization for Deep Learning},\nauthor={Aaron Defazio},\nyear={2019},\nurl={https://openreview.net/forum?id=B1MIBs05F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1MIBs05F7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;3", "wc_review": "880;509;203", "wc_reply_reviewers": "0;596;50", "wc_reply_authors": "398;670;129", "reply_reviewers": "0;2;1", "reply_authors": "1;3;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 530.6666666666666, 276.8083974320304 ], "wc_reply_reviewers_avg": [ 215.33333333333334, 269.9448503346975 ], "wc_reply_authors_avg": [ 399.0, 220.86345706491753 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 140, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=851481462151708992&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "B1MUroRct7", "title": "Online Learning for Supervised Dimension Reduction", "track": "main", "status": "Reject", "tldr": "We proposed two new approaches, the incremental sliced inverse regression and incremental overlapping sliced inverse regression, to implement supervised dimension reduction in an online learning manner.", "abstract": " Online learning has attracted great attention due to the increasing demand for systems that have the ability of learning and evolving. When the data to be processed is also high dimensional and dimension reduction is necessary for visualization or prediction enhancement, online dimension reduction will play an essential role. The purpose of this paper is to propose new online learning approaches for supervised dimension reduction. Our first algorithm is motivated by adapting the sliced inverse regression (SIR), a pioneer and effective algorithm for supervised dimension reduction, and making it implementable in an incremental manner. The new algorithm, called incremental sliced inverse regression (ISIR), is able to update the subspace of significant factors with intrinsic lower dimensionality fast and efficiently when new observations come in. We also refine the algorithm by using an overlapping technique and develop an incremental overlapping sliced inverse regression (IOSIR) algorithm. 
We verify the effectiveness and efficiency of both algorithms by simulations and real data applications.", "keywords": "Online Learning;Supervised Dimension Reduction;Incremental Sliced Inverse Regression;Effective Dimension Reduction Space", "primary_area": "", "supplementary_material": "", "author": "Ning Zhang;Qiang Wu", "authorids": "ningzhang0123@gmail.com;qwu@mtsu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhang2019online,\ntitle={Online Learning for Supervised Dimension Reduction},\nauthor={Ning Zhang and Qiang Wu},\nyear={2019},\nurl={https://openreview.net/forum?id=B1MUroRct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1MUroRct7", "pdf_size": 0, "rating": "2;5;6", "confidence": "5;4;5", "wc_review": "357;228;171", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.699673171197595 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 252.0, 77.80745465570763 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.2773500981126146, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3782471342973614126&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "B1MX5j0cFX", "title": "Universal Attacks on Equivariant Networks", "track": "main", "status": "Reject", "tldr": "Universal attacks on equivariant networks using a small sample of test data", "abstract": "Adversarial attacks on neural networks perturb the input at test time in order to fool trained and deployed neural network models. Most attacks such as gradient-based Fast Gradient Sign Method (FGSM) by Goodfellow et al. 2015 and DeepFool by Moosavi-Dezfooli et al. 2016 are input-dependent, small, pixel-wise perturbations, and they give different attack directions for different inputs. On the other hand, universal adversarial attacks are input-agnostic and the same attack works for most inputs. Translation or rotation-equivariant neural network models provide one approach to prevent universal attacks based on simple geometric transformations. In this paper, we observe an interesting spectral property shared by all of the above input-dependent, pixel-wise adversarial attacks on translation and rotation-equivariant networks. We exploit this property to get a single universal attack direction that fools the model on most inputs. Moreover, we show how to compute this universal attack direction using principal components of the existing input-dependent attacks on a very small sample of test inputs. We complement our empirical results by a theoretical justification, using matrix concentration inequalities and spectral perturbation bounds. 
We also empirically observe that the top few principal adversarial attack directions are nearly orthogonal to the top few principal invariant directions.\n", "keywords": "adversarial;equivariance;universal;rotation;translation;CNN;GCNN", "primary_area": "", "supplementary_material": "", "author": "Amit Deshpande;Sandesh Kamath;K V Subrahmanyam", "authorids": "amitdesh@microsoft.com;ksandeshk@cmi.ac.in;kv@cmi.ac.in", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndeshpande2019universal,\ntitle={Universal Attacks on Equivariant Networks},\nauthor={Amit Deshpande and Sandesh Kamath and K V Subrahmanyam},\nyear={2019},\nurl={https://openreview.net/forum?id=B1MX5j0cFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1MX5j0cFX", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;5;4", "wc_review": "149;377;174", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "135;119;119", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 233.33333333333334, 102.0990804181028 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 124.33333333333333, 7.542472332656507 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AicDG9IaEygJ:scholar.google.com/&scioq=Universal+Attacks+on+Equivariant+Networks&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "title": "Explaining Image Classifiers by Counterfactual Generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1104", "id": "B1MXz20cYQ", "author_site": "Chun-Hao Chang, Elliot Creager, Anna Goldenberg, David Duvenaud", "tldr": "We compute saliency by using a strong generative model to efficiently marginalize over plausible alternative inputs, revealing concentrated pixel areas that preserve label information.", "abstract": "When an image classifier makes a prediction, which parts of the image are relevant and why? We can rephrase this question to ask: which parts of the image, if they were not seen by the classifier, would most change its decision? Producing an answer requires marginalizing over images that could have been seen but weren't. We can sample plausible image in-fills by conditioning a generative model on the rest of the image. We then optimize to find the image regions that most change the classifier's decision after in-fill. Our approach contrasts with ad-hoc in-filling approaches, such as blurring or injecting noise, which generate inputs far from the data distribution, and ignore informative relationships between different parts of the image. 
Our method produces more compact and relevant saliency maps, with fewer artifacts compared to previous methods.", "keywords": "Explainability;Interpretability;Generative Models;Saliency Map;Machine Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Chun-Hao Chang;Elliot Creager;Anna Goldenberg;David Duvenaud", "authorids": "kingsley@cs.toronto.edu;creager@cs.toronto.edu;anna.goldenberg@utoronto.ca;duvenaud@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchang2018explaining,\ntitle={Explaining Image Classifiers by Counterfactual Generation},\nauthor={Chun-Hao Chang and Elliot Creager and Anna Goldenberg and David Duvenaud},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1MXz20cYQ},\n}", "github": "[![github](/images/github_icon.svg) zzzace2000/FIDO-saliency](https://github.com/zzzace2000/FIDO-saliency)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;5;7", "confidence": "5;4;3", "wc_review": "412;217;376", "wc_reply_reviewers": "0;20;0", "wc_reply_authors": "293;532;370", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 335.0, 84.72307831990054 ], "wc_reply_reviewers_avg": [ 6.666666666666667, 9.428090415820632 ], "wc_reply_authors_avg": [ 398.3333333333333, 99.60700555460724 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 308, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6313449476805696850&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1MXz20cYQ", "pdf": "https://openreview.net/pdf?id=B1MXz20cYQ", "email": ";;;", "author_num": 4 }, { "id": "B1MbDj0ctQ", "title": "Switching Linear Dynamics for Variational Bayes Filtering", "track": "main", "status": "Reject", "tldr": "A recurrent variational autoencoder with a latent transition function modeled by switching linear dynamical systems.", "abstract": "System identification of complex and nonlinear systems is a central problem for model predictive control and model-based reinforcement learning. Despite their complexity, such systems can often be approximated well by a set of linear dynamical systems if broken into appropriate subsequences. This mechanism not only helps us find good approximations of dynamics, but also gives us deeper insight into the underlying system. Leveraging Bayesian inference and Variational Autoencoders, we show how to learn a richer and more meaningful state space, e.g. encoding joint constraints and collisions with walls in a maze, from partial and high-dimensional observations. 
This representation translates into a gain of accuracy of the learned dynamics which we showcase on various simulated tasks.", "keywords": "sequence model;switching linear dynamical systems;variational bayes;filter;variational inference;stochastic recurrent neural network", "primary_area": "", "supplementary_material": "", "author": "Philip Becker-Ehmck;Jan Peters;Patrick van der Smagt", "authorids": "philip.becker-ehmck@volkswagen.de;peters@ias.tu-darmstadt.de;smagt@volkswagen.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbecker-ehmck2019switching,\ntitle={Switching Linear Dynamics for Variational Bayes Filtering},\nauthor={Philip Becker-Ehmck and Jan Peters and Patrick van der Smagt},\nyear={2019},\nurl={https://openreview.net/forum?id=B1MbDj0ctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1MbDj0ctQ", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;3;5", "wc_review": "600;648;593", "wc_reply_reviewers": "0;14;0", "wc_reply_authors": "766;655;410", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 613.6666666666666, 24.44494948973214 ], "wc_reply_reviewers_avg": [ 4.666666666666667, 6.599663291074443 ], "wc_reply_authors_avg": [ 610.3333333333334, 148.72868661499777 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.3273268353539886, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8387734584820293660&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "B1MhpiRqFm", "title": "A Convergent Variant of the Boltzmann Softmax Operator in Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The Boltzmann softmax operator can trade-off well between exploration and exploitation according to current estimation in an exponential weighting scheme, which is a promising way to address the exploration-exploitation dilemma in reinforcement learning. Unfortunately, the Boltzmann softmax operator is not a non-expansion, which may lead to unstable or even divergent learning behavior when used in estimating the value function. The non-expansion is a vital and widely-used sufficient condition to guarantee the convergence of value iteration. However, how to characterize the effect of such non-expansive operators in value iteration remains an open problem. In this paper, we propose a new technique to analyze the error bound of value iteration with the the Boltzmann softmax operator. We then propose the dynamic Boltzmann softmax(DBS) operator to enable the convergence to the optimal value function in value iteration. We also present convergence rate analysis of the algorithm.\nUsing Q-learning as an application, we show that the DBS operator can be applied in a model-free reinforcement learning algorithm. Finally, we demonstrate the effectiveness of the DBS operator in a toy problem called GridWorld and a suite of Atari games. 
Experimental results show that it outperforms DQN substantially in benchmark games.", "keywords": "Reinforcement Learning;Boltzmann Softmax Operator;Value Function Estimation", "primary_area": "", "supplementary_material": "", "author": "Ling Pan;Qingpeng Cai;Qi Meng;Wei Chen;Tie-Yan Liu", "authorids": "v-lip@microsoft.com;cqp14@mails.tsinghua.edu.cn;v-qimeng@microsoft.com;wche@microsoft.com;tie-yan.liu@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\npan2019a,\ntitle={A Convergent Variant of the Boltzmann Softmax Operator in Reinforcement Learning},\nauthor={Ling Pan and Qingpeng Cai and Qi Meng and Wei Chen and Tie-Yan Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=B1MhpiRqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1MhpiRqFm", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "wc_review": "146;667;333", "wc_reply_reviewers": "67;0;0", "wc_reply_authors": "389;965;168", "reply_reviewers": "1;0;0", "reply_authors": "2;2;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 382.0, 215.50096674183777 ], "wc_reply_reviewers_avg": [ 22.333333333333332, 31.584102892999123 ], "wc_reply_authors_avg": [ 507.3333333333333, 335.9606458434612 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CexnzBS0sKIJ:scholar.google.com/&scioq=A+Convergent+Variant+of+the+Boltzmann+Softmax+Operator+in+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "B1VWtsA5tQ", "title": "PPO-CMA: Proximal Policy Optimization with Covariance Matrix Adaptation", "track": "main", "status": "Reject", "tldr": "We propose a new continuous control reinforcement learning method with a variance adaptation strategy inspired by the Covariance Matrix Adaptation Evolution Strategy (CMA-ES) optimization method", "abstract": "Proximal Policy Optimization (PPO) is a highly popular model-free reinforcement learning (RL) approach. However, in continuous state and action spaces and a Gaussian policy -- common in computer animation and robotics -- PPO is prone to getting stuck in local optima. In this paper, we observe a tendency of PPO to prematurely shrink the exploration variance, which naturally leads to slow progress. Motivated by this, we borrow ideas from CMA-ES, a black-box optimization method designed for intelligent adaptive Gaussian exploration, to derive PPO-CMA, a novel proximal policy optimization approach that expands the exploration variance on objective function slopes and only shrinks the variance when close to the optimum. This is implemented by using separate neural networks for policy mean and variance and training the mean and variance in separate passes. 
Our experiments demonstrate a clear improvement over vanilla PPO in many difficult OpenAI Gym MuJoCo tasks.", "keywords": "Continuous Control;Reinforcement Learning;Policy Optimization;Policy Gradient;Evolution Strategies;CMA-ES;PPO", "primary_area": "", "supplementary_material": "", "author": "Perttu H\u00e4m\u00e4l\u00e4inen;Amin Babadi;Xiaoxiao Ma;Jaakko Lehtinen", "authorids": "perttu.hamalainen@aalto.fi;amin.babadi@aalto.fi;xiaoxiao.ma@aalto.fi;jaakko.lehtinen@aalto.fi", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nh\u00e4m\u00e4l\u00e4inen2019ppocma,\ntitle={{PPO}-{CMA}: Proximal Policy Optimization with Covariance Matrix Adaptation},\nauthor={Perttu H\u00e4m\u00e4l\u00e4inen and Amin Babadi and Xiaoxiao Ma and Jaakko Lehtinen},\nyear={2019},\nurl={https://openreview.net/forum?id=B1VWtsA5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1VWtsA5tQ", "pdf_size": 0, "rating": "4;4;9", "confidence": "2;4;3", "wc_review": "135;560;306", "wc_reply_reviewers": "0;113;0", "wc_reply_authors": "156;804;0", "reply_reviewers": "0;1;0", "reply_authors": "1;2;0", "rating_avg": [ 5.666666666666667, 2.357022603955158 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 333.6666666666667, 174.60495092891523 ], "wc_reply_reviewers_avg": [ 37.666666666666664, 53.268710849386586 ], "wc_reply_authors_avg": [ 320.0, 348.11492355255325 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6411214467031627925&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "title": "SNIP: SINGLE-SHOT NETWORK PRUNING BASED ON CONNECTION SENSITIVITY", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/902", "id": "B1VZqjAcYX", "author_site": "Namhoon Lee, Thalaiyasingam Ajanthan, Philip H.S Torr", "tldr": "We present a new approach, SNIP, that is simple, versatile and interpretable; it prunes irrelevant connections for a given task at single-shot prior to training and is applicable to a variety of neural network models without modifications.", "abstract": "Pruning large neural networks while maintaining their performance is often desirable due to the reduced space and time complexity. In existing methods, pruning is done within an iterative optimization procedure with either heuristically designed pruning schedules or additional hyperparameters, undermining their utility. In this work, we present a new approach that prunes a given network once at initialization prior to training. To achieve this, we introduce a saliency criterion based on connection sensitivity that identifies structurally important connections in the network for the given task. This eliminates the need for both pretraining and the complex pruning schedule while making it robust to architecture variations. After pruning, the sparse network is trained in the standard way. 
Our method obtains extremely sparse networks with virtually the same accuracy as the reference network on the MNIST, CIFAR-10, and Tiny-ImageNet classification tasks and is broadly applicable to various architectures including convolutional, residual and recurrent networks. Unlike existing methods, our approach enables us to demonstrate that the retained connections are indeed relevant to the given task.", "keywords": "neural network pruning;connection sensitivity", "primary_area": "", "supplementary_material": "", "author": "Namhoon Lee;Thalaiyasingam Ajanthan;Philip Torr", "authorids": "namhoon@robots.ox.ac.uk;ajanthan@robots.ox.ac.uk;phst@robots.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlee2018snip,\ntitle={{SNIP}: {SINGLE}-{SHOT} {NETWORK} {PRUNING} {BASED} {ON} {CONNECTION} {SENSITIVITY}},\nauthor={Namhoon Lee and Thalaiyasingam Ajanthan and Philip Torr},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1VZqjAcYX},\n}", "github": "[![github](/images/github_icon.svg) namhoonlee/snip-public](https://github.com/namhoonlee/snip-public) + [![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=B1VZqjAcYX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;5;4", "wc_review": "581;1220;437", "wc_reply_reviewers": "289;17;0", "wc_reply_authors": "794;1523;660", "reply_reviewers": "3;1;0", "reply_authors": "3;5;1", "rating_avg": [ 8.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 746.0, 340.2851745227817 ], "wc_reply_reviewers_avg": [ 102.0, 132.41097638287647 ], "wc_reply_authors_avg": [ 992.3333333333334, 379.2047233648629 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1524, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9820036975414969048&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=B1VZqjAcYX", "pdf": "https://openreview.net/pdf?id=B1VZqjAcYX", "email": ";;", "author_num": 3 }, { "id": "B1e0KsRcYQ", "title": "Efficient Codebook and Factorization for Second Order Representation Learning", "track": "main", "status": "Reject", "tldr": "We propose a joint codebook and factorization scheme to improve second order pooling.", "abstract": "Learning rich and compact representations is an open topic in many fields such as word embedding, visual question-answering, object recognition or image retrieval. Although deep neural networks (convolutional or not) have made a major breakthrough during the last few years by providing hierarchical, semantic and abstract representations for all of these tasks, these representations are not necessary as rich as needed nor as compact as expected. Models using higher order statistics, such as bilinear pooling, provide richer representations at the cost of higher dimensional features. Factorization schemes have been proposed but without being able to reach the original compactness of first order models, or at a heavy loss in performances. 
This paper addresses these two points by extending factorization schemes to codebook strategies, allowing compact representations with the same dimensionality as first order representations, but with second order performances. Moreover, we extend this framework with a joint codebook and factorization scheme, granting a reduction both in terms of parameters and computation cost. This formulation leads to state-of-the-art results and compact second-order models with few additional parameters and intermediate representations with a dimension similar to that of first-order statistics.", "keywords": "Second order pooling", "primary_area": "", "supplementary_material": "", "author": "Pierre jacob;David Picard;Aymeric Histace;Edouard Klein", "authorids": "pierre.jacob@ensea.fr;picard@ensea.fr;aymeric.histace@ensea.fr;edouard.klein@gendarmerie.interieur.gouv.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\njacob2019efficient,\ntitle={Efficient Codebook and Factorization for Second Order Representation Learning},\nauthor={Pierre jacob and David Picard and Aymeric Histace and Edouard Klein},\nyear={2019},\nurl={https://openreview.net/forum?id=B1e0KsRcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1e0KsRcYQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;2;4", "wc_review": "373;388;645", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "499;693;400", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 468.6666666666667, 124.8367823288562 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 530.6666666666666, 121.6945173602967 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.32732683535398854, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2454245634313097444&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5 }, { "title": "Diagnosing and Enhancing VAE Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/899", "id": "B1e0X3C9tQ", "author_site": "Bin Dai, David Wipf", "tldr": "We closely analyze the VAE objective function and draw novel conclusions that lead to simple enhancements.", "abstract": "Although variational autoencoders (VAEs) represent a widely influential deep generative model, many aspects of the underlying energy function remain poorly understood. In particular, it is commonly believed that Gaussian encoder/decoder assumptions reduce the effectiveness of VAEs in generating realistic samples. In this regard, we rigorously analyze the VAE objective, differentiating situations where this belief is and is not actually true. We then leverage the corresponding insights to develop a simple VAE enhancement that requires no additional hyperparameters or sensitive tuning. Quantitatively, this proposal produces crisp samples and stable FID scores that are actually competitive with a variety of GAN models, all while retaining desirable attributes of the original VAE architecture. 
The code for our model is available at \\url{https://github.com/daib13/TwoStageVAE}.", "keywords": "variational autoencoder;generative models", "primary_area": "", "supplementary_material": "", "author": "Bin Dai;David Wipf", "authorids": "v-bindai@microsoft.com;davidwipf@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ndai2018diagnosing,\ntitle={Diagnosing and Enhancing {VAE} Models},\nauthor={Bin Dai and David Wipf},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1e0X3C9tQ},\n}", "github": "[![github](/images/github_icon.svg) daib13/TwoStageVAE](https://github.com/daib13/TwoStageVAE) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=B1e0X3C9tQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;9", "confidence": "3;4;4", "wc_review": "276;830;831", "wc_reply_reviewers": "122;0;0", "wc_reply_authors": "2162;1104;339", "reply_reviewers": "1;0;0", "reply_authors": "5;3;1", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 645.6666666666666, 261.39412558221136 ], "wc_reply_reviewers_avg": [ 40.666666666666664, 57.51135153650587 ], "wc_reply_authors_avg": [ 1201.6666666666667, 747.4339807337873 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 505, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15377413262741867924&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1e0X3C9tQ", "pdf": "https://openreview.net/pdf?id=B1e0X3C9tQ", "email": ";", "author_num": 2 }, { "id": "B1e4wo09K7", "title": "Invariant-equivariant representation learning for multi-class data", "track": "main", "status": "Reject", "tldr": "This paper presents a novel latent-variable generative modelling technique that enables the representation of global information into one latent variable and local information into another latent variable.", "abstract": "Representations learnt through deep neural networks tend to be highly informative, but opaque in terms of what information they learn to encode. We introduce an approach to probabilistic modelling that learns to represent data with two separate deep representations: an invariant representation that encodes the information of the class from which the data belongs, and an equivariant representation that encodes the symmetry transformation defining the particular data point within the class manifold (equivariant in the sense that the representation varies naturally with symmetry transformations). This approach to representation learning is conceptually transparent, easy to implement, and in-principle generally applicable to any data comprised of discrete classes of continuous distributions (e.g. objects in images, topics in language, individuals in behavioural data). 
We demonstrate qualitatively compelling representation learning and competitive quantitative performance, in both supervised and semi-supervised settings, versus comparable modelling approaches in the literature with little fine tuning.", "keywords": "representation learning;semantic representations;local vs global information;latent variable modelling;generative modelling;semi-supervised learning;variational autoencoders.", "primary_area": "", "supplementary_material": "", "author": "Ilya Feige", "authorids": "ilya@asidatascience.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nfeige2019invariantequivariant,\ntitle={Invariant-equivariant representation learning for multi-class data},\nauthor={Ilya Feige},\nyear={2019},\nurl={https://openreview.net/forum?id=B1e4wo09K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1e4wo09K7", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;5;2", "wc_review": "327;559;164", "wc_reply_reviewers": "0;9;0", "wc_reply_authors": "368;866;79", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 350.0, 162.07611380665156 ], "wc_reply_reviewers_avg": [ 3.0, 4.242640687119285 ], "wc_reply_authors_avg": [ 437.6666666666667, 325.04597965347745 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13745121487961455933&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "B1e7hs05Km", "title": "Efficient Exploration through Bayesian Deep Q-Networks", "track": "main", "status": "Reject", "tldr": "Using Bayesian regression for the last layer of DQN, and do Thompson Sampling for exploration. With Bayesian Regret bound", "abstract": "We propose Bayesian Deep Q-Networks (BDQN), a principled and a practical Deep Reinforcement Learning (DRL) algorithm for Markov decision processes (MDP). It combines Thompson sampling with deep-Q networks (DQN). Thompson sampling ensures more efficient exploration-exploitation tradeoff in high dimensions. It is typically carried out through posterior sampling over the model parameters, which makes it computationally expensive. To overcome this limitation, we directly incorporate uncertainty over the value (Q) function. Further, we only introduce randomness in the last layer (i.e. the output layer) of the DQN and use independent Gaussian priors on the weights. This allows us to efficiently carry out Thompson sampling through Gaussian sampling and Bayesian Linear Regression (BLR), which has fast closed-form updates. The rest of the layers of the Q network are trained through back propagation, as in a standard DQN. We apply our method to a wide range of Atari games in Arcade Learning Environments and compare BDQN to a powerful baseline: the double deep Q-network (DDQN). Since BDQN carries out more efficient exploration, it is able to reach higher rewards substantially faster: in less than 5M\u00b11M samples for almost half of the games to reach DDQN scores while a typical run of DDQN is 50-200M. 
We also establish theoretical guarantees for the special case when the feature representation is fixed and not learnt. We show that the Bayesian regret is bounded by \\tilde{O}(d\\sqrt{N}) after N time steps for a d-dimensional feature map, and this bound is shown to be tight up to logarithmic factors. To the best of our knowledge, this is the first Bayesian theoretical guarantee for Markov Decision Processes (MDP) beyond the tabula rasa setting.", "keywords": "Deep RL;Exploration Exploitation;DQN;Bayesian Regret;Thompson Sampling", "primary_area": "", "supplementary_material": "", "author": "Kamyar Azizzadenesheli;Animashree Anandkumar", "authorids": "kazizzad@uci.edu;anima@caltech.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nazizzadenesheli2019efficient,\ntitle={Efficient Exploration through Bayesian Deep Q-Networks},\nauthor={Kamyar Azizzadenesheli and Animashree Anandkumar},\nyear={2019},\nurl={https://openreview.net/forum?id=B1e7hs05Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1e7hs05Km", "pdf_size": 0, "rating": "2;4;4;6", "confidence": "5;4;2;2", "wc_review": "430;631;1931;672", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "743;376;0;744", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;0;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.25, 1.299038105676658 ], "wc_review_avg": [ 916.0, 593.1235115892811 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 465.75, 307.92399630428287 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.7071067811865476 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8164965809277259, "gs_citation": 220, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13679807550374293657&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13 }, { "id": "B1e8CsRctX", "title": "Generative Ensembles for Robust Anomaly Detection", "track": "main", "status": "Reject", "tldr": "We use generative models to perform out-of-distribution detection, and improve their robustness with uncertainty estimation.", "abstract": "Deep generative models are capable of learning probability distributions over large, high-dimensional datasets such as images, video and natural language. Generative models trained on samples from p(x) ought to assign low likelihoods to out-of-distribution (OoD) samples from q(x), making them suitable for anomaly detection applications. We show that in practice, likelihood models are themselves susceptible to OoD errors, and even assign large likelihoods to images from other natural datasets. To mitigate these issues, we propose Generative Ensembles, a model-independent technique for OoD detection that combines density-based anomaly detection with uncertainty estimation. 
Our method outperforms ODIN and VIB baselines on image datasets, and achieves comparable performance to a classification model on the Kaggle Credit Fraud dataset.", "keywords": "Anomaly Detection;Uncertainty;Out-of-Distribution;Generative Models", "primary_area": "", "supplementary_material": "", "author": "Hyunsun Choi;Eric Jang", "authorids": "hyunsunchoi@kaist.ac.kr;ejang@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nchoi2019generative,\ntitle={Generative Ensembles for Robust Anomaly Detection},\nauthor={Hyunsun Choi and Eric Jang},\nyear={2019},\nurl={https://openreview.net/forum?id=B1e8CsRctX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1e8CsRctX", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;3", "wc_review": "225;498;233", "wc_reply_reviewers": "0;207;0", "wc_reply_authors": "460;1100;87", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 318.6666666666667, 126.84986751624493 ], "wc_reply_reviewers_avg": [ 69.0, 97.58073580374356 ], "wc_reply_authors_avg": [ 549.0, 418.31646712347657 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17598844101613428995&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "B1e9W3AqFX", "title": "Multi-task Learning with Gradient Communication", "track": "main", "status": "Reject", "tldr": "We introduce an inductive bias for multi-task learning, allowing different tasks to communicate by gradient passing.", "abstract": " In this paper, we describe a general framework to systematically analyze current neural models for multi-task learning, in which we find that existing models expect to disentangle features into different spaces while features learned in practice are still entangled in shared space, leaving potential hazards for other training or unseen tasks. We propose to alleviate this problem by incorporating a new inductive bias into the process of multi-task learning, that different tasks can communicate with each other not only by passing hidden variables but gradients explicitly. Experimentally, we evaluate proposed methods on three groups of tasks and two types of settings (\\textsc{in-task} and \\textsc{out-of-task}). 
Quantitative and qualitative results show their effectiveness.", "keywords": "Pretend to share;Gradient Communication", "primary_area": "", "supplementary_material": "", "author": "Pengfei Liu;Xuanjing Huang", "authorids": "pfliu14@fudan.edu.cn;xjhuang@fudan.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nliu2019multitask,\ntitle={Multi-task Learning with Gradient Communication},\nauthor={Pengfei Liu and Xuanjing Huang},\nyear={2019},\nurl={https://openreview.net/forum?id=B1e9W3AqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1e9W3AqFX", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;3", "wc_review": "484;425;247", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "370;172;118", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 385.3333333333333, 100.73838504871031 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 220.0, 108.33282051160673 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Lj2y2ovUhYcJ:scholar.google.com/&scioq=Multi-task+Learning+with+Gradient+Communication&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "B1e9csRcFm", "title": "The Importance of Norm Regularization in Linear Graph Embedding: Theoretical Analysis and Empirical Demonstration", "track": "main", "status": "Reject", "tldr": "We argue that the generalization of linear graph embedding is not due to the dimensionality constraint but rather the small norm of embedding vectors.", "abstract": "Learning distributed representations for nodes in graphs is a crucial primitive in network analysis with a wide spectrum of applications. Linear graph embedding methods learn such representations by optimizing the likelihood of both positive and negative edges while constraining the dimension of the embedding vectors. We argue that the generalization performance of these methods is not due to the dimensionality constraint as commonly believed, but rather the small norm of embedding vectors. Both theoretical and empirical evidence are provided to support this argument: (a) we prove that the generalization error of these methods can be bounded by limiting the norm of vectors, regardless of the embedding dimension; (b) we show that the generalization performance of linear graph embedding methods is correlated with the norm of embedding vectors, which is small due to the early stopping of SGD and the vanishing gradients. 
We performed extensive experiments to validate our analysis and showcased the importance of proper norm regularization in practice.", "keywords": "Graph Embedding;Generalization Analysis;Matrix Factorization", "primary_area": "", "supplementary_material": "", "author": "Yihan Gao;Chao Zhang;Jian Peng;Aditya Parameswaran", "authorids": "gaoyihan@gmail.com;czhang82@illinois.edu;jianpeng@illinois.edu;adityagp@illinois.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngao2019the,\ntitle={The Importance of Norm Regularization in Linear Graph Embedding: Theoretical Analysis and Empirical Demonstration},\nauthor={Yihan Gao and Chao Zhang and Jian Peng and Aditya Parameswaran},\nyear={2019},\nurl={https://openreview.net/forum?id=B1e9csRcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1e9csRcFm", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;3;3", "wc_review": "348;486;184", "wc_reply_reviewers": "0;352;0", "wc_reply_authors": "606;683;85", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 339.3333333333333, 123.44319431310187 ], "wc_reply_reviewers_avg": [ 117.33333333333333, 165.93439131844315 ], "wc_reply_authors_avg": [ 458.0, 265.61751950251073 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=502024832176536824&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1eCCoR5tm", "title": "Pseudosaccades: A simple ensemble scheme for improving classification performance of deep nets", "track": "main", "status": "Reject", "tldr": "Inspired by saccades we describe a simple, cheap, effective way to improve deep net performance on an image labelling task.", "abstract": "We describe a simple ensemble approach that, unlike conventional ensembles,\nuses multiple random data sketches (\u2018pseudosaccades\u2019) rather than multiple classifiers\nto improve classification performance. Using this simple, but novel, approach\nwe obtain statistically significant improvements in classification performance on\nAlexNet, GoogLeNet, ResNet-50 and ResNet-152 baselines on Imagenet data \u2013\ne.g. 
of the order of 0.3% to 0.6% in Top-1 accuracy and similar improvements in\nTop-k accuracy \u2013 essentially nearly for free.", "keywords": "Ensemble classification;random subspace;data sketching", "primary_area": "", "supplementary_material": "", "author": "Jin Sean Lim;Robert John Durrant", "authorids": "me@nicklim.com;bobd@waikato.ac.nz", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlim2019pseudosaccades,\ntitle={Pseudosaccades: A simple ensemble scheme for improving classification performance of deep nets},\nauthor={Jin Sean Lim and Robert John Durrant},\nyear={2019},\nurl={https://openreview.net/forum?id=B1eCCoR5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1eCCoR5tm", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;4", "wc_review": "316;288;98", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 234.0, 96.84351638941384 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2ZzrM4LsUuYJ:scholar.google.com/&scioq=Pseudosaccades:+A+simple+ensemble+scheme+for+improving+classification+performance+of+deep+nets&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1eEKi0qYQ", "title": "Interactive Parallel Exploration for Reinforcement Learning in Continuous Action Spaces", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, a new interactive parallel learning scheme is proposed to enhance the performance of off-policy continuous-action reinforcement learning. In the proposed interactive parallel learning scheme, multiple identical learners with their own value-functions and policies share a common experience replay buffer, and search a good policy in collaboration with the guidance of the best policy information. The information of the best policy is fused in a soft manner by constructing an augmented loss function for policy update to enlarge the overall search space by the multiple learners. The guidance by the previous best policy and the enlarged search space by the proposed interactive parallel learning scheme enable faster and better policy search in the policy parameter space. 
Working algorithms are constructed by applying the proposed interactive parallel learning scheme to several off-policy reinforcement learning algorithms such as the twin delayed deep deterministic (TD3) policy gradient algorithm and the soft actor-critic (SAC) algorithm, and numerical results show that the constructed IPE-enhanced algorithms outperform most of the current state-of-the-art reinforcement learning algorithms for continuous action control.", "keywords": "reinforcement learning;continuous action space RL", "primary_area": "", "supplementary_material": "", "author": "Whiyoung Jung;Giseung Park;Youngchul Sung", "authorids": "wy.jung@kaist.ac.kr;gs.park@kaist.ac.kr;ycsung@kaist.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\njung2019interactive,\ntitle={Interactive Parallel Exploration for Reinforcement Learning in Continuous Action Spaces},\nauthor={Whiyoung Jung and Giseung Park and Youngchul Sung},\nyear={2019},\nurl={https://openreview.net/forum?id=B1eEKi0qYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eEKi0qYQ", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;4", "wc_review": "902;804;238", "wc_reply_reviewers": "0;0;63", "wc_reply_authors": "772;475;270", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 648.0, 292.6613515083033 ], "wc_reply_reviewers_avg": [ 21.0, 29.698484809834994 ], "wc_reply_authors_avg": [ 505.6666666666667, 206.08466437095433 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rET6fyfgQ48J:scholar.google.com/&scioq=Interactive+Parallel+Exploration+for+Reinforcement+Learning+in+Continuous+Action+Spaces&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1eKk2CcKm", "title": "Towards the Latent Transcriptome", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work we propose a method to compute continuous embeddings for kmers from raw RNA-seq data, in a reference-free fashion. We report that our model captures information of both DNA sequence similarity as well as DNA sequence abundance in the embedding latent space. We confirm the quality of these vectors by comparing them to known gene sub-structures and report that the latent space recovers exon information from raw RNA-Seq data from acute myeloid leukemia patients. 
Furthermore we show that this latent space allows the detection of genomic abnormalities such as translocations as well as patient-specific mutations, making this representation space both useful for visualization as well as analysis.", "keywords": "representation learning;RNA-Seq;gene expression;bioinformatics;computational biology;transcriptomics;deep learning;genomics", "primary_area": "", "supplementary_material": "", "author": "Assya Trofimov;Francis Dutil;Claude Perreault;Sebastien Lemieux;Yoshua Bengio;Joseph Paul Cohen", "authorids": "trofimov.assya@gmail.com;frdutil@gmail.com;claude.perreault@umontreal.ca;s.lemieux@umontreal.ca;yoshua.bengio@mila.quebec;joseph@josephpcohen.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ntrofimov2019towards,\ntitle={Towards the Latent Transcriptome},\nauthor={Assya Trofimov and Francis Dutil and Claude Perreault and Sebastien Lemieux and Yoshua Bengio and Joseph Paul Cohen},\nyear={2019},\nurl={https://openreview.net/forum?id=B1eKk2CcKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1eKk2CcKm", "pdf_size": 0, "rating": "2;4;5", "confidence": "5;4;4", "wc_review": "946;229;616", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "393;340;367", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 597.0, 293.0221834605701 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 366.6666666666667, 21.638443156156644 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9449111825230683, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9900273605198587869&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "B1eO9oA5Km", "title": "A Guider Network for Multi-Dual Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "A large amount of parallel data is needed to train a strong neural machine translation (NMT) system. This is a major challenge for low-resource languages. Building on recent work on unsupervised and semi-supervised methods, we propose a multi-dual learning framework to improve the performance of NMT by using an almost infinite amount of available monolingual data and some parallel data of other languages. Since our framework involves multiple languages and components, we further propose a timing optimization method that uses reinforcement learning (RL) to optimally schedule the different components in order to avoid imbalanced training. 
Experimental results demonstrate the validity of our model, and confirm its superiority to existing dual learning methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wenpeng Hu;Zhengwei Tao;Zhanxing Zhu;Bing Liu;Zhou Lin;Jinwen Ma;Dongyan Zhao;Rui Yan", "authorids": "wenpeng.hu@pku.edu.cn;tttzw@pku.edu.cn;zhanxing.zhu@pku.edu.cn;liub@uic.edu;jokerlin@pku.edu.cn;jwma@math.pku.edu.cn;zhaody@pku.edu.cn;ruiyan@pku.edu.cn", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nhu2019a,\ntitle={A Guider Network for Multi-Dual Learning},\nauthor={Wenpeng Hu and Zhengwei Tao and Zhanxing Zhu and Bing Liu and Zhou Lin and Jinwen Ma and Dongyan Zhao and Rui Yan},\nyear={2019},\nurl={https://openreview.net/forum?id=B1eO9oA5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eO9oA5Km", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;3;2", "wc_review": "630;245;304", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 393.0, 169.30642830875226 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vrloj4MIhBMJ:scholar.google.com/&scioq=A+Guider+Network+for+Multi-Dual+Learning&hl=en&as_sdt=0,14", "gs_version_total": 0 }, { "id": "B1ePui0ctQ", "title": "SnapQuant: A Probabilistic and Nested Parameterization for Binary Networks", "track": "main", "status": "Reject", "tldr": "We propose SnapQuant, a reinforcement learning method for training binary weight networks from scratch under the Bayesian deep learning perspective, which approximates the posterior distribution of binary weights instead of a single point estimation.", "abstract": "In this paper, we study the problem of training real binary weight networks (without layer-wise or filter-wise scaling factors) from scratch under the Bayesian deep learning perspective, meaning that the final objective is to approximate the posterior distribution of binary weights rather than reach a point estimation. The proposed method, named as SnapQuant, has two intriguing features: (1) The posterior distribution is parameterized as a policy network trained with a reinforcement learning scheme. During the training phase, we generate binary weights on-the-fly since what we actually maintain is the policy network, and all the binary weights are used in a burn-after-reading style. At the testing phase, we can sample binary weight instances for a given recognition architecture from the learnt policy network. (2) The policy network, which has a nested parameter structure consisting of layer-wise, filter-wise and kernel-wise parameter sharing designs, is applicable to any neural network architecture. Such a nested parameterization explicitly and hierarchically models the joint posterior distribution of binary weights. 
The performance of SnapQuant is evaluated with several visual recognition tasks including ImageNet. The code will be made publicly available.", "keywords": "Binary weight networks;neural network quantization;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Kuan Wang;Hao Zhao;Anbang Yao;Aojun Zhou;Dawei Sun;Yurong Chen", "authorids": "wangkuan15@mails.tsinghua.edu.cn;hao.zhao@intel.com;anbang.yao@intel.com;aojun.zhou@intel.com;dawei.sun@intel.com;yurong.chen@intel.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nwang2019snapquant,\ntitle={SnapQuant: A Probabilistic and Nested Parameterization for Binary Networks},\nauthor={Kuan Wang and Hao Zhao and Anbang Yao and Aojun Zhou and Dawei Sun and Yurong Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=B1ePui0ctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1ePui0ctQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;3", "wc_review": "610;623;255", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "563;248;101", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 496.0, 170.49535673051824 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 304.0, 192.72259857110686 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:b1cddnzZ-uoJ:scholar.google.com/&scioq=SnapQuant:+A+Probabilistic+and+Nested+Parameterization+for+Binary+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1eSg3C9Ym", "title": "MEAN-FIELD ANALYSIS OF BATCH NORMALIZATION", "track": "main", "status": "Reject", "tldr": "", "abstract": "Batch Normalization (BatchNorm) is an extremely useful component of modern neural network architectures, enabling optimization using higher learning rates and achieving faster convergence. In this paper, we use mean-field theory to analytically quantify the impact of BatchNorm on the geometry of the loss landscape for multi-layer networks consisting of fully-connected and convolutional layers. We show that it has a flattening effect on the loss landscape, as quantified by the maximum eigenvalue of the Fisher Information Matrix. These findings are then used to justify the use of larger learning rates for networks that use BatchNorm, and we provide quantitative characterization of the maximal allowable learning rate to ensure convergence. 
Experiments support our theoretically predicted maximum learning rate, and furthermore suggest that networks with smaller values of the BatchNorm parameter achieve lower loss after the same number of epochs of training.", "keywords": "neural networks;optimization;batch normalization;mean field theory;Fisher information", "primary_area": "", "supplementary_material": "", "author": "Mingwei Wei;James Stokes;David J Schwab", "authorids": "m.wei@u.northwestern.edu;james@tunnel.tech;dschwab@gc.cuny.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwei2019meanfield,\ntitle={{MEAN}-{FIELD} {ANALYSIS} {OF} {BATCH} {NORMALIZATION}},\nauthor={Mingwei Wei and James Stokes and David J Schwab},\nyear={2019},\nurl={https://openreview.net/forum?id=B1eSg3C9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1eSg3C9Ym", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;3;3", "wc_review": "209;360;164", "wc_reply_reviewers": "0;122;0", "wc_reply_authors": "768;1122;348", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 244.33333333333334, 83.82654048026131 ], "wc_reply_reviewers_avg": [ 40.666666666666664, 57.51135153650587 ], "wc_reply_authors_avg": [ 746.0, 316.36687563649895 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5123572354936580088&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "B1eXbn05t7", "title": "Open-Ended Content-Style Recombination Via Leakage Filtering", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider visual domains in which a class label specifies the content of an image, and class-irrelevant properties that differentiate instances constitute the style. We present a domain-independent method that permits the open-ended recombination of style of one image with the content of another. Open ended simply means that the method generalizes to style and content not present in the training data. The method starts by constructing a content embedding using an existing deep metric-learning technique. This trained content encoder is incorporated into a variational autoencoder (VAE), paired with a to-be-trained style encoder. The VAE reconstruction loss alone is inadequate to ensure a decomposition of the latent representation into style and content. Our method thus includes an auxiliary loss, leakage filtering, which ensures that no style information remaining in the content representation is used for reconstruction and vice versa. We synthesize novel images by decoding the style representation obtained from one image with the content representation from another. Using this method for data-set augmentation, we obtain state-of-the-art performance on few-shot learning tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Karl Ridgeway;Michael C. 
Mozer", "authorids": "karl.ridgeway@colorado.edu;mozer@colorado.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nridgeway2019openended,\ntitle={Open-Ended Content-Style Recombination Via Leakage Filtering},\nauthor={Karl Ridgeway and Michael C. Mozer},\nyear={2019},\nurl={https://openreview.net/forum?id=B1eXbn05t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1eXbn05t7", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;4", "wc_review": "440;630;193", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "313;311;112", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 421.0, 178.9096606297901 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 245.33333333333334, 94.2844396258235 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1499918327860699029&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "B1eZCjA9KX", "title": "IncSQL: Training Incremental Text-to-SQL Parsers with Non-Deterministic Oracles", "track": "main", "status": "Withdraw", "tldr": "We design incremental sequence-to-action parsers for text-to-SQL task and achieve SOTA results. We further improve by using non-deterministic oracles to allow multiple correct action sequences. ", "abstract": "We present a sequence-to-action parsing approach for the natural language to SQL task that incrementally fills the slots of a SQL query with feasible actions from a pre-defined inventory. To account for the fact that typically there are multiple correct SQL queries with the same or very similar semantics, we draw inspiration from syntactic parsing techniques and propose to train our sequence-to-action models with non-deterministic oracles. We evaluate our models on the WikiSQL dataset and achieve an execution accuracy of 83.7% on the test set, a 2.1% absolute improvement over the models trained with traditional static oracles assuming a single correct target SQL query. 
When further combined with the execution-guided decoding strategy, our model sets a new state-of-the-art performance at an execution accuracy of 87.1%.", "keywords": "semantic parsing;non-deterministic oracles;natural language to SQL;incremental parsing;sequence prediction", "primary_area": "", "supplementary_material": "", "author": "Tianze Shi;Kedar Tatwawadi;Kaushik Chakrabarti;Yi Mao;Oleksandr Polozov;Weizhu Chen", "authorids": "tianze@cs.cornell.edu;kedart@stanford.edu;kaushik@microsoft.com;maoyi@microsoft.com;polozov@microsoft.com;wzchen@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1eZCjA9KX", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;4;3", "wc_review": "238;262;380", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 293.3333333333333, 62.06090198793082 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9196661678604702076&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1eZRiC9YX", "title": "Sufficient Conditions for Robustness to Adversarial Examples: a Theoretical and Empirical Study with Bayesian Neural Networks", "track": "main", "status": "Reject", "tldr": "We prove that idealised Bayesian neural networks can have no adversarial examples, and give empirical evidence with real-world BNNs.", "abstract": "We prove, under two sufficient conditions, that idealised models can have no adversarial examples. We discuss which idealised models satisfy our conditions, and show that idealised Bayesian neural networks (BNNs) satisfy these. We continue by studying near-idealised BNNs using HMC inference, demonstrating the theoretical ideas in practice. We experiment with HMC on synthetic data derived from MNIST for which we know the ground-truth image density, showing that near-perfect epistemic uncertainty correlates to density under image manifold, and that adversarial images lie off the manifold in our setting. This suggests why MC dropout, which can be seen as performing approximate inference, has been observed to be an effective defence against adversarial examples in practice; We highlight failure-cases of non-idealised BNNs relying on dropout, suggesting a new attack for dropout models and a new defence as well. 
Lastly, we demonstrate the defence on a cats-vs-dogs image classification task with a VGG13 variant.", "keywords": "Bayesian deep learning;Bayesian neural networks;adversarial examples", "primary_area": "", "supplementary_material": "", "author": "Yarin Gal;Lewis Smith", "authorids": "yarin@cs.ox.ac.uk;lsgs@robots.ox.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngal2019sufficient,\ntitle={Sufficient Conditions for Robustness to Adversarial Examples: a Theoretical and Empirical Study with Bayesian Neural Networks},\nauthor={Yarin Gal and Lewis Smith},\nyear={2019},\nurl={https://openreview.net/forum?id=B1eZRiC9YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1eZRiC9YX", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "wc_review": "400;815;823", "wc_reply_reviewers": "1154;315;599", "wc_reply_authors": "1924;1055;2031", "reply_reviewers": "3;1;1", "reply_authors": "5;3;3", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 679.3333333333334, 197.54549405699495 ], "wc_reply_reviewers_avg": [ 689.3333333333334, 348.42534669886203 ], "wc_reply_authors_avg": [ 1670.0, 437.05911118139005 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 3.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17173125324302338765&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1edvs05Y7", "title": "Sparse Binary Compression: Towards Distributed Deep Learning with minimal Communication", "track": "main", "status": "Reject", "tldr": "", "abstract": "Currently, progressively larger deep neural networks are trained on ever growing data corpora. In result, distributed training schemes are becoming increasingly relevant. A major issue in distributed training is the limited communication bandwidth between contributing nodes or prohibitive communication cost in general. \n%These challenges become even more pressing, as the number of computation nodes increases. \nTo mitigate this problem we propose Sparse Binary Compression (SBC), a compression framework that allows for a drastic reduction of communication cost for distributed training. SBC combines existing techniques of communication delay and gradient sparsification with a novel binarization method and optimal weight update encoding to push compression gains to new limits. By doing so, our method also allows us to smoothly trade-off gradient sparsity and temporal sparsity to adapt to the requirements of the learning task. \n%We use tools from information theory to reason why SBC can achieve the striking compression rates observed in the experiments.\nOur experiments show, that SBC can reduce the upstream communication on a variety of convolutional and recurrent neural network architectures by more than four orders of magnitude without significantly harming the convergence speed in terms of forward-backward passes. For instance, we can train ResNet50 on ImageNet in the same number of iterations to the baseline accuracy, using $\\times 3531$ less bits or train it to a $1\\%$ lower accuracy using $\\times 37208$ less bits. 
In the latter case, the total upstream communication required is cut from 125 terabytes to 3.35 gigabytes for every participating client. Our method also achieves state-of-the-art compression rates in a Federated Learning setting with 400 clients.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Felix Sattler;Simon Wiedemann;Klaus-Robert M\u00fcller;Wojciech Samek", "authorids": "felix.sattler@hhi.fraunhofer.de;simon.wiedemann@hhi.fraunhofer.de;klaus-robert.mueller@tu-berlin.de;wojciech.samek@hhi.fraunhofer.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsattler2019sparse,\ntitle={Sparse Binary Compression: Towards Distributed Deep Learning with minimal Communication},\nauthor={Felix Sattler and Simon Wiedemann and Klaus-Robert M\u00fcller and Wojciech Samek},\nyear={2019},\nurl={https://openreview.net/forum?id=B1edvs05Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1edvs05Y7", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "wc_review": "488;343;334", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 388.3333333333333, 70.57068954050416 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 262, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15863896832467551927&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "B1enCo0cK7", "title": "One Bit Matters: Understanding Adversarial Examples as the Abuse of Redundancy", "track": "main", "status": "Withdraw", "tldr": "A new theoretical explanation for the existence of adversarial examples", "abstract": "Adversarial examples have somewhat disrupted the enormous success of machine learning (ML) and are causing concern with regards to its trustworthiness: A small perturbation of an input results in an arbitrary failure of an otherwise seemingly well-trained ML system. While studies are being conducted to discover the intrinsic properties of adversarial examples, such as their transferability and universality, there is insufficient theoretic analysis to help understand the phenomenon in a way that can influence the design process of ML experiments. In this paper, we deduce an information-theoretic model which explains adversarial attacks universally as the abuse of feature redundancies in ML algorithms. We prove that feature redundancy is a necessary condition for the existence of adversarial examples. Our model helps to explain the major questions raised in many anecdotal studies on adversarial examples. Our theory is backed up by empirical measurements of the information content of benign and adversarial examples on both image and text datasets. Our measurements show that typical adversarial examples introduce just enough redundancy to overflow the decision making of a machine learner trained on corresponding benign examples. 
We conclude with actionable recommendations to improve the robustness of machine learners against adversarial examples.", "keywords": "adversarial examples;information theory;robust neural networks", "primary_area": "", "supplementary_material": "", "author": "Jingkang Wang;Ruoxi Jia;Gerald Friedland;Bo Li;Costas Spanos", "authorids": "wangjksjtu_01@sjtu.edu.cn;ruoxijia@berkeley.edu;fractor@eecs.berkeley.edu;lxbosky@gmail.com;spanos@berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1enCo0cK7", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "wc_review": "191;1344;311", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 615.3333333333334, 517.5688896712741 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18371063052828696035&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1epooR5FX", "title": "Predicted Variables in Programming", "track": "main", "status": "Reject", "tldr": "We present Predicted Variables, an approach to making machine learning a first class citizen in programming languages.", "abstract": "We present Predicted Variables, an approach to making machine learning (ML) a first class citizen in programming languages.\nThere is a growing divide in approaches to building systems: using human experts (e.g. programming) on the one hand, and using behavior learned from data (e.g. ML) on the other hand. PVars aim to make using ML in programming easier by hybridizing the two. We leverage the existing concept of variables and create a new type, a predicted variable. PVars are akin to native variables with one important distinction: PVars determine their value using ML when evaluated. We describe PVars and their interface, how they can be used in programming, and demonstrate the feasibility of our approach on three algorithmic problems: binary search, QuickSort, and caches.\nWe show experimentally that PVars are able to improve over the commonly used heuristics and lead to a better performance than the original algorithms.\nAs opposed to previous work applying ML to algorithmic problems, PVars have the advantage that they can be used within the existing frameworks and do not require the existing domain knowledge to be replaced. PVars allow for a seamless integration of ML into existing systems and algorithms.\nOur PVars implementation currently relies on standard Reinforcement Learning (RL) methods. To learn faster, PVars use the heuristic function, which they are replacing, as an initial function. 
We show that PVars quickly pick up the behavior of the initial function and then improve performance beyond that without ever performing substantially worse -- allowing for a safe deployment in critical applications.", "keywords": "predicted variables;machine learning;programming;computing systems;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Victor Carbune;Thierry Coppey;Alexander Daryin;Thomas Deselaers;Nikhil Sarda;Jay Yagnik", "authorids": "victor.carbune@gmail.com;thierryc@google.com;shurick@google.com;deselaers@google.com;nikhilsarda@google.com;jyagnik@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ncarbune2019predicted,\ntitle={Predicted Variables in Programming},\nauthor={Victor Carbune and Thierry Coppey and Alexander Daryin and Thomas Deselaers and Nikhil Sarda and Jay Yagnik},\nyear={2019},\nurl={https://openreview.net/forum?id=B1epooR5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1epooR5FX", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;3;3", "wc_review": "245;499;189", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "332;528;263", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 311.0, 134.8876075355578 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 374.3333333333333, 112.25071145530535 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5251307769537158544&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1ethsR9Ym", "title": "Look Ma, No GANs! Image Transformation with ModifAE", "track": "main", "status": "Reject", "tldr": "ModifAE is a standalone neural network, trained exclusively on an autoencoding task, that implicitly learns to make image modifications (without GANs).", "abstract": "Existing methods of image to image translation require multiple steps in the training or modification process, and suffer from either an inability to generalize, or long training times. These methods also focus on binary trait modification, ignoring continuous traits. To address these problems, we propose ModifAE: a novel standalone neural network, trained exclusively on an autoencoding task, that implicitly learns to make continuous trait image modifications. As a standalone image modification network, ModifAE requires fewer parameters and less time to train than existing models. We empirically show that ModifAE produces significantly more convincing and more consistent continuous face trait modifications than the previous state-of-the-art model.", "keywords": "Computer Vision;Deep Learning;Autoencoder;GAN;Image Modification;Social Traits;Social Psychology", "primary_area": "", "supplementary_material": "", "author": "Chad Atalla;Bartholomew Tam;Amanda Song;Gary Cottrell", "authorids": ";;;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\natalla2019look,\ntitle={Look Ma, No {GAN}s! 
Image Transformation with Modif{AE}},\nauthor={Chad Atalla and Bartholomew Tam and Amanda Song and Gary Cottrell},\nyear={2019},\nurl={https://openreview.net/forum?id=B1ethsR9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1ethsR9Ym", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;3", "wc_review": "309;195;310", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 271.3333333333333, 53.977361509762176 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4614642986431432537&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1euhoAcKX", "title": "DppNet: Approximating Determinantal Point Processes with Deep Networks", "track": "main", "status": "Reject", "tldr": "We approximate Determinantal Point Processes with neural nets; we justify our model theoretically and empirically.", "abstract": "Determinantal Point Processes (DPPs) provide an elegant and versatile way to sample sets of items that balance the point-wise quality with the set-wise diversity of selected items. For this reason, they have gained prominence in many machine learning applications that rely on subset selection. However, sampling from a DPP over a ground set of size N is a costly operation, requiring in general an O(N^3) preprocessing cost and an O(Nk^3) sampling cost for subsets of size k. We approach this problem by introducing DppNets: generative deep models that produce DPP-like samples for arbitrary ground sets. We develop an inhibitive attention mechanism based on transformer networks that captures a notion of dissimilarity between feature vectors. We show theoretically that such an approximation is sensible as it maintains the guarantees of inhibition or dissimilarity that makes DPP so powerful and unique. 
Empirically, we demonstrate that samples from our model receive high likelihood under the more expensive DPP alternative.", "keywords": "dpp;submodularity;determinant", "primary_area": "", "supplementary_material": "", "author": "Zelda Mariet;Jasper Snoek;Yaniv Ovadia", "authorids": "zelda@csail.mit.edu;jsnoek@google.com;yovadia@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmariet2019dppnet,\ntitle={DppNet: Approximating Determinantal Point Processes with Deep Networks},\nauthor={Zelda Mariet and Jasper Snoek and Yaniv Ovadia},\nyear={2019},\nurl={https://openreview.net/forum?id=B1euhoAcKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1euhoAcKX", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;3;4", "wc_review": "132;285;393", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "170;83;324", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 270.0, 107.07940978544848 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 192.33333333333334, 99.64715528079843 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16138195067742792795&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "B1excoAqKQ", "title": "What Would pi* Do?: Imitation Learning via Off-Policy Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We propose a simple and effective imitation learning algorithm based on off-policy RL, which works well on image-based tasks and implicitly performs approximate inference of the expert policy.", "abstract": "Learning to imitate expert actions given demonstrations containing image observations is a difficult problem in robotic control. The key challenge is generalizing behavior to out-of-distribution states that differ from those in the demonstrations. State-of-the-art imitation learning algorithms perform well in environments with low-dimensional observations, but typically involve adversarial optimization procedures, which can be difficult to use with high-dimensional image observations. We propose a remarkably simple alternative based on off-policy soft Q-learning, which we call soft Q imitation learning (SQIL, pronounced \"skill\"), that rewards the agent for matching demonstrated actions in demonstrated states. The key idea is initially filling the agent's experience replay buffer with demonstrations, where rewards are set to a positive constant, and setting rewards to zero in all additional experiences. We derive SQIL from first principles as a method for performing approximate inference under the MaxCausalEnt model of expert behavior. The approximate inference objective trades off between a pure behavioral cloning loss and a regularization term that incorporates information about state transitions via the soft Bellman error. 
Our experiments show that SQIL matches the state of the art in low-dimensional environments, and significantly outperforms prior work in playing video games from high-dimensional images.", "keywords": "imitation learning;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Siddharth Reddy;Anca D. Dragan;Sergey Levine", "authorids": "sgr@berkeley.edu;anca@berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nreddy2019what,\ntitle={What Would pi* Do?: Imitation Learning via Off-Policy Reinforcement Learning},\nauthor={Siddharth Reddy and Anca D. Dragan and Sergey Levine},\nyear={2019},\nurl={https://openreview.net/forum?id=B1excoAqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1excoAqKQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;4", "wc_review": "476;322;772", "wc_reply_reviewers": "143;0;37", "wc_reply_authors": "1136;617;820", "reply_reviewers": "1;0;1", "reply_authors": "3;2;3", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 523.3333333333334, 186.73570152014912 ], "wc_reply_reviewers_avg": [ 60.0, 60.602530200204235 ], "wc_reply_authors_avg": [ 857.6666666666666, 213.54832916435774 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4316718804482300585&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Disjoint Mapping Network for Cross-modal Matching of Voices and Faces", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/810", "id": "B1exrnCcF7", "author_site": "Yandong Wen, Mahmoud Al Ismail, Weiyang Liu, Bhiksha Raj, Rita Singh", "tldr": "", "abstract": "We propose a novel framework, called Disjoint Mapping Network (DIMNet), for cross-modal biometric matching, in particular of voices and faces. Different from the existing methods, DIMNet does not explicitly learn the joint relationship between the modalities. Instead, DIMNet learns a shared representation for different modalities by mapping them individually to their common covariates. These shared representations can then be used to find the correspondences between the modalities. 
We show empirically that DIMNet is able to achieve better performance than the current state-of-the-art methods, with the additional benefits of being conceptually simpler and less data-intensive.", "keywords": "cross-modal matching;voices;faces", "primary_area": "", "supplementary_material": "", "author": "Yandong Wen;Mahmoud Al Ismail;Weiyang Liu;Bhiksha Raj;Rita Singh", "authorids": "yandongw@andrew.cmu.edu;mahmoudi@andrew.cmu.edu;wyliu@gatech.edu;bhiksha@cs.cmu.edu;rsingh@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nwen2018disjoint,\ntitle={Disjoint Mapping Network for Cross-modal Matching of Voices and Faces},\nauthor={Yandong Wen and Mahmoud Al Ismail and Weiyang Liu and Bhiksha Raj and Rita Singh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1exrnCcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "wc_review": "745;473;158", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "787;286;632", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 458.6666666666667, 239.85597530369947 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 568.3333333333334, 209.42832239747858 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5133289555977246135&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=B1exrnCcF7", "pdf": "https://openreview.net/pdf?id=B1exrnCcF7", "email": ";;;;", "author_num": 5 }, { "id": "B1fA3oActQ", "title": "GraphSeq2Seq: Graph-Sequence-to-Sequence for Neural Machine Translation", "track": "main", "status": "Reject", "tldr": "Graph-Sequence-to-Sequence for Neural Machine Translation", "abstract": "Sequence-to-Sequence (Seq2Seq) neural models have become popular for text generation problems, e.g. neural machine translation (NMT) (Bahdanau et al.,2014; Britz et al., 2017), text summarization (Nallapati et al., 2017; Wang &Ling, 2016), and image captioning (Venugopalan et al., 2015; Liu et al., 2017). Though sequential modeling has been shown to be effective, the dependency graph among words contains additional semantic information and thus can be utilized for sentence modeling. In this paper, we propose a Graph-Sequence-to-Sequence(GraphSeq2Seq) model to fuse the dependency graph among words into the traditional Seq2Seq framework. For each sample, the sub-graph of each word is encoded to a graph representation, which is then utilized to sequential encoding. At last, a sequence decoder is leveraged for output generation. Since above model fuses different features by contacting them together to encode, we also propose a variant of our model that regards the graph representations as additional annotations in attention mechanism (Bahdanau et al., 2014) by separately encoding different features. 
Experiments on several translation benchmarks show that our models can outperform existing state-of-the-art methods, demonstrating the effectiveness of the combination of Graph2Seq and Seq2Seq.", "keywords": "Neural Machine Translation;Natural Language Generation;Graph Embedding;LSTM", "primary_area": "", "supplementary_material": "", "author": "Guoshuai Zhao;Jun Li;Lu Wang;Xueming Qian;Yun Fu", "authorids": "zgs2012@stu.xjtu.edu.cn;junl.mldl@gmail.com;luwang@ccs.neu.edu;qianxm@mail.xjtu.edu.cn;yunfu@ece.neu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhao2019graphseqseq,\ntitle={GraphSeq2Seq: Graph-Sequence-to-Sequence for Neural Machine Translation},\nauthor={Guoshuai Zhao and Jun Li and Lu Wang and Xueming Qian and Yun Fu},\nyear={2019},\nurl={https://openreview.net/forum?id=B1fA3oActQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1fA3oActQ", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;5", "wc_review": "156;243;234", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "89;515;326", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 211.0, 39.06404996924922 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 310.0, 174.28138167916848 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16984536813681249202&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1fPYj0qt7", "title": "Riemannian Stochastic Gradient Descent for Tensor-Train Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "Applying the Riemannian SGD (RSGD) algorithm for training Tensor-Train RNNs to further reduce model parameters.", "abstract": "The Tensor-Train factorization (TTF) is an efficient way to compress large weight matrices of fully-connected layers and recurrent layers in recurrent neural networks (RNNs). However, high Tensor-Train ranks for all the core tensors of parameters need to be element-wise fixed, which results in an unnecessary redundancy of model parameters. This work applies Riemannian stochastic gradient descent (RSGD) to train core tensors of parameters in the Riemannian Manifold before finding vectors of lower Tensor-Train ranks for parameters. The paper first presents the RSGD algorithm with a convergence analysis and then tests it on more advanced Tensor-Train RNNs such as bi-directional GRU/LSTM and Encoder-Decoder RNNs with a Tensor-Train attention model. The experiments on digit recognition and machine translation tasks suggest the effectiveness of the RSGD algorithm for Tensor-Train RNNs. 
", "keywords": "Riemannian Stochastic Gradient Descent;Tensor-Train;Recurrent Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Jun Qi;Chin-Hui Lee;Javier Tejedor", "authorids": "jqi41@gatech.edu;qij13@uw.edu;javiertejedornoguerales@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nqi2019riemannian,\ntitle={Riemannian Stochastic Gradient Descent for Tensor-Train Recurrent Neural Networks},\nauthor={Jun Qi and Chin-Hui Lee and Javier Tejedor},\nyear={2019},\nurl={https://openreview.net/forum?id=B1fPYj0qt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1fPYj0qt7", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;4;4", "wc_review": "1103;334;177", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 538.0, 404.62410539495374 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:B_VndPWGE8wJ:scholar.google.com/&scioq=Riemannian+Stochastic+Gradient+Descent+for+Tensor-Train+Recurrent+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1fbosCcYm", "title": "A Biologically Inspired Visual Working Memory for Deep Networks", "track": "main", "status": "Reject", "tldr": "A biologically inspired working memory that can be integrated in recurrent visual attention models for state of the art performance", "abstract": "The ability to look multiple times through a series of pose-adjusted glimpses is fundamental to human vision. This critical faculty allows us to understand highly complex visual scenes. Short term memory plays an integral role in aggregating the information obtained from these glimpses and informing our interpretation of the scene. Computational models have attempted to address glimpsing and visual attention but have failed to incorporate the notion of memory. We introduce a novel, biologically inspired visual working memory architecture that we term the Hebb-Rosenblatt memory. We subsequently introduce a fully differentiable Short Term Attentive Working Memory model (STAWM) which uses transformational attention to learn a memory over each image it sees. The state of our Hebb-Rosenblatt memory is embedded in STAWM as the weights space of a layer. By projecting different queries through this layer we can obtain goal-oriented latent representations for tasks including classification and visual reconstruction. Our model obtains highly competitive classification performance on MNIST and CIFAR-10. As demonstrated through the CelebA dataset, to perform reconstruction the model learns to make a sequence of updates to a canvas which constitute a parts-based representation. Classification with the self supervised representation obtained from MNIST is shown to be in line with the state of the art models (none of which use a visual attention mechanism). 
Finally, we show that STAWM can be trained under the dual constraints of classification and reconstruction to provide an interpretable visual sketchpad which helps open the `black-box' of deep learning.", "keywords": "memory;visual attention;image classification;image reconstruction;latent representations", "primary_area": "", "supplementary_material": "", "author": "Ethan Harris;Mahesan Niranjan;Jonathon Hare", "authorids": "ewah1g13@ecs.soton.ac.uk;mn@ecs.soton.ac.uk;jsh2@ecs.soton.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nharris2019a,\ntitle={A Biologically Inspired Visual Working Memory for Deep Networks},\nauthor={Ethan Harris and Mahesan Niranjan and Jonathon Hare},\nyear={2019},\nurl={https://openreview.net/forum?id=B1fbosCcYm},\n}", "github": "[![github](/images/github_icon.svg) ethanwharris/STAWM](https://github.com/ethanwharris/STAWM) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1fbosCcYm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1fbosCcYm", "pdf_size": 0, "rating": "4;5;9", "confidence": "4;4;5", "wc_review": "518;673;673", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "807;1161;508", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 6.0, 2.160246899469287 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 621.3333333333334, 73.06770072260991 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 825.3333333333334, 266.90114691065344 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9819805060619656, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8035997303264808112&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Automatically Composing Representation Transformations as a Means for Generalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/864", "id": "B1ffQnRcKX", "author_site": "Michael Chang, Abhishek Gupta, Sergey Levine, Thomas L Griffiths", "tldr": "We explore the problem of compositional generalization and propose a means for endowing neural network architectures with the ability to compose themselves to solve these problems.", "abstract": "A generally intelligent learner should generalize to more complex tasks than it has previously encountered, but the two common paradigms in machine learning -- either training a separate learner per task or training a single learner for all tasks -- both have difficulty with such generalization because they do not leverage the compositional structure of the task distribution. This paper introduces the compositional problem graph as a broadly applicable formalism to relate tasks of different complexity in terms of problems with shared subproblems. We propose the compositional generalization problem for measuring how readily old knowledge can be reused and hence built upon. 
As a first step for tackling compositional generalization, we introduce the compositional recursive learner, a domain-general framework for learning algorithmic procedures for composing representation transformations, producing a learner that reasons about what computation to execute by making analogies to previously seen problems. We show on a symbolic and a high-dimensional domain that our compositional approach can generalize to more complex problems than the learner has previously encountered, whereas baselines that are not explicitly compositional do not.", "keywords": "compositionality;deep learning;metareasoning", "primary_area": "", "supplementary_material": "", "author": "Michael Chang;Abhishek Gupta;Sergey Levine;Thomas L. Griffiths", "authorids": "mbchang@berkeley.edu;abhigupta@berkeley.edu;svlevine@eecs.berkeley.edu;tom_griffiths@berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchang2018automatically,\ntitle={Automatically Composing Representation Transformations as a Means for Generalization},\nauthor={Michael Chang and Abhishek Gupta and Sergey Levine and Thomas L. Griffiths},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1ffQnRcKX},\n}", "github": "[![github](/images/github_icon.svg) mbchang/crl](https://github.com/mbchang/crl)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;9", "confidence": "3;2;4", "wc_review": "969;463;138", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "235;733;0", "reply_reviewers": "0;0;0", "reply_authors": "1;2;0", "rating_avg": [ 7.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 523.3333333333334, 341.92624285882994 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 322.6666666666667, 305.5992292020529 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2301953604663446405&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=B1ffQnRcKX", "pdf": "https://openreview.net/pdf?id=B1ffQnRcKX", "email": ";;;", "author_num": 4 }, { "title": "Visual Reasoning by Progressive Module Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/647", "id": "B1fpDsAqt7", "author_site": "Seung Wook Kim, Makarand Tapaswi, Sanja Fidler", "tldr": "", "abstract": "Humans learn to solve tasks of increasing complexity by building on top of previously acquired knowledge. Typically, there exists a natural progression in the tasks that we learn \u2013 most do not require completely independent solutions, but can be broken down into simpler subtasks. We propose to represent a solver for each task as a neural module that calls existing modules (solvers for simpler tasks) in a functional program-like manner. Lower modules are a black box to the calling module, and communicate only via a query and an output. Thus, a module for a new task learns to query existing modules and composes their outputs in order to produce its own output. 
Our model effectively combines previous skill-sets, does not suffer from forgetting, and is fully differentiable. We test our model in learning a set of visual reasoning tasks, and demonstrate improved performances in all tasks by learning progressively. By evaluating the reasoning process using human judges, we show that our model is more interpretable than an attention-based baseline.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Seung Wook Kim;Makarand Tapaswi;Sanja Fidler", "authorids": "seung@cs.toronto.edu;makarand@cs.toronto.edu;fidler@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkim2018visual,\ntitle={Visual Reasoning by Progressive Module Networks},\nauthor={Seung Wook Kim and Makarand Tapaswi and Sanja Fidler},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1fpDsAqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;5", "wc_review": "410;495;279", "wc_reply_reviewers": "455;0;40", "wc_reply_authors": "2034;487;330", "reply_reviewers": "3;0;1", "reply_authors": "4;1;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 394.6666666666667, 88.8456839444413 ], "wc_reply_reviewers_avg": [ 165.0, 205.71015207487127 ], "wc_reply_authors_avg": [ 950.3333333333334, 768.9440089427114 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12652585625953214305&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B1fpDsAqt7", "pdf": "https://openreview.net/pdf?id=B1fpDsAqt7", "email": ";;", "author_num": 3 }, { "id": "B1fysiAqK7", "title": "Probabilistic Binary Neural Networks", "track": "main", "status": "Reject", "tldr": "We introduce a stochastic training method for training Binary Neural Network with both binary weights and activations.", "abstract": "Low bit-width weights and activations are an effective way of combating the increasing need for both memory and compute power of Deep Neural Networks. In this work, we present a probabilistic training method for Neural Network with both binary weights and activations, called PBNet. By embracing stochasticity during training, we circumvent the need to approximate the gradient of functions for which the derivative is zero almost always, such as $\\textrm{sign}(\\cdot)$, while still obtaining a fully Binary Neural Network at test time. Moreover, it allows for anytime ensemble predictions for improved performance and uncertainty estimates by sampling from the weight distribution. Since all operations in a layer of the PBNet operate on random variables, we introduce stochastic versions of Batch Normalization and max pooling, which transfer well to a deterministic network at test time. We evaluate two related training methods for the PBNet: one in which activation distributions are propagated throughout the network, and one in which binary activations are sampled in each layer. 
Our experiments indicate that sampling the binary activations is an important element for stochastic training of binary Neural Networks.\n", "keywords": "binary neural Network;efficient deep learning;stochastic training;discrete neural network;efficient inference", "primary_area": "", "supplementary_material": "", "author": "Jorn W.T. Peters;Tim Genewein;Max Welling", "authorids": "jornpeters@gmail.com;tim.genewein@gmail.com;welling.max@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npeters2019probabilistic,\ntitle={Probabilistic Binary Neural Networks},\nauthor={Jorn W.T. Peters and Tim Genewein and Max Welling},\nyear={2019},\nurl={https://openreview.net/forum?id=B1fysiAqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1fysiAqK7", "pdf_size": 0, "rating": "3;5;6", "confidence": "3;2;4", "wc_review": "315;191;94", "wc_reply_reviewers": "63;0;0", "wc_reply_authors": "1744;561;256", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 200.0, 90.44703790985456 ], "wc_reply_reviewers_avg": [ 21.0, 29.698484809834994 ], "wc_reply_authors_avg": [ 853.6666666666666, 641.7560976639714 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.3273268353539886, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15141356549168935279&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1g-X3RqKm", "title": "A Proposed Hierarchy of Deep Learning Tasks", "track": "main", "status": "Reject", "tldr": "We use 50 GPU years of compute time to study how deep learning scales with more data, and propose a new way to organize the space of problems by difficulty.", "abstract": "As the pace of deep learning innovation accelerates, it becomes increasingly important to organize the space of problems by relative difficultly. Looking to other fields for inspiration, we see analogies to the Chomsky Hierarchy in computational linguistics and time and space complexity in theoretical computer science.\n\nAs a complement to prior theoretical work on the data and computational requirements of learning, this paper presents an empirical approach. We introduce a methodology for measuring validation error scaling with data and model size and test tasks in natural language, vision, and speech domains. We find that power-law validation error scaling exists across a breadth of factors and that model size scales sublinearly with data size, suggesting that simple learning theoretic models offer insights into the scaling behavior of realistic deep learning settings, and providing a new perspective on how to organize the space of problems. \n\nWe measure the power-law exponent---the \"steepness\" of the learning curve---and propose using this metric to sort problems by degree of difficulty. There is no data like more data, but some tasks are more effective at taking advantage of more data. Those that are more effective are easier on the proposed scale. 
\n\nUsing this approach, we can observe that studied tasks in speech and vision domains scale faster than those in the natural language domain, offering insight into the observation that progress in these areas has proceeded more rapidly than in natural language.", "keywords": "Deep learning;scaling with data;computational complexity;learning curves;speech recognition;image recognition;machine translation;language modeling", "primary_area": "", "supplementary_material": "", "author": "Joel Hestness;Sharan Narang;Newsha Ardalani;Heewoo Jun;Hassan Kianinejad;Md. Mostofa Ali Patwary;Yang Yang;Yanqi Zhou;Gregory Diamos;Kenneth Church", "authorids": "joel@baidu.com;sharan@baidu.com;ardalaninewsha@baidu.com;junheewoo@baidu.com;hassankianinejad@baidu.com;patwarymostofa@baidu.com;yangyang62@baidu.com;zhouyanqi@baidu.com;gregdiamos@baidu.com;kennethchurch@baidu.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@misc{\nhestness2019a,\ntitle={A Proposed Hierarchy of Deep Learning Tasks},\nauthor={Joel Hestness and Sharan Narang and Newsha Ardalani and Heewoo Jun and Hassan Kianinejad and Md. Mostofa Ali Patwary and Yang Yang and Yanqi Zhou and Gregory Diamos and Kenneth Church},\nyear={2019},\nurl={https://openreview.net/forum?id=B1g-X3RqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1g-X3RqKm", "pdf_size": 0, "rating": "4;4;6", "confidence": "2;5;3", "wc_review": "361;570;576", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 502.3333333333333, 99.96777258474631 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 10, 0 ], "corr_rating_confidence": -0.1889822365046136, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "B1g29oAqtm", "title": "Understanding the Asymptotic Performance of Model-Based RL Methods", "track": "main", "status": "Reject", "tldr": "Long-term prediction accuracy limits the performance of model-based RL, and can be improved with a simple change to the form of the model.", "abstract": "In complex simulated environments, model-based reinforcement learning methods typically lag the asymptotic performance of model-free approaches. This paper uses two MuJoCo environments to understand this gap through a series of ablation experiments designed to separate the contributions of the dynamics model and planner. These reveal the importance of long planning horizons, beyond those typically used. A dynamics model that directly predicts distant states, based on current state and a long sequence of actions, is introduced. This avoids the need for many recursions during long-range planning, and thus is able to yield more accurate state estimates. 
These accurate predictions allow us to uncover the relationship between model accuracy and performance, and translate to higher task reward that matches or exceeds current state-of-the-art model-free approaches.", "keywords": "model-based reinforcement learning;mbrl;reinforcement learning;predictive models;predictive learning;forward models;deep learning", "primary_area": "", "supplementary_material": "", "author": "William Whitney;Rob Fergus", "authorids": "wfwhitney@gmail.com;fergus@cs.nyu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwhitney2019understanding,\ntitle={Understanding the Asymptotic Performance of Model-Based {RL} Methods},\nauthor={William Whitney and Rob Fergus},\nyear={2019},\nurl={https://openreview.net/forum?id=B1g29oAqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=B1g29oAqtm", "pdf_size": 0, "rating": "2;4;5;6", "confidence": "4;3;3;4", "wc_review": "64;207;544;389", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "6;140;462;260", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "rating_avg": [ 4.25, 1.479019945774904 ], "confidence_avg": [ 3.5, 0.5 ], "wc_review_avg": [ 301.0, 181.5199713530167 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 217.0, 167.57386431063765 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.16903085094570333, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16545849137156463579&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Bayesian Deep Convolutional Networks with Many Channels are Gaussian Processes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/703", "id": "B1g30j0qF7", "author_site": "Roman Novak, Lechao Xiao, Yasaman Bahri, Jaehoon Lee, Greg Yang, Jiri Hron, Daniel Abolafia, Jeffrey Pennington, Jascha Sohl-Dickstein", "tldr": "Finite-width SGD trained CNNs vs. infinitely wide fully Bayesian CNNs. Who wins?", "abstract": "There is a previously identified equivalence between wide fully connected neural networks (FCNs) and Gaussian processes (GPs). This equivalence enables, for instance, test set predictions that would have resulted from a fully Bayesian, infinitely wide trained FCN to be computed without ever instantiating the FCN, but by instead evaluating the corresponding GP. In this work, we derive an analogous equivalence for multi-layer convolutional neural networks (CNNs) both with and without pooling layers, and achieve state of the art results on CIFAR10 for GPs without trainable kernels. We also introduce a Monte Carlo method to estimate the GP corresponding to a given neural network architecture, even in cases where the analytic form has too many terms to be computationally feasible. \n\nSurprisingly, in the absence of pooling layers, the GPs corresponding to CNNs with and without weight sharing are identical. As a consequence, translation equivariance, beneficial in finite channel CNNs trained with stochastic gradient descent (SGD), is guaranteed to play no role in the Bayesian treatment of the infinite channel limit - a qualitative difference between the two regimes that is not present in the FCN case. 
We confirm experimentally, that while in some scenarios the performance of SGD-trained finite CNNs approaches that of the corresponding GPs as the channel count increases, with careful tuning SGD-trained CNNs can significantly outperform their corresponding GPs, suggesting advantages from SGD training compared to fully Bayesian parameter estimation.", "keywords": "Deep Convolutional Neural Networks;Gaussian Processes;Bayesian", "primary_area": "", "supplementary_material": "", "author": "Roman Novak;Lechao Xiao;Yasaman Bahri;Jaehoon Lee;Greg Yang;Jiri Hron;Daniel A. Abolafia;Jeffrey Pennington;Jascha Sohl-dickstein", "authorids": "romann@google.com;xlc@google.com;yasamanb@google.com;jaehlee@google.com;gregyang@microsoft.com;jh2084@cam.ac.uk;danabo@google.com;jpennin@google.com;jaschasd@google.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nnovak2019bayesian,\ntitle={Bayesian Deep Convolutional Networks with Many Channels are Gaussian Processes},\nauthor={Roman Novak and Lechao Xiao and Yasaman Bahri and Jaehoon Lee and Greg Yang and Daniel A. Abolafia and Jeffrey Pennington and Jascha Sohl-dickstein},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1g30j0qF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "6;7;7;7", "confidence": "4;5;2;3", "wc_review": "1081;4540;632;810", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "2330;5515;260;1349", "reply_reviewers": "0;0;0;0", "reply_authors": "5;10;2;2", "rating_avg": [ 6.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "wc_review_avg": [ 1765.75, 1609.6733170118712 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 2363.5, 1961.3131442989923 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 4.75, 3.2691742076555053 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": -0.2581988897471611, "gs_citation": 401, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18154043069761963462&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B1g30j0qF7", "pdf": "https://openreview.net/pdf?id=B1g30j0qF7", "email": ";;;;;;;;", "author_num": 9 }, { "id": "B1g6XnCcKQ", "title": "Object-Contrastive Networks: Unsupervised Object Representations", "track": "main", "status": "Withdraw", "tldr": "An unsupervised approach for learning disentangled representations of objects entirely from unlabeled monocular videos.", "abstract": "Discovering objects and their attributes is of great importance for autonomous agents to effectively operate in human environments. This task is particularly challenging due to the ubiquitousness of objects and all their nuances in perceptual and semantic detail. In this paper we present an unsupervised approach for learning disentangled representations of objects entirely from unlabeled monocular videos. These continuous representations are not biased by or limited by a discrete set of labels determined by human labelers. The proposed representation is trained with a metric learning loss, where objects with homogeneous features are pushed together, while those with heterogeneous features are pulled apart. 
We show these unsupervised embeddings allow to discover object attributes and can enable robots to self-supervise in previously unseen environments. We quantitatively evaluate performance on a large-scale synthetic dataset with 12k object models, as well as on a real dataset collected by a robot and show that our unsupervised object understanding generalizes to previously unseen objects. Specifically, we demonstrate the effectiveness of our approach on robotic manipulation tasks, such as pointing at and grasping of objects. An interesting and perhaps surprising finding in this approach is that given a limited set of objects, object correspondences will naturally emerge when using metric learning without requiring explicit positive pairs.", "keywords": "self-supervised robotics;object understanding;object representations;metric learning;unsupervised vision", "primary_area": "", "supplementary_material": "", "author": "Soeren Pirk;Mohi Khansari;Yunfei Bai;Corey Lynch;Pierre Sermanet", "authorids": "pirk@google.com;khansari@google.com;yunfeibai@google.com;coreylynch@google.com;sermanet@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1g6XnCcKQ", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;5;4", "wc_review": "1338;647;435", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 806.6666666666666, 385.54924746680837 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:IsWDaP1d_9cJ:scholar.google.com/&scioq=Object-Contrastive+Networks:+Unsupervised+Object+Representations&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "B1gHjoRqYQ", "title": "An Efficient and Margin-Approaching Zero-Confidence Adversarial Attack", "track": "main", "status": "Reject", "tldr": "This paper introduces MarginAttack, a stronger and faster zero-confidence adversarial attack.", "abstract": "There are two major paradigms of white-box adversarial attacks that attempt to impose input perturbations. The first paradigm, called the fix-perturbation attack, crafts adversarial samples within a given perturbation level. The second paradigm, called the zero-confidence attack, finds the smallest perturbation needed to cause misclassification, also known as the margin of an input feature. While the former paradigm is well-resolved, the latter is not. Existing zero-confidence attacks either introduce significant approximation errors, or are too time-consuming. We therefore propose MarginAttack, a zero-confidence attack framework that is able to compute the margin with improved accuracy and efficiency. Our experiments show that MarginAttack is able to compute a smaller margin than the state-of-the-art zero-confidence attacks, and matches the state-of-the-art fix-perturbation attacks. 
In addition, it runs significantly faster than the Carlini-Wagner attack, currently the most accurate zero-confidence attack algorithm.", "keywords": "adversarial attack;zero-confidence attack", "primary_area": "", "supplementary_material": "", "author": "Yang Zhang;Shiyu Chang;Mo Yu;Kaizhi Qian", "authorids": "yang.zhang2@ibm.com;shiyu.chang@ibm.com;yum@us.ibm.com;kqian3@illinois.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2019an,\ntitle={An Efficient and Margin-Approaching Zero-Confidence Adversarial Attack},\nauthor={Yang Zhang and Shiyu Chang and Mo Yu and Kaizhi Qian},\nyear={2019},\nurl={https://openreview.net/forum?id=B1gHjoRqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=B1gHjoRqYQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;5", "wc_review": "825;97;268", "wc_reply_reviewers": "130;0;28", "wc_reply_authors": "1387;268;511", "reply_reviewers": "2;0;1", "reply_authors": "6;1;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 396.6666666666667, 310.81863236013095 ], "wc_reply_reviewers_avg": [ 52.666666666666664, 55.86491644036433 ], "wc_reply_authors_avg": [ 722.0, 480.5767368485495 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 3.0, 2.160246899469287 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7761062532667484786&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1gIf305Ym", "title": "NSGA-Net: A Multi-Objective Genetic Algorithm for Neural Architecture Search", "track": "main", "status": "Reject", "tldr": "An efficient multi-objective neural architecture search algorithm using NSGA-II", "abstract": "This paper introduces NSGA-Net, an evolutionary approach for neural architecture search (NAS). NSGA-Net is designed with three goals in mind: (1) a NAS procedure for multiple, possibly conflicting, objectives, (2) efficient exploration and exploitation of the space of potential neural network architectures, and (3) output of a diverse set of network architectures spanning a trade-off frontier of the objectives in a single run. NSGA-Net is a population-based search algorithm that explores a space of potential neural network architectures in three steps, namely, a population initialization step that is based on prior-knowledge from hand-crafted architectures, an exploration step comprising crossover and mutation of architectures and finally an exploitation step that applies the entire history of evaluated neural architectures in the form of a Bayesian Network prior. Experimental results suggest that combining the objectives of minimizing both an error metric and computational complexity, as measured by FLOPS, allows NSGA-Net to find competitive neural architectures near the Pareto front of both objectives on two different tasks, object classification and object alignment. 
NSGA-Net obtains networks that achieve 3.72% (at 4.5 million FLOP) error on CIFAR-10 classification and 8.64% (at 26.6 million FLOP) error on the CMU-Car alignment task.", "keywords": "neural architecture search;evolutionary algorithms", "primary_area": "", "supplementary_material": "", "author": "Zhichao Lu;Ian Whalen;Vishnu Boddeti;Yashesh Dhebar;Kalyanmoy Deb;Erik Goodman;Wolfgang Banzhaf", "authorids": "mikelzc1990@gmail.com;whalenia@msu.edu;vishnu@msu.edu;dhebarya@egr.msu.edu;kdeb@egr.msu.edu;goodman@egr.msu.edu;banzhafw@msu.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nlu2019nsganet,\ntitle={{NSGA}-Net: A Multi-Objective Genetic Algorithm for Neural Architecture Search},\nauthor={Zhichao Lu and Ian Whalen and Vishnu Boddeti and Yashesh Dhebar and Kalyanmoy Deb and Erik Goodman and Wolfgang Banzhaf},\nyear={2019},\nurl={https://openreview.net/forum?id=B1gIf305Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gIf305Ym", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;4", "wc_review": "662;282;553", "wc_reply_reviewers": "0;0;275", "wc_reply_authors": "993;626;497", "reply_reviewers": "0;0;1", "reply_authors": "2;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 499.0, 159.76440988739222 ], "wc_reply_reviewers_avg": [ 91.66666666666667, 129.6362432175337 ], "wc_reply_authors_avg": [ 705.3333333333334, 210.1179562901647 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5797333683656608939&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1gJOoRcYQ", "title": "S3TA: A Soft, Spatial, Sequential, Top-Down Attention Model", "track": "main", "status": "Reject", "tldr": "http://sites.google.com/view/s3ta", "abstract": "We present a soft, spatial, sequential, top-down attention model (S3TA). This model uses a soft attention mechanism to bottleneck its view of the input. A recurrent core is used to generate query vectors, which actively select information from the input by correlating the query with input- and space-dependent key maps at different spatial locations.\n\nWe demonstrate the power and interpretabilty of this model under two settings. First, we build an agent which uses this attention model in RL environments and show that we can achieve performance competitive with state-of-the-art models while producing attention maps that elucidate some of the strategies used to solve the task. Second, we use this model in supervised learning tasks and show that it also achieves competitive performance and provides interpretable attention maps that show some of the underlying logic in the model's decision making.", "keywords": "Attention;RL;Top-Down;Interpretability", "primary_area": "", "supplementary_material": "", "author": "Alex Mott;Daniel Zoran;Mike Chrzanowski;Daan Wierstra;Danilo J. 
Rezende", "authorids": "alexmott@google.com;danielzoran@google.com;chrzanowskim@google.com;wierstra@google.com;danilor@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmott2019sta,\ntitle={S3{TA}: A Soft, Spatial, Sequential, Top-Down Attention Model},\nauthor={Alex Mott and Daniel Zoran and Mike Chrzanowski and Daan Wierstra and Danilo J. Rezende},\nyear={2019},\nurl={https://openreview.net/forum?id=B1gJOoRcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1gJOoRcYQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;4", "wc_review": "207;338;407", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "456;648;809", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 317.3333333333333, 82.94710496588908 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 637.6666666666666, 144.2967621104353 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3109315502561247826&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1gTE2AcKQ", "title": "From Amortised to Memoised Inference: Combining Wake-Sleep and Variational-Bayes for Unsupervised Few-Shot Program Learning", "track": "main", "status": "Withdraw", "tldr": "We extend the wake-sleep algorithm and use it to learn to learn structured models from few examples, ", "abstract": "Given a large database of concepts but only one or a few examples of each, can we learn models for each concept that are not only generalisable, but interpretable? In this work, we aim to tackle this problem through hierarchical Bayesian program induction. We present a novel learning algorithm which can infer concepts as short, generative, stochastic programs, while learning a global prior over programs to improve generalisation and a recognition network for efficient inference. Our algorithm, Wake-Sleep-Remember (WSR), combines gradient learning for continuous parameters with neurally-guided search over programs. We show that WSR learns compelling latent programs in two tough symbolic domains: cellular automata and Gaussian process kernels. We also collect and evaluate on a new dataset, Text-Concepts, for discovering structured patterns in natural text data.", "keywords": "wake-sleep;variational;amortised inference;hierarchical bayes;program learning", "primary_area": "", "supplementary_material": "", "author": "Luke B. Hewitt;Joshua B. 
Tenenbaum", "authorids": "lbh@mit.edu;jbt@mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gTE2AcKQ", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;5;5", "wc_review": "867;779;878", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 841.3333333333334, 44.304501903180096 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:k5Zx2twZtTcJ:scholar.google.com/&scioq=From+Amortised+to+Memoised+Inference:+Combining+Wake-Sleep+and+Variational-Bayes+for+Unsupervised+Few-Shot+Program+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Learning to Learn without Forgetting by Maximizing Transfer and Minimizing Interference", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1085", "id": "B1gTShAct7", "author_site": "Matt Riemer, Juan Ignacio Cases Martin, Robert Ajemian, Miao Liu, Irina Rish, Yuhai Tu, Gerald Tesauro", "tldr": "", "abstract": "Lack of performance when it comes to continual learning over non-stationary distributions of data remains a major challenge in scaling neural network learning to more human realistic settings. In this work we propose a new conceptualization of the continual learning problem in terms of a temporally symmetric trade-off between transfer and interference that can be optimized by enforcing gradient alignment across examples. We then propose a new algorithm, Meta-Experience Replay (MER), that directly exploits this view by combining experience replay with optimization based meta-learning. This method learns parameters that make interference based on future gradients less likely and transfer based on future gradients more likely. We conduct experiments across continual lifelong supervised learning benchmarks and non-stationary reinforcement learning environments demonstrating that our approach consistently outperforms recently proposed baselines for continual learning. Our experiments show that the gap between the performance of MER and baseline algorithms grows both as the environment gets more non-stationary and as the fraction of the total experiences stored gets smaller. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matthew Riemer;Ignacio Cases;Robert Ajemian;Miao Liu;Irina Rish;Yuhai Tu;and Gerald Tesauro", "authorids": "mdriemer@us.ibm.com;cases@stanford.edu;ajemian@mit.edu;miao.liu1@ibm.com;rish@us.ibm.com;yuhai@us.ibm.com;gtesauro@us.ibm.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nriemer2018learning,\ntitle={Learning to Learn without Forgetting By Maximizing Transfer and Minimizing Interference},\nauthor={Matthew Riemer and Ignacio Cases and Robert Ajemian and Miao Liu and Irina Rish and Yuhai Tu and and Gerald Tesauro},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1gTShAct7},\n}", "github": "[![github](/images/github_icon.svg) mattriemer/mer](https://github.com/mattriemer/mer) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=B1gTShAct7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;5;4", "wc_review": "578;442;579", "wc_reply_reviewers": "37;39;0", "wc_reply_authors": "1450;652;618", "reply_reviewers": "1;1;0", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 533.0, 64.34801214230838 ], "wc_reply_reviewers_avg": [ 25.333333333333332, 17.93197020841702 ], "wc_reply_authors_avg": [ 906.6666666666666, 384.4453436084539 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 979, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1577299111936747730&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B1gTShAct7", "pdf": "https://openreview.net/pdf?id=B1gTShAct7", "email": ";;;;;;", "author_num": 7 }, { "id": "B1gVRi0qFQ", "title": "Online abstraction with MDP homomorphisms for Deep Learning", "track": "main", "status": "Withdraw", "tldr": "We create abstract models of environments from experience and use them to learn new tasks faster.", "abstract": "Abstraction of Markov Decision Processes is a useful tool for solving complex problems, as it can ignore unimportant aspects of an environment, simplifying the process of learning an optimal policy. In this paper, we propose a new algorithm for finding abstract MDPs in environments with continuous state spaces. It is based on MDP homomorphisms, a structure-preserving mapping between MDPs. We demonstrate our algorithm's ability to learns abstractions from collected experience and show how to reuse the abstractions to guide exploration in new tasks the agent encounters. 
Our novel task transfer method beats a baseline based on a deep Q-network.", "keywords": "reinforcement learning;abstraction;mdp homomorphism;deep learning;robotics", "primary_area": "", "supplementary_material": "", "author": "Ondrej Biza;Robert Platt", "authorids": "bizaondr@fit.cvut.cz;rplatt@ccs.neu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1gVRi0qFQ", "pdf_size": 0, "rating": "4;5", "confidence": "3;3", "wc_review": "583;425", "wc_reply_reviewers": "0;0", "wc_reply_authors": "0;0", "reply_reviewers": "0;0", "reply_authors": "0;0", "rating_avg": [ 4.5, 0.5 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 504.0, 79.0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 3, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2340661629444747433&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "B1gWWh05Y7", "title": "Exploration in Policy Mirror Descent", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Policy optimization is a core problem in reinforcement learning. In this paper, we investigate Reversed Entropy Policy Mirror Descent (REPMD), an on-line policy optimization strategy that improves exploration behavior while assuring monotonic progress in a principled objective. REPMD conducts a form of maximum entropy exploration within a mirror descent framework, but uses an alternative policy update with a reversed KL projection. This modified formulation bypasses undesirable mode seeking behavior and avoids premature convergence to sub-optimal policies, while still supporting strong theoretical properties such as guaranteed policy improvement. 
An experimental evaluation demonstrates that this approach significantly improves practical exploration and surpasses the empirical performance of state-of-the art policy optimization methods in a set of benchmark tasks.", "keywords": "Reinforcement Learning;Exploration;Policy Optimization", "primary_area": "", "supplementary_material": "", "author": "Jincheng Mei;Chenjun Xiao;Ruitong Huang;Dale Schuurmans;Martin Muller", "authorids": "jmei2@ualberta.ca;chenjun@ualberta.ca;ruitong.huang@borealisai.com;daes@ualberta.ca;mmueller@ualberta.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=B1gWWh05Y7", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "BA-Net: Dense Bundle Adjustment Networks", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/944", "id": "B1gabhRcYX", "author_site": "Chengzhou Tang, Ping Tan", "tldr": "This paper introduces a network architecture to solve the structure-from-motion (SfM) problem via feature bundle adjustment (BA)", "abstract": "This paper introduces a network architecture to solve the structure-from-motion (SfM) problem via feature-metric bundle adjustment (BA), which explicitly enforces multi-view geometry constraints in the form of feature-metric error. The whole pipeline is differentiable, so that the network can learn suitable features that make the BA problem more tractable. Furthermore, this work introduces a novel depth parameterization to recover dense per-pixel depth. The network first generates several basis depth maps according to the input image, and optimizes the final depth as a linear combination of these basis depth maps via feature-metric BA. The basis depth maps generator is also learned via end-to-end training. The whole system nicely combines domain knowledge (i.e. hard-coded multi-view geometry constraints) and deep learning (i.e. feature learning and basis depth maps learning) to address the challenging dense SfM problem. 
Experiments on large scale real data prove the success of the proposed method.", "keywords": "Structure-from-Motion;Bundle Adjustment;Dense Depth Estimation", "primary_area": "", "supplementary_material": "", "author": "Chengzhou Tang;Ping Tan", "authorids": "cta73@sfu.ca;pingtan@sfu.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ntang2018banet,\ntitle={{BA}-Net: Dense Bundle Adjustment Networks},\nauthor={Chengzhou Tang and Ping Tan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1gabhRcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;4;4", "wc_review": "687;684;371", "wc_reply_reviewers": "19;0;0", "wc_reply_authors": "399;692;165", "reply_reviewers": "1;0;0", "reply_authors": "1;2;1", "rating_avg": [ 8.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 580.6666666666666, 148.26178049502695 ], "wc_reply_reviewers_avg": [ 6.333333333333333, 8.956685895029603 ], "wc_reply_authors_avg": [ 418.6666666666667, 215.59581525520284 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 351, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3363265714419824671&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B1gabhRcYX", "pdf": "https://openreview.net/pdf?id=B1gabhRcYX", "email": ";", "author_num": 2 }, { "id": "B1ggosR9Ym", "title": "Using Deep Siamese Neural Networks to Speed up Natural Products Research", "track": "main", "status": "Reject", "tldr": "We learn a direct mapping from NMR spectra of small molecules to a molecular structure based cluster space. ", "abstract": "Natural products (NPs, compounds derived from plants and animals) are an important source of novel disease treatments. A bottleneck in the search for new NPs is structure determination. One method is to use 2D Nuclear Magnetic Resonance (NMR) imaging, which indicates bonds between nuclei in the compound, and hence is the \"fingerprint\" of the compound. Computing a similarity score between 2D NMR spectra for a novel compound and a compound whose structure is known helps determine the structure of the novel compound. Standard approaches to this problem do not appear to scale to larger databases of compounds. Here we use deep convolutional Siamese networks to map NMR spectra to a cluster space, where similarity is given by the distance in the space. This approach results in an AUC score that is more than four times better than an approach using Latent Dirichlet Allocation.", "keywords": "clustering;deep learning;application;chemistry;natural products", "primary_area": "", "supplementary_material": "", "author": "Nicholas Roberts;Poornav S. Purushothama;Vishal T. Vasudevan;Siddarth Ravichandran;Chen Zhang;William H. Gerwick;Garrison W. 
Cottrell", "authorids": "n3robert@ucsd.edu;poornavsargoor@gmail.com;vthanvan@eng.ucsd.edu;s2ravich@eng.ucsd.edu;beowulf.zc@gmail.com;wgerwick@ucsd.edu;gary@ucsd.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nroberts2019using,\ntitle={Using Deep Siamese Neural Networks to Speed up Natural Products Research},\nauthor={Nicholas Roberts and Poornav S. Purushothama and Vishal T. Vasudevan and Siddarth Ravichandran and Chen Zhang and William H. Gerwick and Garrison W. Cottrell},\nyear={2019},\nurl={https://openreview.net/forum?id=B1ggosR9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1ggosR9Ym", "pdf_size": 0, "rating": "3;4;4", "confidence": "2;4;4", "wc_review": "258;192;481", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 310.3333333333333, 123.65094239655255 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17010153528688945172&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "Sparse Dictionary Learning by Dynamical Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/846", "id": "B1gstsCqt7", "author_site": "Tsung-Han Lin, Ping Tak P Tang", "tldr": "", "abstract": "A dynamical neural network consists of a set of interconnected neurons that interact over time continuously. It can exhibit computational properties in the sense that the dynamical system\u2019s evolution and/or limit points in the associated state space can correspond to numerical solutions to certain mathematical optimization or learning problems. Such a computational system is particularly attractive in that it can be mapped to a massively parallel computer architecture for power and throughput efficiency, especially if each neuron can rely solely on local information (i.e., local memory). Deriving gradients from the dynamical network\u2019s various states while conforming to this last constraint, however, is challenging. We show that by combining ideas of top-down feedback and contrastive learning, a dynamical network for solving the l1-minimizing dictionary learning problem can be constructed, and the true gradients for learning are provably computable by individual neurons. 
Using spiking neurons to construct our dynamical network, we present a learning process, its rigorous mathematical analysis, and numerical results on several dictionary learning problems.", "keywords": "dynamical neural networks;spiking neural networks;dynamical system;hardware friendly learning;feedback;contrastive learning;dictionary learning;sparse coding", "primary_area": "", "supplementary_material": "", "author": "Tsung-Han Lin;Ping Tak Peter Tang", "authorids": "tsung-han.lin@intel.com;peter.tang@intel.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nlin2018sparse,\ntitle={Sparse Dictionary Learning by Dynamical Neural Networks},\nauthor={Tsung-Han Lin and Ping Tak Peter Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1gstsCqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;8;9", "confidence": "4;4;4", "wc_review": "854;188;171", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "638;104;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 7.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 404.3333333333333, 318.03808297470005 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 247.33333333333334, 279.48683133358696 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7039049773024187969&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1gstsCqt7", "pdf": "https://openreview.net/pdf?id=B1gstsCqt7", "email": ";", "author_num": 2 }, { "title": "Deterministic Variational Inference for Robust Bayesian Neural Networks", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1019", "id": "B1l08oAct7", "author_site": "Anqi Wu, Sebastian Nowozin, Ted Meeds, Richard E Turner, Jos\u00e9 Miguel Hern\u00e1ndez Lobato, Alexander Gaunt", "tldr": "A method for eliminating gradient variance and automatically tuning priors for effective training of bayesian neural networks", "abstract": "Bayesian neural networks (BNNs) hold great promise as a flexible and principled solution to deal with uncertainty when learning from finite data. Among approaches to realize probabilistic inference in deep neural networks, variational Bayes (VB) is theoretically grounded, generally applicable, and computationally efficient. With wide recognition of potential advantages, why is it that variational Bayes has seen very limited practical use for BNNs in real applications? We argue that variational inference in neural networks is fragile: successful implementations require careful initialization and tuning of prior variances, as well as controlling the variance of Monte Carlo gradient estimates. We provide two innovations that aim to turn VB into a robust inference tool for Bayesian neural networks: first, we introduce a novel deterministic method to approximate moments in neural networks, eliminating gradient variance; second, we introduce a hierarchical prior for parameters and a novel Empirical Bayes procedure for automatically selecting prior variances. 
Combining these two innovations, the resulting method is highly efficient and robust. On the application of heteroscedastic regression we demonstrate good predictive performance over alternative approaches.", "keywords": "Bayesian neural network;variational inference;variational bayes;variance reduction;empirical bayes", "primary_area": "", "supplementary_material": "", "author": "Anqi Wu;Sebastian Nowozin;Edward Meeds;Richard E. Turner;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato;Alexander L. Gaunt", "authorids": "anqiw@princeton.edu;sebastian.nowozin@microsoft.com;ted.meeds@microsoft.com;ret26@cam.ac.uk;jmh233@cam.ac.uk;algaunt@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nwu2018deterministic,\ntitle={Deterministic Variational Inference for Robust Bayesian Neural Networks},\nauthor={Anqi Wu and Sebastian Nowozin and Edward Meeds and Richard E. Turner and Jose Miguel Hernandez-Lobato and Alexander L. Gaunt},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1l08oAct7},\n}", "github": "[![github](/images/github_icon.svg) Microsoft/deterministic-variational-inference](https://github.com/Microsoft/deterministic-variational-inference) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=B1l08oAct7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "5;3;3", "wc_review": "666;249;271", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "786;556;750", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 395.3333333333333, 191.6008582676207 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 697.3333333333334, 101.01265047287669 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 248, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=180186411545863756&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=B1l08oAct7", "pdf": "https://openreview.net/pdf?id=B1l08oAct7", "email": ";;;;;", "author_num": 6 }, { "id": "B1l1b205KX", "title": "Unsupervised Disentangling Structure and Appearance", "track": "main", "status": "Reject", "tldr": "We present a novel framework to learn the disentangled representation of structure and appearance in a completely unsupervised manner. ", "abstract": "It is challenging to disentangle an object into two orthogonal spaces of structure and appearance since each can influence the visual observation in a different and unpredictable way. It is rare for one to have access to a large number of data to help separate the influences. In this paper, we present a novel framework to learn this disentangled representation in a completely unsupervised manner. We address this problem in a two-branch Variational Autoencoder framework. For the structure branch, we project the latent factor into a soft structured point tensor and constrain it with losses derived from prior knowledge. This encourages the branch to distill geometry information. 
Another branch learns the complementary appearance information. The two branches form an effective framework that can disentangle object's structure-appearance representation without any human annotation. We evaluate our approach on four image datasets, on which we demonstrate the superior disentanglement and visual analogy quality both in synthesis and real-world data. We are able to generate photo-realistic images with 256*256 resolution that are clearly disentangled in structure and appearance.", "keywords": "disentangled representations;VAE;generative models;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Wayne Wu;Kaidi Cao;Cheng Li;Chen Qian;Chen Change Loy", "authorids": "~Wayne_Wu1;kaidicao@cs.stanford.edu;chengli@sensetime.com;qianchen@sensetime.com;ccloy225@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nwu2019unsupervised,\ntitle={Unsupervised Disentangling Structure and Appearance},\nauthor={Wayne Wu and Kaidi Cao and Cheng Li and Chen Qian and Chen Change Loy},\nyear={2019},\nurl={https://openreview.net/forum?id=B1l1b205KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1l1b205KX", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "wc_review": "772;261;106", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 379.6666666666667, 284.546813176477 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WucP6rd8-SAJ:scholar.google.com/&scioq=Unsupervised+Disentangling+Structure+and+Appearance&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1l6e3RcF7", "title": "A Walk with SGD: How SGD Explores Regions of Deep Network Loss?", "track": "main", "status": "Reject", "tldr": "", "abstract": "The non-convex nature of the loss landscape of deep neural networks (DNN) lends them the intuition that over the course of training, stochastic optimization algorithms explore different regions of the loss surface by entering and escaping many local minima due to the noise induced by mini-batches. But is this really the case? This question couples the geometry of the DNN loss landscape with how stochastic optimization algorithms like SGD interact with it during training. Answering this question may help us qualitatively understand the dynamics of deep neural network optimization. We show evidence through qualitative and quantitative experiments that mini-batch SGD rarely crosses barriers during DNN optimization. As we show, the mini-batch induced noise helps SGD explore different regions of the loss surface using a seemingly different mechanism. To complement this finding, we also investigate the qualitative reason behind the slowing down of this exploration when using larger batch-sizes. 
We show this happens because gradients from larger batch-sizes align more with the top eigenvectors of the Hessian, which makes SGD oscillate in the proximity of the parameter initialization, thus preventing exploration.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chen Xing;Devansh Arpit;Christos Tsirigotis;Yoshua Bengio", "authorids": "xingchen1113@gmail.com;devansharpit@gmail.com;tsirif@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nxing2019a,\ntitle={A Walk with {SGD}: How {SGD} Explores Regions of Deep Network Loss?},\nauthor={Chen Xing and Devansh Arpit and Christos Tsirigotis and Yoshua Bengio},\nyear={2019},\nurl={https://openreview.net/forum?id=B1l6e3RcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1l6e3RcF7", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;5", "wc_review": "512;401;1074", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 662.3333333333334, 294.59840838372196 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1432126185912300295&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "title": "Ordered Neurons: Integrating Tree Structures into Recurrent Neural Networks", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/660", "id": "B1l6qiR5F7", "author_site": "Yikang Shen, Shawn Tan, Alessandro Sordoni, Aaron Courville", "tldr": "We introduce a new inductive bias that integrates tree structures in recurrent neural networks.", "abstract": "Natural language is hierarchically structured: smaller units (e.g., phrases) are nested within larger units (e.g., clauses). When a larger constituent ends, all of the smaller constituents that are nested within it must also be closed. While the standard LSTM architecture allows different neurons to track information at different time scales, it does not have an explicit bias towards modeling a hierarchy of constituents. This paper proposes to add such inductive bias by ordering the neurons; a vector of master input and forget gates ensures that when a given neuron is updated, all the neurons that follow it in the ordering are also updated. 
Our novel recurrent architecture, ordered neurons LSTM (ON-LSTM), achieves good performance on four different tasks: language modeling, unsupervised parsing, targeted syntactic evaluation, and logical inference.", "keywords": "Deep Learning;Natural Language Processing;Recurrent Neural Networks;Language Modeling", "primary_area": "", "supplementary_material": "", "author": "Yikang Shen;Shawn Tan;Alessandro Sordoni;Aaron Courville", "authorids": "yikang.shn@gmail.com;shawn@wtf.sg;alsordon@microsoft.com;aaron.courville@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nshen2018ordered,\ntitle={Ordered Neurons: Integrating Tree Structures into Recurrent Neural Networks},\nauthor={Yikang Shen and Shawn Tan and Alessandro Sordoni and Aaron Courville},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1l6qiR5F7},\n}", "github": "[![github](/images/github_icon.svg) yikangshen/Ordered-Neurons](https://github.com/yikangshen/Ordered-Neurons) + [![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=B1l6qiR5F7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;8;9", "confidence": "3;4;4", "wc_review": "101;736;595", "wc_reply_reviewers": "221;0;6", "wc_reply_authors": "232;308;232", "reply_reviewers": "1;0;1", "reply_authors": "2;1;1", "rating_avg": [ 8.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 477.3333333333333, 272.2625366973739 ], "wc_reply_reviewers_avg": [ 75.66666666666667, 102.7953738691041 ], "wc_reply_authors_avg": [ 257.3333333333333, 35.82674358011841 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 418, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18012332994072296158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=B1l6qiR5F7", "pdf": "https://openreview.net/pdf?id=B1l6qiR5F7", "email": ";;;", "author_num": 4 }, { "id": "B1l8SsR9Fm", "title": "Learning and Data Selection in Big Datasets", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Finding a dataset of minimal cardinality to characterize the optimal parameters of a model is of paramount importance in machine learning and distributed optimization over a network. This paper investigates the compressibility of large datasets. More specifically, we propose a framework that jointly learns the input-output mapping as well as the most representative samples of the dataset (sufficient dataset). Our analytical results show that the cardinality of the sufficient dataset increases sub-linearly with respect to the original dataset size. Numerical evaluations of real datasets reveal a large compressibility, up to 95%, without a noticeable drop in the learnability performance, measured by the generalization error.\n", "keywords": "Data selection;non-convex optimization;learning theory;active learning", "primary_area": "", "supplementary_material": "", "author": "Hossein S. 
Ghadikolaei;Hadi Ghauch;Carlo Fischione;Mikael Skoglund", "authorids": "hshokri@kth.se;ghauch@kth.se;carlofi@kth.se;skoglund@kth.se", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1l8SsR9Fm", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;3", "wc_review": "361;535;440", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 445.3333333333333, 71.13523896229084 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10470233448992585915&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "B1l8iiA9tQ", "title": "Backdrop: Stochastic Backpropagation", "track": "main", "status": "Reject", "tldr": "We introduce backdrop, intuitively described as dropout acting on the backpropagation pipeline and find significant improvements in generalization for problems with non-decomposable losses and problems with multi-scale, hierarchical data structure.", "abstract": "We introduce backdrop, a flexible and simple-to-implement method, intuitively described as dropout acting only along the backpropagation pipeline. Backdrop is implemented via one or more masking layers which are inserted at specific points along the network. Each backdrop masking layer acts as the identity in the forward pass, but randomly masks parts of the backward gradient propagation. Intuitively, inserting a backdrop layer after any convolutional layer leads to stochastic gradients corresponding to features of that scale. Therefore, backdrop is well suited for problems in which the data have a multi-scale, hierarchical structure. Backdrop can also be applied to problems with non-decomposable loss functions where standard SGD methods are not well suited. 
We perform a number of experiments and demonstrate that backdrop leads to significant improvements in generalization.", "keywords": "stochastic optimization;multi-scale data analysis;non-decomposable loss;generalization;one-shot learning", "primary_area": "", "supplementary_material": "", "author": "Siavash Golkar;Kyle Cranmer", "authorids": "siavash.golkar@gmail.com;kyle.cranmer@nyu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngolkar2019backdrop,\ntitle={Backdrop: Stochastic Backpropagation},\nauthor={Siavash Golkar and Kyle Cranmer},\nyear={2019},\nurl={https://openreview.net/forum?id=B1l8iiA9tQ},\n}", "github": "[![github](/images/github_icon.svg) dexgen/backdrop](https://github.com/dexgen/backdrop)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1l8iiA9tQ", "pdf_size": 0, "rating": "3;5;5", "confidence": "3;3;3", "wc_review": "195;388;150", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 244.33333333333334, 103.23543749873662 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3308266295637318805&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "B1l9qsA5KQ", "title": "Mental Fatigue Monitoring using Brain Dynamics Preferences", "track": "main", "status": "Reject", "tldr": "", "abstract": "Driver's cognitive state of mental fatigue significantly affects driving performance and more importantly public safety. Previous studies leverage the response time (RT) as the metric for mental fatigue and aim at estimating the exact value of RT using electroencephalogram (EEG) signals within a regression model. However, due to the easily corrupted EEG signals and also non-smooth RTs during data collection, regular regression methods generally suffer from poor generalization performance. Considering that human response time is the reflection of brain dynamics preference rather than a single value, a novel model called Brain Dynamic ranking (BDrank) has been proposed. BDrank could learn from brain dynamics preferences using EEG data robustly and preserve the ordering corresponding to RTs. BDrank model is based on the regularized alternative ordinal classification comparing to regular regression based practices. Furthermore, a transition matrix is introduced to characterize the reliability of each channel used in EEG data, which helps in learning brain dynamics preferences only from informative EEG channels. In order to handle large-scale EEG signals~and obtain higher generalization, an online-generalized Expectation Maximum (OnlineGEM) algorithm also has been proposed to update BDrank in an online fashion. 
Comprehensive empirical analysis on EEG signals from 44 participants shows that BDrank together with OnlineGEM achieves substantial improvements in reliability while simultaneously detecting possible less informative and noisy EEG channels.", "keywords": "mental fatigue;brain dynamics preference;brain dynamics ranking;channel reliability;channel Selection", "primary_area": "", "supplementary_material": "", "author": "Yuangang Pan;Avinash K Singh;Ivor W. Tsang;Chin-teng Lin", "authorids": "yuangang.pan@student.uts.edu.au;avinashsingh@outlook.com;ivor.tsang@uts.edu.au;chin-teng.lin@uts.edu.au", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\npan2019mental,\ntitle={Mental Fatigue Monitoring using Brain Dynamics Preferences},\nauthor={Yuangang Pan and Avinash K Singh and Ivor W. Tsang and Chin-teng Lin},\nyear={2019},\nurl={https://openreview.net/forum?id=B1l9qsA5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1l9qsA5KQ", "pdf_size": 0, "rating": "2;4;7", "confidence": "5;3;3", "wc_review": "116;491;244", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "842;1372;1023", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 4.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 283.6666666666667, 155.64132556047645 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1079.0, 219.96514875467582 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8029550685469663, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:im8Omt8sgxUJ:scholar.google.com/&scioq=Mental+Fatigue+Monitoring+using+Brain+Dynamics+Preferences&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1lG42C9Km", "title": "Intrinsic Social Motivation via Causal Influence in Multi-Agent RL", "track": "main", "status": "Reject", "tldr": "We reward agents for having a causal influence on the actions of other agents, and show that this gives rise to better cooperation and more meaningful emergent communication protocols. ", "abstract": "We derive a new intrinsic social motivation for multi-agent reinforcement learning (MARL), in which agents are rewarded for having causal influence over another agent's actions, where causal influence is assessed using counterfactual reasoning. The reward does not depend on observing another agent's reward function, and is thus a more realistic approach to MARL than taken in previous work. We show that the causal influence reward is related to maximizing the mutual information between agents' actions. We test the approach in challenging social dilemma environments, where it consistently leads to enhanced cooperation between agents and higher collective reward. Moreover, we find that rewarding influence can lead agents to develop emergent communication protocols. Therefore, we also employ influence to train agents to use an explicit communication channel, and find that it leads to more effective communication and higher collective reward. Finally, we show that influence can be computed by equipping each agent with an internal model that predicts the actions of other agents. 
This allows the social influence reward to be computed without the use of a centralised controller, and as such represents a significantly more general and scalable inductive bias for MARL with independent agents.", "keywords": "multi-agent reinforcement learning;causal inference;game theory;social dilemma;intrinsic motivation;counterfactual reasoning;empowerment;communication", "primary_area": "", "supplementary_material": "", "author": "Natasha Jaques;Angeliki Lazaridou;Edward Hughes;Caglar Gulcehre;Pedro A. Ortega;DJ Strouse;Joel Z. Leibo;Nando de Freitas", "authorids": "jaquesn@mit.edu;angeliki@google.com;edwardhughes@google.com;caglarg@google.com;pedroortega@google.com;danieljstrouse@gmail.com;jzl@google.com;nandodefreitas@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\njaques2019intrinsic,\ntitle={Intrinsic Social Motivation via Causal Influence in Multi-Agent {RL}},\nauthor={Natasha Jaques and Angeliki Lazaridou and Edward Hughes and Caglar Gulcehre and Pedro A. Ortega and DJ Strouse and Joel Z. Leibo and Nando de Freitas},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lG42C9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1lG42C9Km", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;3;3", "wc_review": "630;508;377", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1212;400;417", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 505.0, 103.30859919032233 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 676.3333333333334, 378.8371095280339 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15520747228778153089&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Eidetic 3D LSTM: A Model for Video Prediction and Beyond", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/997", "id": "B1lKS2AqtX", "author_site": "Yunbo Wang, Lu Jiang, Ming-Hsuan Yang, Li-Jia Li, Mingsheng Long, Li Fei-Fei", "tldr": "", "abstract": "Spatiotemporal predictive learning, though long considered to be a promising self-supervised feature learning method, seldom shows its effectiveness beyond future video prediction. The reason is that it is difficult to learn good representations for both short-term frame dependency and long-term high-level relations. We present a new model, Eidetic 3D LSTM (E3D-LSTM), that integrates 3D convolutions into RNNs. The encapsulated 3D-Conv makes local perceptrons of RNNs motion-aware and enables the memory cell to store better short-term features. For long-term relations, we make the present memory state interact with its historical records via a gate-controlled self-attention module. We describe this memory transition mechanism eidetic as it is able to effectively recall the stored memories across multiple time stamps even after long periods of disturbance. 
We first evaluate the E3D-LSTM network on widely-used future video prediction datasets and achieve the state-of-the-art performance. Then we show that the E3D-LSTM network also performs well on the early activity recognition to infer what is happening or what will happen after observing only limited frames of video. This task aligns well with video prediction in modeling action intentions and tendency.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yunbo Wang;Lu Jiang;Ming-Hsuan Yang;Li-Jia Li;Mingsheng Long;Li Fei-Fei", "authorids": "yunbowang1989@gmail.com;lujiang@google.com;mhyang@ucmerced.edu;lijiali@google.com;mingsheng@tsinghua.edu.cn;feifeili@cs.stanford.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nwang2018eidetic,\ntitle={Eidetic 3D {LSTM}: A Model for Video Prediction and Beyond},\nauthor={Yunbo Wang and Lu Jiang and Ming-Hsuan Yang and Li-Jia Li and Mingsheng Long and Li Fei-Fei},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lKS2AqtX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=B1lKS2AqtX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;5", "wc_review": "692;284;419", "wc_reply_reviewers": "99;108;0", "wc_reply_authors": "678;654;286", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 465.0, 169.7115199389835 ], "wc_reply_reviewers_avg": [ 69.0, 48.92851929090027 ], "wc_reply_authors_avg": [ 539.3333333333334, 179.40147404324438 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 524, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1521149270382251505&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=B1lKS2AqtX", "pdf": "https://openreview.net/pdf?id=B1lKS2AqtX", "email": ";;;;;", "author_num": 6 }, { "id": "B1lKtjA9FQ", "title": "Overfitting Detection of Deep Neural Networks without a Hold Out Set", "track": "main", "status": "Reject", "tldr": "We introduce and analyze several criteria for detecting overfitting.", "abstract": "Overfitting is an ubiquitous problem in neural network training and usually mitigated using a holdout data set.\nHere we challenge this rationale and investigate criteria for overfitting without using a holdout data set.\nSpecifically, we train a model for a fixed number of epochs multiple times with varying fractions of randomized labels and for a range of regularization strengths. \nA properly trained model should not be able to attain an accuracy greater than the fraction of properly labeled data points. Otherwise the model overfits. \nWe introduce two criteria for detecting overfitting and one to detect underfitting. We analyze early stopping, the regularization factor, and network depth.\nIn safety critical applications we are interested in models and parameter settings which perform well and are not likely to overfit. 
The methods of this paper allow characterizing and identifying such models.", "keywords": "deep learning;overfitting;generalization;memorization", "primary_area": "", "supplementary_material": "", "author": "Konrad Groh", "authorids": "konrad.groh@de.bosch.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ngroh2019overfitting,\ntitle={Overfitting Detection of Deep Neural Networks without a Hold Out Set},\nauthor={Konrad Groh},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lKtjA9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1lKtjA9FQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;3", "wc_review": "725;315;191", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 410.3333333333333, 228.18900548059327 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:iqAElxxoAB0J:scholar.google.com/&scioq=Overfitting+Detection+of+Deep+Neural+Networks+without+a+Hold+Out+Set&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1lXGnRctX", "title": "Classification in the dark using tactile exploration", "track": "main", "status": "Reject", "tldr": "In this work, we study the problem of learning representations to identify novel objects by exploring objects using tactile sensing. The key point here is that the query is provided in the image domain.", "abstract": "Combining information from different sensory modalities to execute goal-directed actions is a key aspect of human intelligence. Specifically, human agents are very easily able to translate the task communicated in one sensory domain (say vision) into a representation that enables them to complete this task when they can only sense their environment using a separate sensory modality (say touch). In order to build agents with similar capabilities, in this work we consider the problem of retrieving a target object from a drawer. The agent is provided with an image of a previously unseen object and it explores objects in the drawer using only tactile sensing to retrieve the object that was shown in the image without receiving any visual feedback. Success at this task requires close integration of visual and tactile sensing. We present a method for performing this task in a simulated environment using an anthropomorphic hand. 
We hope that future research in the direction of combining sensory signals for acting will find the object retrieval from a drawer to be a useful benchmark problem", "keywords": "tactile sensing;multimodal representations;vision;object identification", "primary_area": "", "supplementary_material": "", "author": "Mayur Mudigonda;Blake Tickell;Pulkit Agrawal", "authorids": "mudigonda@berkeley.edu;btickell@berkeley.edu;pulkitag@berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmudigonda2019classification,\ntitle={Classification in the dark using tactile exploration},\nauthor={Mayur Mudigonda and Blake Tickell and Pulkit Agrawal},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lXGnRctX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1lXGnRctX", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;5;3", "wc_review": "275;149;214", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 212.66666666666666, 51.44792404838983 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:aT5WEZHb6WkJ:scholar.google.com/&scioq=Classification+in+the+dark+using+tactile+exploration&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "B1lf43A5Y7", "title": "How to learn (and how not to learn) multi-hop reasoning with memory networks", "track": "main", "status": "Withdraw", "tldr": "Memory Networks do not learn multi-hop reasoning unless we supervise them.", "abstract": "Answering questions about a text frequently requires aggregating information from multiple places in that text. End-to-end neural network models, the dominant approach in the current literature, can theoretically learn how to distill and manipulate representations of the text without explicit supervision about how to do so. We investigate a canonical architecture for this task, the memory network, and analyze how effective it really is in the context of three multi-hop reasoning settings. In a simple synthetic setting, the path-finding task of the bAbI dataset, the model fails to learn the correct reasoning without additional supervision of its attention mechanism. However, with this supervision, it can perform well. On a real text dataset, WikiHop, the memory network gives nearly state-of-the-art performance, but does so without using its multi-hop capabilities. A tougher anonymized version of the WikiHop dataset is qualitatively similar to bAbI: the model fails to perform well unless it has additional supervision. 
We hypothesize that many \"multi-hop\" architectures do not truly learn this reasoning as advertised, though they could learn this reasoning if appropriately supervised.", "keywords": "NLP;Reading Comprehension;Memory Networks;Multi-hop Reasoning", "primary_area": "", "supplementary_material": "", "author": "Jifan Chen;Greg Durrett", "authorids": "jf_chen@utexas.edu;gdurrett@cs.utexas.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1lf43A5Y7", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;4;5", "wc_review": "230;582;230", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 347.3333333333333, 165.93439131844315 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9898192883699445808&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1lfHhR9tm", "title": "The Natural Language Decathlon: Multitask Learning as Question Answering", "track": "main", "status": "Reject", "tldr": "We introduce a multitask learning challenge that spans ten natural language processing tasks and propose a new model that jointly learns them. ", "abstract": "Deep learning has improved performance on many natural language processing (NLP) tasks individually.\nHowever, general NLP models cannot emerge within a paradigm that focuses on the particularities of a single metric, dataset, and task.\nWe introduce the Natural Language Decathlon (decaNLP), a challenge that spans ten tasks:\nquestion answering, machine translation, summarization, natural language inference, sentiment analysis, semantic role labeling, relation extraction, goal-oriented dialogue, semantic parsing, and commonsense pronoun resolution.\nWe cast all tasks as question answering over a context.\nFurthermore, we present a new multitask question answering network (MQAN) that jointly learns all tasks in decaNLP without any task-specific modules or parameters more effectively than sequence-to-sequence and reading comprehension baselines.\nMQAN shows improvements in transfer learning for machine translation and named entity recognition, domain adaptation for sentiment analysis and natural language inference, and zero-shot capabilities for text classification.\nWe demonstrate that the MQAN's multi-pointer-generator decoder is key to this success and that performance further improves with an anti-curriculum training strategy.\nThough designed for decaNLP, MQAN also achieves state of the art results on the WikiSQL semantic parsing task in the single-task setting. 
\nWe also release code for procuring and processing data, training and evaluating models, and reproducing all experiments for decaNLP.", "keywords": "multitask learning;natural language processing;question answering;machine translation;relation extraction;semantic parsing;commensense reasoning;summarization;entailment;sentiment;dialog", "primary_area": "", "supplementary_material": "", "author": "Bryan McCann;Nitish Shirish Keskar;Caiming Xiong;Richard Socher", "authorids": "bmccann@salesforce.com;nkeskar@salesforce.com;cxiong@salesforce.com;rsocher@salesforce.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmccann2019the,\ntitle={The Natural Language Decathlon: Multitask Learning as Question Answering},\nauthor={Bryan McCann and Nitish Shirish Keskar and Caiming Xiong and Richard Socher},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lfHhR9tm},\n}", "github": "[![github](/images/github_icon.svg) salesforce/decaNLP](https://github.com/salesforce/decaNLP) + [![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=B1lfHhR9tm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1lfHhR9tm", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;3;4", "wc_review": "724;416;213", "wc_reply_reviewers": "1041;469;76", "wc_reply_authors": "650;322;446", "reply_reviewers": "3;3;1", "reply_authors": "1;1;2", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 451.0, 210.07776337981767 ], "wc_reply_reviewers_avg": [ 528.6666666666666, 396.21234486348635 ], "wc_reply_authors_avg": [ 472.6666666666667, 135.2265588640864 ], "reply_reviewers_avg": [ 2.3333333333333335, 0.9428090415820634 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 728, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11150838077444944380&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "B1liWh09F7", "title": "SALSA-TEXT : SELF ATTENTIVE LATENT SPACE BASED ADVERSARIAL TEXT GENERATION", "track": "main", "status": "Withdraw", "tldr": "We propose a self-attention based GAN architecture for unconditional text generation and improve on previous adversarial code-based results.", "abstract": "Inspired by the success of self attention mechanism and Transformer architecture\nin sequence transduction and image generation applications, we propose novel self\nattention-based architectures to improve the performance of adversarial latent code-\nbased schemes in text generation. Adversarial latent code-based text generation\nhas recently gained a lot of attention due to their promising results. In this paper,\nwe take a step to fortify the architectures used in these setups, specifically AAE\nand ARAE. We benchmark two latent code-based methods (AAE and ARAE)\ndesigned based on adversarial setups. In our experiments, the Google sentence\ncompression dataset is utilized to compare our method with these methods using\nvarious objective and subjective measures. 
The experiments demonstrate the\nproposed (self) attention-based models outperform the state-of-the-art in adversarial\ncode-based text generation.", "keywords": "Self-attention;Transformer;generative adversarial networks;GAN;neural text generation;NTG;generative models", "primary_area": "", "supplementary_material": "", "author": "Jules Gagnon-Marchand;Hamed Sadeghi;Mehdi Rezagholizadeh;Md. Akmal Haider", "authorids": "jgagnonmarchand@gmail.com;haamed.sadeghi@gmail.com;mehdi.rezagholizadeh@gmail.com;md.akmal.haidar@huawei.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1liWh09F7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;4", "wc_review": "319;699;492", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 503.3333333333333, 155.34120151746248 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3539884215096070269&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "B1lnjo05Km", "title": "Graph Spectral Regularization For Neural Network Interpretability", "track": "main", "status": "Reject", "tldr": "Imposing graph structure on neural network layers for improved visual interpretability.", "abstract": "Deep neural networks can learn meaningful representations of data. However, these representations are hard to interpret. For example, visualizing a latent layer is generally only possible for at most three dimensions. Neural networks are able to learn and benefit from much higher dimensional representations but these are not visually interpretable because nodes have arbitrary ordering within a layer. Here, we utilize the ability of the human observer to identify patterns in structured representations to visualize higher dimensions. To do so, we propose a class of regularizations we call \\textit{Graph Spectral Regularizations} that impose graph-structure on latent layers. This is achieved by treating activations as signals on a predefined graph and constraining those activations using graph filters, such as low pass and wavelet-like filters. This framework allows for any kind of graph as well as filter to achieve a wide range of structured regularizations depending on the inference needs of the data. First, we show a synthetic example that the graph-structured layer can reveal topological features of the data. Next, we show that a smoothing regularization can impose semantically consistent ordering of nodes when applied to capsule nets. Further, we show that the graph-structured layer, using wavelet-like spatially localized filters, can form localized receptive fields for improved image and biomedical data interpretation. In other words, the mapping between latent layer, neurons and the output space becomes clear due to the localization of the activations. 
Finally, we show that when structured as a grid, the representations create coherent images that allow for image-processing techniques such as convolutions.", "keywords": "autoencoder;interpretable;graph signal processing;graph spectrum;graph filter;capsule", "primary_area": "", "supplementary_material": "", "author": "Alexander Tong;David van Dijk;Jay Stanley;Guy Wolf;Smita Krishnaswamy", "authorids": "alexander.tong@yale.edu;david.vandijk@yale.edu;jay.stanley@yale.edu;guy.wolf@yale.edu;smita.krishnaswamy@yale.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntong2019graph,\ntitle={Graph Spectral Regularization For Neural Network Interpretability},\nauthor={Alexander Tong and David van Dijk and Jay Stanley and Guy Wolf and Smita Krishnaswamy},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lnjo05Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1lnjo05Km", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;3;4", "wc_review": "445;248;233", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 308.6666666666667, 96.59652627754731 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13199596134586539294&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "ALISTA: Analytic Weights Are As Good As Learned Weights in LISTA", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/803", "id": "B1lnzn0ctQ", "author_site": "Jialin Liu, Xiaohan Chen, Zhangyang Wang, Wotao Yin", "tldr": "", "abstract": "Deep neural networks based on unfolding an iterative algorithm, for example, LISTA (learned iterative shrinkage thresholding algorithm), have been an empirical success for sparse signal recovery. The weights of these neural networks are currently determined by data-driven \u201cblack-box\u201d training. In this work, we propose Analytic LISTA (ALISTA), where the weight matrix in LISTA is computed as the solution to a data-free optimization problem, leaving only the stepsize and threshold parameters to data-driven learning. This signi\ufb01cantly simpli\ufb01es the training. Speci\ufb01cally, the data-free optimization problem is based on coherence minimization. We show our ALISTA retains the optimal linear convergence proved in (Chen et al., 2018) and has a performance comparable to LISTA. Furthermore, we extend ALISTA to convolutional linear operators, again determined in a data-free manner. 
We also propose a feed-forward framework that combines the data-free optimization and ALISTA networks from end to end, one that can be jointly trained to gain robustness to small perturbations in the encoding model.", "keywords": "sparse recovery;neural networks", "primary_area": "", "supplementary_material": "", "author": "Jialin Liu;Xiaohan Chen;Zhangyang Wang;Wotao Yin", "authorids": "liujl11@math.ucla.edu;chernxh@tamu.edu;atlaswang@tamu.edu;wotaoyin@math.ucla.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nliu2018alista,\ntitle={{ALISTA}: Analytic Weights Are As Good As Learned Weights in {LISTA}},\nauthor={Jialin Liu and Xiaohan Chen and Zhangyang Wang and Wotao Yin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lnzn0ctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;9;10", "confidence": "4;5;5", "wc_review": "458;553;177", "wc_reply_reviewers": "42;0;0", "wc_reply_authors": "1393;674;24", "reply_reviewers": "1;0;0", "reply_authors": "3;2;1", "rating_avg": [ 8.666666666666666, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 396.0, 159.63917647829015 ], "wc_reply_reviewers_avg": [ 14.0, 19.79898987322333 ], "wc_reply_authors_avg": [ 697.0, 559.1284885128522 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9449111825230678, "gs_citation": 227, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7357754868208950005&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1lnzn0ctQ", "pdf": "https://openreview.net/pdf?id=B1lnzn0ctQ", "email": ";;;", "author_num": 4 }, { "id": "B1lwSsC5KX", "title": "D\u00e9j\u00e0 Vu: An Empirical Evaluation of the Memorization Properties of Convnets", "track": "main", "status": "Reject", "tldr": "We analyze a convnet's memorization of its training set and propose several use-cases where we can extract some information about the training set. ", "abstract": "Convolutional neural networks memorize part of their training data, which is why strategies such as data augmentation and drop-out are employed to mitigate overfitting. This paper considers the related question of \u201cmembership inference\u201d, where the goal is to determine if an image was used during training. We consider membership tests over either ensembles of samples or individual samples.\nFirst, we show how to detect if a dataset was used to train a model, and in particular whether some validation images were used at train time. Then, we introduce a new approach to infer membership when a few of the top layers are not available or have been fine-tuned, and show that lower layers still carry information about the training samples. 
To support our findings, we conduct large-scale experiments on Imagenet and subsets of YFCC-100M with modern architectures such as VGG and Resnet.\n", "keywords": "membership inference;memorization;attack;privacy", "primary_area": "", "supplementary_material": "", "author": "Alexandre Sablayrolles;Matthijs Douze;Cordelia Schmid;Herv\u00e9 J\u00e9gou", "authorids": "asablayrolles@fb.com;matthijs@fb.com;cordelia.schmid@inria.fr;rvj@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsablayrolles2019dj,\ntitle={D\u00e9j\u00e0 Vu: An Empirical Evaluation of the Memorization Properties of Convnets},\nauthor={Alexandre Sablayrolles and Matthijs Douze and Cordelia Schmid and Herv\u00e9 J\u00e9gou},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lwSsC5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1lwSsC5KX", "pdf_size": 0, "rating": "4;5;6", "confidence": "2;4;2", "wc_review": "1158;929;540", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "847;795;496", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 875.6666666666666, 255.10041595854412 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 712.6666666666666, 154.67025857898113 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10533089125928602338&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "B1lx42A9Ym", "title": "Neural Rendering Model: Joint Generation and Prediction for Semi-Supervised Learning", "track": "main", "status": "Reject", "tldr": "We develop a new deep generative model for semi-supervised learning and propose a new Max-Min cross-entropy for training CNNs.", "abstract": "Unsupervised and semi-supervised learning are important problems that are especially challenging with complex data like natural images. Progress on these problems would accelerate if we had access to appropriate generative models under which to pose the associated inference tasks. Inspired by the success of Convolutional Neural Networks (CNNs) for supervised prediction in images, we design the Neural Rendering Model (NRM), a new hierarchical probabilistic generative model whose inference calculations correspond to those in a CNN. The NRM introduces a small set of latent variables at each level of the model and enforces dependencies among all the latent variables via a conjugate prior distribution. The conjugate prior yields a new regularizer for learning based on the paths rendered in the generative model for training CNNs\u2013the Rendering Path Normalization (RPN). We demonstrate that this regularizer improves generalization both in theory and in practice. 
Likelihood estimation in the NRM yields the new Max-Min cross-entropy training loss, which suggests a new deep network architecture\u2013the Max-Min network\u2013which exceeds or matches the state of the art for semi-supervised and supervised learning on SVHN, CIFAR10, and CIFAR100.", "keywords": "neural nets;generative models;semi-supervised learning;cross-entropy", "primary_area": "", "supplementary_material": "", "author": "Nhat Ho;Tan Nguyen;Ankit B. Patel;Anima Anandkumar;Michael I. Jordan;Richard G. Baraniuk", "authorids": "minhnhat@berkeley.edu;mn15@rice.edu;ankit.patel@bcm.edu;anima@caltech.edu;jordan@cs.berkeley.edu;richb@rice.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nho2019neural,\ntitle={Neural Rendering Model: Joint Generation and Prediction for Semi-Supervised Learning},\nauthor={Nhat Ho and Tan Nguyen and Ankit B. Patel and Anima Anandkumar and Michael I. Jordan and Richard G. Baraniuk},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lx42A9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=B1lx42A9Ym", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;3;3", "wc_review": "274;241;285", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "410;1037;518", "reply_reviewers": "0;0;0", "reply_authors": "1;5;2", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 266.6666666666667, 18.696404883173543 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 655.0, 273.6896052099897 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.6666666666666665, 1.699673171197595 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15682554850242668140&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "B1lxH20qtX", "title": "Learning to control self-assembling morphologies: a study of generalization via modularity", "track": "main", "status": "Reject", "tldr": "Learning to control self-assembling agents via dynamic graph networks", "abstract": "Much of contemporary sensorimotor learning assumes that one is already given a complex agent (e.g., a robotic arm) and the goal is to learn to control it. In contrast, this paper investigates a modular co-evolution strategy: a collection of primitive agents learns to self-assemble into increasingly complex collectives in order to solve control tasks. Each primitive agent consists of a limb and a neural controller. Limbs may choose to link up to form collectives, with linking being treated as a dynamic action. When two limbs link, a joint is added between them, actuated by the 'parent' limb's controller. This forms a new 'single' agent, which may further link with other agents. In this way, complex morphologies can emerge, controlled by a policy whose architecture is in explicit correspondence with the morphology. In experiments, we demonstrate that agents with these modular and dynamic topologies generalize better to test-time environments compared to static and monolithic baselines. 
Project videos are available at https://doubleblindICLR19.github.io/self-assembly/", "keywords": "modularity;compostionality;graphs;dynamics;network", "primary_area": "", "supplementary_material": "", "author": "Deepak Pathak;Chris Lu;Trevor Darrell;Philip Isola;Alexei A. Efros", "authorids": "pathak@berkeley.edu;chris.lu@berkeley.edu;trevor@eecs.berkeley.edu;phillip.isola@gmail.com;efros@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\npathak2019learning,\ntitle={Learning to control self-assembling morphologies: a study of generalization via modularity},\nauthor={Deepak Pathak and Chris Lu and Trevor Darrell and Philip Isola and Alexei A. Efros},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lxH20qtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1lxH20qtX", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;3;3", "wc_review": "416;128;202", "wc_reply_reviewers": "53;0;0", "wc_reply_authors": "1359;659;733", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 248.66666666666666, 122.11833969100992 ], "wc_reply_reviewers_avg": [ 17.666666666666668, 24.984439601924677 ], "wc_reply_authors_avg": [ 917.0, 313.9978768505715 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 149, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6230712298907925889&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "title": "Three Mechanisms of Weight Decay Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1120", "id": "B1lz-3Rct7", "author_site": "Guodong Zhang, Chaoqi Wang, Bowen Xu, Roger Grosse", "tldr": "We investigate weight decay regularization for different optimizers and identify three distinct mechanisms by which weight decay improves generalization.", "abstract": "Weight decay is one of the standard tricks in the neural network toolbox, but the reasons for its regularization effect are poorly understood, and recent results have cast doubt on the traditional interpretation in terms of $L_2$ regularization.\nLiteral weight decay has been shown to outperform $L_2$ regularization for optimizers for which they differ. \nWe empirically investigate weight decay for three optimization algorithms (SGD, Adam, and K-FAC) and a variety of network architectures. We identify three distinct mechanisms by which weight decay exerts a regularization effect, depending on the particular optimization algorithm and architecture: (1) increasing the effective learning rate, (2) approximately regularizing the input-output Jacobian norm, and (3) reducing the effective damping coefficient for second-order optimization. 
\nOur results provide insight into how to improve the regularization of neural networks.", "keywords": "Generalization;Regularization;Optimization", "primary_area": "", "supplementary_material": "", "author": "Guodong Zhang;Chaoqi Wang;Bowen Xu;Roger Grosse", "authorids": "gdzhang.cs@gmail.com;cqwang@cs.toronto.edu;bowenxu@cs.toronto.com;rgrosse@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzhang2018three,\ntitle={Three Mechanisms of Weight Decay Regularization},\nauthor={Guodong Zhang and Chaoqi Wang and Bowen Xu and Roger Grosse},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1lz-3Rct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;5;4", "wc_review": "635;869;158", "wc_reply_reviewers": "0;281;30", "wc_reply_authors": "577;827;26", "reply_reviewers": "0;1;1", "reply_authors": "1;2;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 554.0, 295.8614540625392 ], "wc_reply_reviewers_avg": [ 103.66666666666667, 125.99029945021782 ], "wc_reply_authors_avg": [ 476.6666666666667, 334.61453777277654 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 323, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15919782238590351977&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=B1lz-3Rct7", "pdf": "https://openreview.net/pdf?id=B1lz-3Rct7", "email": ";;;", "author_num": 4 }, { "id": "B1x-LjAcKX", "title": "Local Critic Training of Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "We propose a new learning algorithm of deep neural networks, which unlocks the layer-wise dependency of backpropagation.", "abstract": "This paper proposes a novel approach to train deep neural networks by unlocking the layer-wise dependency of backpropagation training. The approach employs additional modules called local critic networks besides the main network model to be trained, which are used to obtain error gradients without complete feedforward and backward propagation processes. We propose a cascaded learning strategy for these local networks. In addition, the approach is also useful from multi-model perspectives, including structural optimization of neural networks, computationally efficient progressive inference, and ensemble classification for performance improvement. 
Experimental results show the effectiveness of the proposed approach and suggest guidelines for determining appropriate algorithm parameters.", "keywords": "inter-layer locking;local critic network;backpropagation;convolutional neural network;structural optimization;progressive inference;ensemble inference", "primary_area": "", "supplementary_material": "", "author": "Hojung Lee;Jong-Seok Lee", "authorids": "hjlee92@yonsei.ac.kr;jong-seok.lee@yonsei.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlee2019local,\ntitle={Local Critic Training of Deep Neural Networks},\nauthor={Hojung Lee and Jong-Seok Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=B1x-LjAcKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1x-LjAcKX", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;5", "wc_review": "205;190;306", "wc_reply_reviewers": "45;0;27", "wc_reply_authors": "270;113;351", "reply_reviewers": "1;0;1", "reply_authors": "2;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 233.66666666666666, 51.51267373720331 ], "wc_reply_reviewers_avg": [ 24.0, 18.49324200890693 ], "wc_reply_authors_avg": [ 244.66666666666666, 98.80058479359094 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16051141708892405078&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "B1x0E2C5tQ", "title": "What Is in a Translation Unit? Comparing Character and Subword Representations Beyond Translation", "track": "main", "status": "Withdraw", "tldr": "We study the impact of using different kinds of subword units on the quality of the resulting representations when used to model syntax, semantics, and morphology.", "abstract": "Recent work has shown that contextualized word representations derived from neural machine translation (NMT) are a viable alternative to those derived from simple word prediction tasks. This is because the internal understanding that needs to be built in order to be able to translate from one language to another is much more comprehensive. Unfortunately, computational and memory limitations at present prevent NMT models from using large word vocabularies, and thus alternatives such as subword units (BPE and morphological segmentations) and characters have been used. Here we study the impact of using different kinds of units on the quality of the resulting representations when used to model syntax, semantics, and morphology. 
We found that while representations derived from subwords are slightly better for modeling syntax, character-based representations are superior for modeling morphology and are also more robust to noisy input.", "keywords": "subwords;representations;word embeddings;transfer learning;machine translation;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Nadir Durrani;Fahim Dalvi;Hassan Sajjad;Yonatan Belinkov;Preslav Nakov", "authorids": "ndurrani@qf.org.qa;faimaduddin@qf.org.qa;hsajjad@qf.org.qa;belinkov@mit.edu;pnakov@hbku.edu.qa", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1x0E2C5tQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "3;4;4", "wc_review": "295;185;528", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 336.0, 142.99883449408483 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7214691400084969744&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1x0enCcK7", "title": "Automatic generation of object shapes with desired functionalities", "track": "main", "status": "Reject", "tldr": "It's difficult to make objects with desired affordances. We propose an automated method for generating object shapes with desired affordances, based on neural networks.", "abstract": "3D objects (artefacts) are made to fulfill functions. Designing an object often starts with defining a list of functionalities that it should provide, also known as functional requirements. Today, the design of 3D object models is still a slow and largely artisanal activity, with few Computer-Aided Design (CAD) tools existing to aid the exploration of the design solution space. The purpose of the study is to explore the possibility of shape generation conditioned on desired functionalities. To accelerate the design process, we introduce an algorithm for generating object shapes with desired functionalities. We follow the principle form follows function, and assume that the form of a structure is correlated to its function. First, we use an artificial neural network to learn a function-to-form mapping by analysing a dataset of objects labeled with their functionalities. Then, we combine forms providing one or more desired functions, generating an object shape that is expected to provide all of them. 
Finally, we verify in simulation whether the generated object possesses the desired functionalities, by defining and executing functionality tests on it.", "keywords": "automated design;affordance learning", "primary_area": "", "supplementary_material": "", "author": "Mihai Andries;Atabak Dehban;Jose Santos-Victor", "authorids": "mandries@isr.tecnico.ulisboa.pt;adehban@isr.tecnico.ulisboa.pt;jasv@isr.tecnico.ulisboa.pt", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nandries2019automatic,\ntitle={Automatic generation of object shapes with desired functionalities},\nauthor={Mihai Andries and Atabak Dehban and Jose Santos-Victor},\nyear={2019},\nurl={https://openreview.net/forum?id=B1x0enCcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1x0enCcK7", "pdf_size": 0, "rating": "3;3;5", "confidence": "4;4;3", "wc_review": "291;196;316", "wc_reply_reviewers": "0;11;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;1;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 267.6666666666667, 51.69354139756941 ], "wc_reply_reviewers_avg": [ 3.6666666666666665, 5.185449728701348 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zpseKJqZUVAJ:scholar.google.com/&scioq=Automatic+generation+of+object+shapes+with+desired+functionalities&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "id": "B1x33sC9KQ", "title": "ACIQ: Analytical Clipping for Integer Quantization of neural networks", "track": "main", "status": "Reject", "tldr": "We analyze the trade-off between quantization noise and clipping distortion in low precision networks, and show marked improvements over standard quantization schemes that normally avoid clipping", "abstract": "We analyze the trade-off between quantization noise and clipping distortion in low precision networks. We identify the statistics of various tensors, and derive exact expressions for the mean-square-error degradation due to clipping. By optimizing these expressions, we show marked improvements over standard quantization schemes that normally avoid clipping. For example, just by choosing the accurate clipping values, more than 40\\% accuracy improvement is obtained for the quantization of VGG-16 to 4-bits of precision. Our results have many applications for the quantization of neural networks at both training and inference time. 
\n", "keywords": "quantization;reduced precision;training;inference;activation", "primary_area": "", "supplementary_material": "", "author": "Ron Banner;Yury Nahshan;Elad Hoffer;Daniel Soudry", "authorids": "ron.banner@intel.com;yury.nahshan@intel.com;daniel.soudry@gmail.com;elad.hoffer@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbanner2019aciq,\ntitle={{ACIQ}: Analytical Clipping for Integer Quantization of neural networks},\nauthor={Ron Banner and Yury Nahshan and Elad Hoffer and Daniel Soudry},\nyear={2019},\nurl={https://openreview.net/forum?id=B1x33sC9KQ},\n}", "github": "[![github](/images/github_icon.svg) submission2019/AnalyticalScaleForIntegerQuantization](https://github.com/submission2019/AnalyticalScaleForIntegerQuantization)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1x33sC9KQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "wc_review": "389;320;200", "wc_reply_reviewers": "0;184;0", "wc_reply_authors": "302;651;243", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 303.0, 78.08969202141856 ], "wc_reply_reviewers_avg": [ 61.333333333333336, 86.73843182554982 ], "wc_reply_authors_avg": [ 398.6666666666667, 180.04505608936398 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2613237382004735112&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1x5KiCcFX", "title": "Understanding GANs via Generalization Analysis for Disconnected Support", "track": "main", "status": "Reject", "tldr": "We investigate the generalization performance of GANs and show how GANs outperform other methods when the data has a specific property.", "abstract": "This paper provides a theoretical analysis of generative adversarial networks (GANs) to explain their advantages over other standard methods of learning probability measures. GANs learn a probability through observations, using the objective function with a generator and a discriminator. While many empirical results indicate that GANs can generate realistic samples, the reason for such successful performance remains unelucidated. This paper focuses on the situation where the target probability measure satisfies the disconnected support property, which means that the support of the probability measure is separated, and relates it to the advantage of GANs. It is theoretically shown that, unlike other popular models, GANs do not suffer from the decrease in generalization performance caused by the disconnected support property. We rigorously quantify the generalization performance of GANs for a given architecture, and compare it with the performance of other models. Based on the theory, we also provide a guideline for selecting a deep network architecture for GANs. 
We demonstrate some numerical examples which support our results.", "keywords": "Generalization analysis;Statistical estimation;Understanding GANs;Disconnected support", "primary_area": "", "supplementary_material": "", "author": "Masaaki Imaizumi;Kenji Fukumizu", "authorids": "insou11@hotmail.com;fukumizu@ism.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nimaizumi2019understanding,\ntitle={Understanding {GAN}s via Generalization Analysis for Disconnected Support},\nauthor={Masaaki Imaizumi and Kenji Fukumizu},\nyear={2019},\nurl={https://openreview.net/forum?id=B1x5KiCcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1x5KiCcFX", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;4", "wc_review": "677;1250;756", "wc_reply_reviewers": "0;0;157", "wc_reply_authors": "454;440;333", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 894.3333333333334, 253.5538513916828 ], "wc_reply_reviewers_avg": [ 52.333333333333336, 74.01050976419197 ], "wc_reply_authors_avg": [ 409.0, 54.043192602460735 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13855763263846081935&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1x9siCcYQ", "title": "SENSE: SEMANTICALLY ENHANCED NODE SEQUENCE EMBEDDING", "track": "main", "status": "Reject", "tldr": "Node sequence embedding mechanism that captures both graph and text properties.", "abstract": "Effectively capturing graph node sequences in the form of vector embeddings is critical to many applications. We achieve this by (i) first learning vector embeddings of single graph nodes and (ii) then composing them to compactly represent node sequences. Specifically, we propose SENSE-S (Semantically Enhanced Node Sequence Embedding - for Single nodes), a skip-gram based novel embedding mechanism, for single graph nodes that co-learns graph structure as well as their textual descriptions. We demonstrate that SENSE-S vectors increase the accuracy of multi-label classification tasks by up to 50% and link-prediction tasks by up to 78% under a variety of scenarios using real datasets. Based on SENSE-S, we next propose generic SENSE to compute composite vectors that represent a sequence of nodes, where preserving the node order is important. 
We prove that this approach is efficient in embedding node sequences, and our experiments on real data confirm its high accuracy in node order decoding.", "keywords": "Semantic;Graph;Sequence;Embeddings", "primary_area": "", "supplementary_material": "", "author": "Swati Rallapalli;Liang Ma;Mudhakar Srivatsa;Ananthram Swami;Heesung Kwon;Graham Bent;Christopher Simpkin", "authorids": "srallapalli@us.ibm.com;maliang@us.ibm.com;msrivats@us.ibm.com;ananthram.swami.civ@mail.mil;heesung.kwon.civ@mail.mil;gbent@uk.ibm.com;simpkin.chris@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nrallapalli2019sense,\ntitle={{SENSE}: {SEMANTICALLY} {ENHANCED} {NODE} {SEQUENCE} {EMBEDDING}},\nauthor={Swati Rallapalli and Liang Ma and Mudhakar Srivatsa and Ananthram Swami and Heesung Kwon and Graham Bent and Christopher Simpkin},\nyear={2019},\nurl={https://openreview.net/forum?id=B1x9siCcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=B1x9siCcYQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;3", "wc_review": "407;402;195", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "357;272;321", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 334.6666666666667, 98.7803399242762 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 316.6666666666667, 34.83612429010374 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15298269651899444088&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "B1xFVhActm", "title": "Fake Sentence Detection as a Training Task for Sentence Encoding", "track": "main", "status": "Reject", "tldr": "", "abstract": " Sentence encoders are typically trained on generative language modeling tasks with large unlabeled datasets. While these encoders achieve strong results on many sentence-level tasks, they are difficult to train with long training cycles. \n We introduce fake sentence detection as a new discriminative training task for learning sentence encoders. We automatically generate fake sentences by corrupting original sentences from a source collection and train the encoders to produce representations that are effective at detecting fake sentences. This binary classification task turns out to be quite efficient for training sentence encoders. We compare a basic BiLSTM encoder trained on this task with strong sentence encoding models (Skipthought and FastSent) trained on a language modeling task. We find that the BiLSTM trains much faster on fake sentence detection (20 hours instead of weeks) using smaller amounts of data (1M instead of 64M sentences). 
Further analysis shows the learned representations also capture many syntactic and semantic properties expected from good sentence representations.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Viresh Ranjan;Heeyoung Kwon;Niranjan Balasubramanian;Minh Hoai", "authorids": "vranjan@cs.stonybrook.edu;heekwon@cs.stonybrook.edu;niranjan@cs.stonybrook.edu;minhhoai@cs.stonybrook.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nranjan2019fake,\ntitle={Fake Sentence Detection as a Training Task for Sentence Encoding},\nauthor={Viresh Ranjan and Heeyoung Kwon and Niranjan Balasubramanian and Minh Hoai},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xFVhActm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1xFVhActm", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;4;3", "wc_review": "515;1367;367", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 749.6666666666666, 440.6822992688598 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16218370024415385620&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "B1xFhiC9Y7", "title": "Domain Adaptation for Structured Output via Disentangled Patch Representations", "track": "main", "status": "Reject", "tldr": "A domain adaptation method for structured output via learning patch-level discriminative feature representations", "abstract": "Predicting structured outputs such as semantic segmentation relies on expensive per-pixel annotations to learn strong supervised models like convolutional neural networks. However, these models trained on one data domain may not generalize well to other domains unequipped with annotations for model finetuning. To avoid the labor-intensive process of annotation, we develop a domain adaptation method to adapt the source data to the unlabeled target domain. To this end, we propose to learn discriminative feature representations of patches based on label histograms in the source domain, through the construction of a disentangled space. With such representations as guidance, we then use an adversarial learning scheme to push the feature representations in target patches to the closer distributions in source ones. In addition, we show that our framework can integrate a global alignment process with the proposed patch-level alignment and achieve state-of-the-art performance on semantic segmentation. 
Extensive ablation studies and experiments are conducted on numerous benchmark datasets with various settings, such as synthetic-to-real and cross-city scenarios.", "keywords": "Domain Adaptation;Feature Representation Learning;Semantic Segmentation", "primary_area": "", "supplementary_material": "", "author": "Yi-Hsuan Tsai;Kihyuk Sohn;Samuel Schulter;Manmohan Chandraker", "authorids": "wasidennis@gmail.com;kihyuk.sohn@gmail.com;samuel@nec-labs.com;manu@nec-labs.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ntsai2019domain,\ntitle={Domain Adaptation for Structured Output via Disentangled Patch Representations},\nauthor={Yi-Hsuan Tsai and Kihyuk Sohn and Samuel Schulter and Manmohan Chandraker},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xFhiC9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1xFhiC9Y7", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;5;5", "wc_review": "491;425;379", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "456;563;234", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 431.6666666666667, 45.9661711358352 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 417.6666666666667, 137.02149060964447 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3548121311467294320&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1xFxh0cKX", "title": "Guided Evolutionary Strategies: Escaping the curse of dimensionality in random search", "track": "main", "status": "Reject", "tldr": "We propose an optimization method for when only biased gradients are available--we define a new gradient estimator for this scenario, derive the bias and variance of this estimator, and apply it to example problems.", "abstract": "Many applications in machine learning require optimizing a function whose true gradient is unknown, but where surrogate gradient information (directions that may be correlated with, but not necessarily identical to, the true gradient) is available instead. This arises when an approximate gradient is easier to compute than the full gradient (e.g. in meta-learning or unrolled optimization), or when a true gradient is intractable and is replaced with a surrogate (e.g. in certain reinforcement learning applications or training networks with discrete variables). We propose Guided Evolutionary Strategies, a method for optimally using surrogate gradient directions along with random search. We define a search distribution for evolutionary strategies that is elongated along a subspace spanned by the surrogate gradients. This allows us to estimate a descent direction which can then be passed to a first-order optimizer. We analytically and numerically characterize the tradeoffs that result from tuning how strongly the search distribution is stretched along the guiding subspace, and use this to derive a setting of the hyperparameters that works well across problems. 
Finally, we apply our method to example problems including truncated unrolled optimization and training neural networks with discrete variables, demonstrating improvement over both standard evolutionary strategies and first-order methods (that directly follow the surrogate gradient). We provide a demo of Guided ES at: redacted URL", "keywords": "evolutionary strategies;optimization;gradient estimators;biased gradients", "primary_area": "", "supplementary_material": "", "author": "Niru Maheswaranathan;Luke Metz;George Tucker;Dami Choi;Jascha Sohl-Dickstein", "authorids": "nirum@google.com;lmetz@google.com;gjt@google.com;damichoi@google.com;jaschasd@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmaheswaranathan2019guided,\ntitle={Guided Evolutionary Strategies: Escaping the curse of dimensionality in random search},\nauthor={Niru Maheswaranathan and Luke Metz and George Tucker and Dami Choi and Jascha Sohl-Dickstein},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xFxh0cKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1xFxh0cKX", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;3;4", "wc_review": "453;244;214", "wc_reply_reviewers": "0;0;93", "wc_reply_authors": "132;560;326", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 303.6666666666667, 106.30250336761699 ], "wc_reply_reviewers_avg": [ 31.0, 43.840620433565945 ], "wc_reply_authors_avg": [ 339.3333333333333, 174.98444375302498 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5286797438044511850&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "B1xHUiC5tm", "title": "Context-aware Forecasting for Multivariate Stationary Time-series", "track": "main", "status": "Reject", "tldr": "In order to forecast multivariate stationary time-series we learn embeddings containing contextual features within a RNN; we apply the framework on public transportation data", "abstract": "The domain of time-series forecasting has been extensively studied because it is of fundamental importance in many real-life applications. Weather prediction, traffic flow forecasting or sales are compelling examples of sequential phenomena. Predictive models generally make use of the relations between past and future values. However, in the case of stationary time-series, observed values also drastically depend on a number of exogenous features that can be used to improve forecasting quality. In this work, we propose a change of paradigm which consists in learning such features in embeddings vectors within recurrent neural networks. We apply our framework to forecast smart cards tap-in logs in the Parisian subway network. 
Results show that context-embedded models perform quantitatively better in one-step ahead and multi-step ahead forecasting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Valentin Guiguet;Nicolas Baskiotis;Vincent Guigue;Patrick Gallinari", "authorids": "guiguetvalentin@gmail.com;nicolas.baskiotis@lip6.fr;vincent.guigue@lip6.fr;patrick.gallinari@lip6.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=B1xHUiC5tm", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;5;3", "wc_review": "186;589;717", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 497.3333333333333, 226.26287563117575 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13335131564852949734&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Learning Multimodal Graph-to-Graph Translation for Molecule Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/719", "id": "B1xJAsA5F7", "author_site": "Wengong Jin, Kevin Yang, Regina Barzilay, Tommi Jaakkola", "tldr": "We introduce a graph-to-graph encoder-decoder framework for learning diverse graph translations.", "abstract": "We view molecule optimization as a graph-to-graph translation problem. The goal is to learn to map from one molecular graph to another with better properties based on an available corpus of paired molecules. Since molecules can be optimized in different ways, there are multiple viable translations for each input graph. A key challenge is therefore to model diverse translation outputs. Our primary contributions include a junction tree encoder-decoder for learning diverse graph translations along with a novel adversarial training method for aligning distributions of molecules. Diverse output distributions in our model are explicitly realized by low-dimensional latent vectors that modulate the translation process. We evaluate our model on multiple molecule optimization tasks and show that our model outperforms previous state-of-the-art baselines by a significant margin. 
\n", "keywords": "graph-to-graph translation;graph generation;molecular optimization", "primary_area": "", "supplementary_material": "", "author": "Wengong Jin;Kevin Yang;Regina Barzilay;Tommi Jaakkola", "authorids": "wengong@csail.mit.edu;yangk@mit.edu;regina@csail.mit.edu;tommi@csail.mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\njin2018learning,\ntitle={Learning Multimodal Graph-to-Graph Translation for Molecule Optimization},\nauthor={Wengong Jin and Kevin Yang and Regina Barzilay and Tommi Jaakkola},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xJAsA5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;5;4", "wc_review": "531;570;278", "wc_reply_reviewers": "0;499;0", "wc_reply_authors": "1043;2104;365", "reply_reviewers": "0;2;0", "reply_authors": "2;4;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 459.6666666666667, 129.44067710302232 ], "wc_reply_reviewers_avg": [ 166.33333333333334, 235.2308558747248 ], "wc_reply_authors_avg": [ 1170.6666666666667, 715.6602235387653 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 327, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5726195198168837773&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1xJAsA5F7", "pdf": "https://openreview.net/pdf?id=B1xJAsA5F7", "email": ";;;", "author_num": 4 }, { "id": "B1xOYoA5tQ", "title": "Multi-way Encoding for Robustness to Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "We demonstrate that by leveraging a multi-way output encoding, rather than the widely used one-hot encoding, we can make deep models more robust to adversarial attacks.", "abstract": "Deep models are state-of-the-art for many computer vision tasks including image classification and object detection. However, it has been shown that deep models are vulnerable to adversarial examples. We highlight how one-hot encoding directly contributes to this vulnerability and propose breaking away from this widely-used, but highly-vulnerable mapping. We demonstrate that by leveraging a different output encoding, multi-way encoding, we can make models more robust. Our approach makes it more difficult for adversaries to find useful gradients for generating adversarial attacks. We present state-of-the-art robustness results for black-box, white-box attacks, and achieve higher clean accuracy on four benchmark datasets: MNIST, CIFAR-10, CIFAR-100, and SVHN when combined with adversarial training. 
The strength of our approach is also presented in the form of an attack for model watermarking, raising challenges in detecting stolen models.", "keywords": "Adversarial Defense;Robustness of Deep Convolutional Networks", "primary_area": "", "supplementary_material": "", "author": "Donghyun Kim;Sarah Adel Bargal;Jianming Zhang;Stan Sclaroff", "authorids": "donhk@bu.edu;sbargal@bu.edu;jianmzha@adobe.com;sclaroff@bu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkim2019multiway,\ntitle={Multi-way Encoding for Robustness to Adversarial Attacks},\nauthor={Donghyun Kim and Sarah Adel Bargal and Jianming Zhang and Stan Sclaroff},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xOYoA5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=B1xOYoA5tQ", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;2;2;3", "wc_review": "604;309;227;142", "wc_reply_reviewers": "585;0;0;0", "wc_reply_authors": "2091;343;138;33", "reply_reviewers": "2;0;0;0", "reply_authors": "4;1;1;1", "rating_avg": [ 5.5, 0.8660254037844386 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "wc_review_avg": [ 320.5, 174.00359191694866 ], "wc_reply_reviewers_avg": [ 146.25, 253.3124306069483 ], "wc_reply_authors_avg": [ 651.25, 838.6830077568044 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8703882797784891, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3837788221899023286&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "B1xU4nAqK7", "title": "Unsupervised Exploration with Deep Model-Based Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reinforcement learning (RL) often requires large numbers of trials to solve a single specific task. This is in sharp contrast to human and animal learning: humans and animals can use past experience to acquire an understanding about the world, which they can then use to perform new tasks with minimal additional learning. In this work, we study how an unsupervised exploration phase can be used to build up such prior knowledge, which can then be utilized in a second phase to perform new tasks, either directly without any additional exploration, or through minimal fine-tuning. A critical question with this approach is: what kind of knowledge should be transferred from the unsupervised phase to the goal-directed phase? We argue that model-based RL offers an appealing solution. By transferring models, which are task-agnostic, we can perform new tasks without any additional learning at all. However, this relies on having a suitable exploration method during unsupervised training, and a model-based RL method that can effectively utilize modern high-capacity parametric function classes, such as deep neural networks. We show that both challenges can be addressed by representing model-uncertainty, which can both guide exploration in the unsupervised phase and ensure that the errors in the model are not exploited by the planner in the goal-directed phase. 
We illustrate, on simple simulated benchmark tasks, that our method can perform various goal-directed skills on the first attempt, and can improve further with fine-tuning, exceeding the performance of alternative exploration methods.", "keywords": "exploration;model based reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Kurtland Chua;Rowan McAllister;Roberto Calandra;Sergey Levine", "authorids": "kchua@berkeley.edu;rmcallister@berkeley.edu;roberto.calandra@berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchua2019unsupervised,\ntitle={Unsupervised Exploration with Deep Model-Based Reinforcement Learning},\nauthor={Kurtland Chua and Rowan McAllister and Roberto Calandra and Sergey Levine},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xU4nAqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=B1xU4nAqK7", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;3;4", "wc_review": "360;242;357", "wc_reply_reviewers": "30;0;0", "wc_reply_authors": "70;70;70", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 319.6666666666667, 54.932281543328926 ], "wc_reply_reviewers_avg": [ 10.0, 14.142135623730951 ], "wc_reply_authors_avg": [ 70.0, 0.0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13252674845825995483&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "A Data-Driven and Distributed Approach to Sparse Signal Representation and Recovery", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1051", "id": "B1xVTjCqKQ", "author_site": "Ali Mousavi, Gautam Dasarathy, Richard Baraniuk", "tldr": "We use deep learning techniques to solve the sparse signal representation and recovery problem.", "abstract": "In this paper, we focus on two challenges which offset the promise of sparse signal representation, sensing, and recovery. First, real-world signals can seldom be described as perfectly sparse vectors in a known basis, and traditionally used random measurement schemes are seldom optimal for sensing them. Second, existing signal recovery algorithms are usually not fast enough to make them applicable to real-time problems. In this paper, we address these two challenges by presenting a novel framework based on deep learning. For the first challenge, we cast the problem of finding informative measurements by using a maximum likelihood (ML) formulation and show how we can build a data-driven dimensionality reduction protocol for sensing signals using convolutional architectures. For the second challenge, we discuss and analyze a novel parallelization scheme and show it significantly speeds-up the signal recovery process. We demonstrate the significant improvement our method obtains over competing methods through a series of experiments. ", "keywords": "Sparsity;Compressive Sensing;Convolutional Network", "primary_area": "", "supplementary_material": "", "author": "Ali Mousavi;Gautam Dasarathy;Richard G. 
Baraniuk", "authorids": "ali.mousavi1988@gmail.com;gautamd@asu.edu;richb@rice.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmousavi2018a,\ntitle={A Data-Driven and Distributed Approach to Sparse Signal Representation and Recovery},\nauthor={Ali Mousavi and Gautam Dasarathy and Richard G. Baraniuk},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xVTjCqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;3;4", "wc_review": "218;418;410", "wc_reply_reviewers": "0;0;13", "wc_reply_authors": "583;660;814", "reply_reviewers": "0;0;1", "reply_authors": "1;2;2", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 348.6666666666667, 92.45299105791848 ], "wc_reply_reviewers_avg": [ 4.333333333333333, 6.128258770283413 ], "wc_reply_authors_avg": [ 685.6666666666666, 96.03587292719783 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2392789320430907220&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=B1xVTjCqKQ", "pdf": "https://openreview.net/pdf?id=B1xVTjCqKQ", "email": ";;", "author_num": 3 }, { "title": "On the Minimal Supervision for Training Any Binary Classifier from Only Unlabeled Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/807", "id": "B1xWcj0qYm", "author_site": "Nan Lu, Gang Niu, Aditya Krishna Menon, Masashi Sugiyama", "tldr": "Three class priors are all you need to train deep models from only U data, while any two should not be enough.", "abstract": "Empirical risk minimization (ERM), with proper loss function and regularization, is the common practice of supervised classification. In this paper, we study training arbitrary (from linear to deep) binary classifier from only unlabeled (U) data by ERM. We prove that it is impossible to estimate the risk of an arbitrary binary classifier in an unbiased manner given a single set of U data, but it becomes possible given two sets of U data with different class priors. These two facts answer a fundamental question---what the minimal supervision is for training any binary classifier from only U data. Following these findings, we propose an ERM-based learning method from two sets of U data, and then prove it is consistent. 
Experiments demonstrate the proposed method could train deep models and outperform state-of-the-art methods for learning from two sets of U data.", "keywords": "learning from only unlabeled data;empirical risk minimization;unbiased risk estimator", "primary_area": "", "supplementary_material": "", "author": "Nan Lu;Gang Niu;Aditya Krishna Menon;Masashi Sugiyama", "authorids": "lu@ms.k.u-tokyo.ac.jp;gang.niu@riken.jp;adityakmenon@google.com;sugi@k.u-tokyo.ac.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlu2018on,\ntitle={On the Minimal Supervision for Training Any Binary Classifier from Only Unlabeled Data},\nauthor={Nan Lu and Gang Niu and Aditya K. Menon and Masashi Sugiyama},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xWcj0qYm},\n}", "github": "[![github](/images/github_icon.svg) lunanbit/UUlearning](https://github.com/lunanbit/UUlearning)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer5;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8;8", "confidence": "4;4;3;3", "wc_review": "651;309;594;416", "wc_reply_reviewers": "0;0;21;15", "wc_reply_authors": "1076;380;717;410", "reply_reviewers": "0;0;1;1", "reply_authors": "5;1;5;1", "rating_avg": [ 7.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "wc_review_avg": [ 492.5, 136.88407504162052 ], "wc_reply_reviewers_avg": [ 9.0, 9.246621004453464 ], "wc_reply_authors_avg": [ 645.75, 281.24399993599866 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 3.0, 2.0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12632779449090033610&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=B1xWcj0qYm", "pdf": "https://openreview.net/pdf?id=B1xWcj0qYm", "email": ";;;", "author_num": 4 }, { "title": "Neural Logic Machines", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/816", "id": "B1xY-hRctX", "author_site": "Honghua Dong, Jiayuan Mao, Tian Lin, Chong Wang, Lihong Li, Dengyong Zhou", "tldr": "We propose the Neural Logic Machine (NLM), a neural-symbolic architecture for both inductive learning and logic reasoning.", "abstract": "We propose the Neural Logic Machine (NLM), a neural-symbolic architecture for both inductive learning and logic reasoning. NLMs exploit the power of both neural networks---as function approximators, and logic programming---as a symbolic processor for objects with properties, relations, logic connectives, and quantifiers. After being trained on small-scale tasks (such as sorting short arrays), NLMs can recover lifted rules, and generalize to large-scale tasks (such as sorting longer arrays). In our experiments, NLMs achieve perfect generalization in a number of tasks, from relational reasoning tasks on the family tree and general graphs, to decision making tasks including sorting arrays, finding shortest paths, and playing the blocks world. 
Most of these tasks are hard to accomplish for neural networks or inductive logic programming alone.", "keywords": "Neuro-Symbolic Computation;Logic Induction", "primary_area": "", "supplementary_material": "", "author": "Honghua Dong;Jiayuan Mao;Tian Lin;Chong Wang;Lihong Li;Denny Zhou", "authorids": "dhh19951@gmail.com;maojiayuan@gmail.com;tianlin@google.com;chongw@google.com;lihongli.cs@gmail.com;dennyzhou@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ndong2018neural,\ntitle={Neural Logic Machines},\nauthor={Honghua Dong and Jiayuan Mao and Tian Lin and Chong Wang and Lihong Li and Denny Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xY-hRctX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=B1xY-hRctX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;3;2", "wc_review": "846;402;203", "wc_reply_reviewers": "0;45;0", "wc_reply_authors": "1024;353;159", "reply_reviewers": "0;1;0", "reply_authors": "2;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 483.6666666666667, 268.7803729606924 ], "wc_reply_reviewers_avg": [ 15.0, 21.213203435596427 ], "wc_reply_authors_avg": [ 512.0, 370.60041374324805 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 352, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4525183211642569463&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=B1xY-hRctX", "pdf": "https://openreview.net/pdf?id=B1xY-hRctX", "email": ";;;;;", "author_num": 6 }, { "id": "B1xeyhCctQ", "title": "Bias Also Matters: Bias Attribution for Deep Neural Network Explanation", "track": "main", "status": "Reject", "tldr": "Attribute the bias terms of deep neural networks to input features by a backpropagation-type algorithm; Generate complementary and highly interpretable explanations of DNNs in addition to gradient-based attributions.", "abstract": "The gradient of a deep neural network (DNN) w.r.t. the input provides information that can be used to explain the output prediction in terms of the input features and has been widely studied to assist in interpreting DNNs. In a linear model (i.e., $g(x)=wx+b$), the gradient corresponds solely to the weights $w$. Such a model can reasonably locally linearly approximate a smooth nonlinear DNN, and hence the weights of this local model are the gradient. The other part, however, of a local linear model, i.e., the bias $b$, is usually overlooked in attribution methods since it is not part of the gradient. In this paper, we observe that since the bias in a DNN also has a non-negligible contribution to the correctness of predictions, it can also play a significant role in understanding DNN behaviors. In particular, we study how to attribute a DNN's bias to its input features. 
We propose a backpropagation-type algorithm ``bias back-propagation (BBp)'' that starts at the output layer and iteratively attributes the bias of each layer to its input nodes as well as combining the resulting bias term of the previous layer. This process stops at the input layer, where summing up the attributions over all the input features exactly recovers $b$. Together with the backpropagation of the gradient generating $w$, we can fully recover the locally linear model $g(x)=wx+b$. Hence, the attribution of the DNN outputs to its inputs is decomposed into two parts, the gradient $w$ and the bias attribution, providing separate and complementary explanations. We study several possible attribution methods applied to the bias of each layer in BBp. In experiments, we show that BBp can generate complementary and highly interpretable explanations of DNNs in addition to gradient-based attributions.", "keywords": "explainable AI;interpreting deep neural networks;bias;attribution method;piecewise linear activation function;backpropagation", "primary_area": "", "supplementary_material": "", "author": "Shengjie Wang;Tianyi Zhou;Jeff Bilmes", "authorids": "wangsj@cs.washington.edu;tianyi.david.zhou@gmail.com;bilmes@uw.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2019bias,\ntitle={Bias Also Matters: Bias Attribution for Deep Neural Network Explanation},\nauthor={Shengjie Wang and Tianyi Zhou and Jeff Bilmes},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xeyhCctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1xeyhCctQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;5;5", "wc_review": "287;406;835", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 509.3333333333333, 235.3498577767906 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8475068318232695769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Neural Speed Reading with Structural-Jump-LSTM", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1044", "id": "B1xf9jAqFQ", "author_site": "Christian Hansen, Casper Hansen, Stephen Alstrup, Jakob Simonsen, Christina Lioma", "tldr": "We propose a new model for neural speed reading that utilizes the inherent punctuation structure of a text to define effective jumping and skipping behavior.", "abstract": "Recurrent neural networks (RNNs) can model natural language by sequentially ''reading'' input tokens and outputting a distributed representation of each token. Due to the sequential nature of RNNs, inference time is linearly dependent on the input length, and all inputs are read regardless of their importance. Efforts to speed up this inference, known as ''neural speed reading'', either ignore or skim over part of the input. We present Structural-Jump-LSTM: the first neural speed reading model to both skip and jump text during inference. 
The model consists of a standard LSTM and two agents: one capable of skipping single words when reading, and one capable of exploiting punctuation structure (sub-sentence separators (,:), sentence end symbols (.!?), or end of text markers) to jump ahead after reading a word.\nA comprehensive experimental evaluation of our model against all five state-of-the-art neural reading models shows that \nStructural-Jump-LSTM achieves the best overall floating point operations (FLOP) reduction (hence is faster), while keeping the same accuracy or even improving it compared to a vanilla LSTM that reads the whole text.", "keywords": "natural language processing;speed reading;recurrent neural network;classification", "primary_area": "", "supplementary_material": "", "author": "Christian Hansen;Casper Hansen;Stephen Alstrup;Jakob Grue Simonsen;Christina Lioma", "authorids": "chrh@di.ku.dk;c.hansen@di.ku.dk;s.alstrup@di.ku.dk;simonsen@di.ku.dk;c.lioma@di.ku.dk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nhansen2018neural,\ntitle={Neural Speed Reading with Structural-Jump-{LSTM}},\nauthor={Christian Hansen and Casper Hansen and Stephen Alstrup and Jakob Grue Simonsen and Christina Lioma},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xf9jAqFQ},\n}", "github": "[![github](/images/github_icon.svg) Varyn/Neural-Speed-Reading-with-Structural-Jump-LSTM](https://github.com/Varyn/Neural-Speed-Reading-with-Structural-Jump-LSTM)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;5", "wc_review": "131;295;266", "wc_reply_reviewers": "0;0;21", "wc_reply_authors": "444;420;662", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 230.66666666666666, 71.46249987852993 ], "wc_reply_reviewers_avg": [ 7.0, 9.899494936611665 ], "wc_reply_authors_avg": [ 508.6666666666667, 108.86484995422332 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10699754124824317847&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=B1xf9jAqFQ", "pdf": "https://openreview.net/pdf?id=B1xf9jAqFQ", "email": ";;;;", "author_num": 5 }, { "title": "Rigorous Agent Evaluation: An Adversarial Approach to Uncover Catastrophic Failures", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1136", "id": "B1xhQhRcK7", "author_site": "Jonathan Uesato, Ananya Kumar, Csaba Szepesvari, Tom Erez, Avraham Ruderman, Keith Anderson, Krishnamurthy Dvijotham, Nicolas Heess, Pushmeet Kohli", "tldr": "We show that rare but catastrophic failures may be missed entirely by random testing, which poses issues for safe deployment. Our proposed approach for adversarial testing fixes this.", "abstract": "This paper addresses the problem of evaluating learning systems in safety critical domains such as autonomous driving, where failures can have catastrophic consequences. 
We focus on two problems: searching for scenarios when learned agents fail and assessing their probability of failure. The standard method for agent evaluation in reinforcement learning, Vanilla Monte Carlo, can miss failures entirely, leading to the deployment of unsafe agents. We demonstrate this is an issue for current agents, where even matching the compute used for training is sometimes insufficient for evaluation. To address this shortcoming, we draw upon the rare event probability estimation literature and propose an adversarial evaluation approach. Our approach focuses evaluation on adversarially chosen situations, while still providing unbiased estimates of failure probabilities. The key difficulty is in identifying these adversarial situations -- since failures are rare there is little signal to drive optimization. To solve this we propose a continuation approach that learns failure modes in related but less robust agents. Our approach also allows reuse of data already collected for training the agent. We demonstrate the efficacy of adversarial evaluation on two standard domains: humanoid control and simulated driving. Experimental results show that our methods can find catastrophic failures and estimate failures rates of agents multiple orders of magnitude faster than standard evaluation schemes, in minutes to hours rather than days.", "keywords": "agent evaluation;adversarial examples;robustness;safety;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Jonathan Uesato*;Ananya Kumar*;Csaba Szepesvari*;Tom Erez;Avraham Ruderman;Keith Anderson;Krishnamurthy (Dj) Dvijotham;Nicolas Heess;Pushmeet Kohli", "authorids": "juesato@gmail.com;ananya@cs.stanford.edu;szepi@google.com;etom@google.com;aruderman@google.com;keithanderson@google.com;dvij@google.com;heess@google.com;pushmeet@google.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nuesato*2018rigorous,\ntitle={Rigorous Agent Evaluation: An Adversarial Approach to Uncover Catastrophic Failures},\nauthor={Jonathan Uesato* and Ananya Kumar* and Csaba Szepesvari* and Tom Erez and Avraham Ruderman and Keith Anderson and Krishnamurthy (Dj) Dvijotham and Nicolas Heess and Pushmeet Kohli},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xhQhRcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;3", "wc_review": "519;248;340", "wc_reply_reviewers": "0;160;0", "wc_reply_authors": "888;1044;853", "reply_reviewers": "0;1;0", "reply_authors": "2;3;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 369.0, 112.51962791738455 ], "wc_reply_reviewers_avg": [ 53.333333333333336, 75.42472332656506 ], "wc_reply_authors_avg": [ 928.3333333333334, 83.02743857036393 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13064865884841591859&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=B1xhQhRcK7", "pdf": 
"https://openreview.net/pdf?id=B1xhQhRcK7", "email": ";;;;;;;;", "author_num": 9 }, { "id": "B1xnPsA5KX", "title": "Modular Deep Probabilistic Programming", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modularity is a key feature of deep learning libraries but has not been fully exploited for probabilistic programming. We propose to improve modularity of probabilistic programming language by offering not only plain probabilistic distributions but also sophisticated probabilistic model such as Bayesian non-parametric models as fundamental building blocks. We demonstrate this idea by presenting a modular probabilistic programming language MXFusion, which includes a new type of re-usable building blocks, called probabilistic modules. A probabilistic module consists of a set of random variables with associated probabilistic distributions and dedicated inference methods. Under the framework of variational inference, the pre-specified inference methods of individual probabilistic modules can be transparently used for inference of the whole probabilistic model. We demonstrate the power and convenience of probabilistic modules in MXFusion with various examples of Gaussian process models, which are evaluated with experiments on real data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhenwen Dai;Eric Meissner;Neil D. Lawrence", "authorids": "zhenwend@amazon.com;erimeiss@amazon.com;lawrennd@amazon.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndai2019modular,\ntitle={Modular Deep Probabilistic Programming},\nauthor={Zhenwen Dai and Eric Meissner and Neil D. Lawrence},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xnPsA5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=B1xnPsA5KX", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;4;3", "wc_review": "543;583;555", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "244;365;172", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 560.3333333333334, 16.75974011996871 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 260.3333333333333, 79.63388446858256 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:W4nTuulkFPYJ:scholar.google.com/&scioq=Modular+Deep+Probabilistic+Programming&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Large Scale GAN Training for High Fidelity Natural Image Synthesis", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/937", "id": "B1xsqj09Fm", "author_site": "Andrew Brock, Jeff Donahue, Karen Simonyan", "tldr": "GANs benefit from scaling up.", "abstract": "Despite recent progress in generative image modeling, successfully generating high-resolution, diverse samples from complex datasets such as ImageNet remains an elusive goal. To this end, we train Generative Adversarial Networks at the largest scale yet attempted, and study the instabilities specific to such scale. 
We find that applying orthogonal regularization to the generator renders it amenable to a simple \"truncation trick\", allowing fine control over the trade-off between sample fidelity and variety by reducing the variance of the Generator's input. Our modifications lead to models which set the new state of the art in class-conditional image synthesis. When trained on ImageNet at 128x128 resolution, our models (BigGANs) achieve an Inception Score (IS) of 166.3 and Frechet Inception Distance (FID) of 9.6, improving over the previous best IS of 52.52 and FID of 18.65.", "keywords": "GANs;Generative Models;Large Scale Training;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Andrew Brock;Jeff Donahue;Karen Simonyan", "authorids": "ajb5@hw.ac.uk;jeffdonahue@google.com;simonyan@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbrock2018large,\ntitle={Large Scale {GAN} Training for High Fidelity Natural Image Synthesis},\nauthor={Andrew Brock and Jeff Donahue and Karen Simonyan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=B1xsqj09Fm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 33 community implementations](https://paperswithcode.com/paper/?openreview=B1xsqj09Fm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;8;9", "confidence": "3;4;4", "wc_review": "960;360;376", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "992;279;287", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 8.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 565.3333333333334, 279.14790981763696 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 519.3333333333334, 334.24176213566665 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 6860, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9573828555610570748&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=B1xsqj09Fm", "pdf": "https://openreview.net/pdf?id=B1xsqj09Fm", "email": ";;", "author_num": 3 }, { "id": "B1zMDjAqKQ", "title": "Unsupervised Expectation Learning for Multisensory Binding", "track": "main", "status": "Reject", "tldr": "A hybrid deep neural network which adapts concepts of expectation learning for improving unisensory recognition using multisensory binding. ", "abstract": "Expectation learning is a continuous learning process which uses known multisensory bindings to modulate unisensory perception. When perceiving an event, we have an expectation on what we should see or hear which affects our unisensory perception. Expectation learning is known to enhance the unisensory perception of previously known multisensory events. In this work, we present a novel hybrid deep recurrent model based on audio/visual autoencoders, for unimodal stimulus representation and reconstruction, and a recurrent self-organizing network for multisensory binding of the representations. 
The model adapts concepts of expectation learning to enhance the unisensory representation based on the learned bindings.\nWe demonstrate that the proposed model is capable of reconstructing signals from one modality by processing input of another modality for 43,500 Youtube videos in the animal subset of the AudioSet corpus. Our experiments also show that when using expectation learning, the proposed model presents state-of-the-art performance in representing and classifying unisensory stimuli.", "keywords": "multisensory binding;expectation learning;unsupervised learning;Deep autoencoder;Growing-When-Required Network;animal recognition", "primary_area": "", "supplementary_material": "", "author": "Pablo Barros;German I. Parisi;Manfred Eppe;Stefan Wermter", "authorids": "barros@informatik.uni-hamburg.de;parisi@informatik.uni-hamburg.de;eppe@informatik.uni-hamburg.de;wermter@informatik.uni-hamburg.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbarros2019unsupervised,\ntitle={Unsupervised Expectation Learning for Multisensory Binding},\nauthor={Pablo Barros and German I. Parisi and Manfred Eppe and Stefan Wermter},\nyear={2019},\nurl={https://openreview.net/forum?id=B1zMDjAqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=B1zMDjAqKQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;2;3", "wc_review": "147;458;383", "wc_reply_reviewers": "0;0;101", "wc_reply_authors": "365;583;1054", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 329.3333333333333, 132.51498867004526 ], "wc_reply_reviewers_avg": [ 33.666666666666664, 47.6118565998942 ], "wc_reply_authors_avg": [ 667.3333333333334, 287.534732201559 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lgIuriaI2NoJ:scholar.google.com/&scioq=Unsupervised+Expectation+Learning+for+Multisensory+Binding&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "BJ4AFsRcFQ", "title": "Total Style Transfer with a Single Feed-Forward Network", "track": "main", "status": "Reject", "tldr": "A paper suggesting a method to transform the style of images using deep neural networks.", "abstract": "Recent image style transferring methods achieved arbitrary stylization with input content and style images. To transfer the style of an arbitrary image to a content image, these methods used a feed-forward network with a lowest-scaled feature transformer or a cascade of the networks with a feature transformer of a corresponding scale. However, their approaches did not consider either multi-scaled style in their single-scale feature transformer or dependency between the transformed feature statistics across the cascade networks. This shortcoming resulted in generating partially and inexactly transferred style in the generated images.\nTo overcome this limitation of partial style transfer, we propose a total style transferring method which transfers multi-scaled feature statistics through a single feed-forward process. 
First, our method transforms multi-scaled feature maps of a content image into those of a target style image by considering both inter-channel correlations in each single scaled feature map and inter-scale correlations between multi-scaled feature maps. Second, each transformed feature map is inserted into the decoder layer of the corresponding scale using skip-connection. Finally, the skip-connected multi-scaled feature maps are decoded into a stylized image through our trained decoder network.", "keywords": "Image Style Transfer;Deep Learning;Neural Network", "primary_area": "", "supplementary_material": "", "author": "Minseong Kim;Hyun-Chul Choi", "authorids": "tyui592@ynu.ac.kr;pogary@ynu.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkim2019total,\ntitle={Total Style Transfer with a Single Feed-Forward Network},\nauthor={Minseong Kim and Hyun-Chul Choi},\nyear={2019},\nurl={https://openreview.net/forum?id=BJ4AFsRcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJ4AFsRcFQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;5;5", "wc_review": "423;631;758", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 604.0, 138.0893430597259 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16416931782130736312&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "BJ4BVhRcYX", "title": "INTERPRETABLE CONVOLUTIONAL FILTER PRUNING", "track": "main", "status": "Reject", "tldr": "", "abstract": "The sophisticated structure of Convolutional Neural Network (CNN) allows for\noutstanding performance, but at the cost of intensive computation. As significant\nredundancies inevitably present in such a structure, many works have been proposed\nto prune the convolutional filters for computation cost reduction. Although\nextremely effective, most works are based only on quantitative characteristics of\nthe convolutional filters, and highly overlook the qualitative interpretation of individual\nfilter\u2019s specific functionality. In this work, we interpreted the functionality\nand redundancy of the convolutional filters from different perspectives, and proposed\na functionality-oriented filter pruning method. With extensive experiment\nresults, we proved the convolutional filters\u2019 qualitative significance regardless of\nmagnitude, demonstrated significant neural network redundancy due to repetitive\nfilter functions, and analyzed the filter functionality defection under inappropriate\nretraining process. 
Such an interpretable pruning approach not only offers outstanding\ncomputation cost optimization over previous filter pruning methods, but\nalso interprets filter pruning process.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhuwei Qin;Fuxun Yu;Chenchen Liu;Xiang Chen", "authorids": "zqin@gmu.edu;fyu2@gmu.edu;chliu@clarkson.edu;xchen26@gmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nqin2019interpretable,\ntitle={{INTERPRETABLE} {CONVOLUTIONAL} {FILTER} {PRUNING}},\nauthor={Zhuwei Qin and Fuxun Yu and Chenchen Liu and Xiang Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=BJ4BVhRcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJ4BVhRcYX", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;4", "wc_review": "239;444;544", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 409.0, 126.95143428361361 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10770740173005542707&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "BJEOOsCqKm", "title": "Psychophysical vs. learnt texture representations in novelty detection", "track": "main", "status": "Reject", "tldr": "Comparison of psychophysical and CNN-encoded texture representations in a one-class neural network novelty detection application.", "abstract": "Parametric texture models have been applied successfully to synthesize artificial images. Psychophysical studies show that under defined conditions observers are unable to differentiate between model-generated and original natural textures. In industrial applications the reverse case is of interest: a texture analysis system should decide if human observers are able to discriminate between a reference and a novel texture. For example, in case of inspecting decorative surfaces the de- tection of visible texture anomalies without any prior knowledge is required. Here, we implemented a human-vision-inspired novelty detection approach. Assuming that the features used for texture synthesis are important for human texture percep- tion, we compare psychophysical as well as learnt texture representations based on activations of a pretrained CNN in a novelty detection scenario. Additionally, we introduce a novel objective function to train one-class neural networks for novelty detection and compare the results to standard one-class SVM approaches. Our experiments clearly show the differences between human-vision-inspired texture representations and learnt features in detecting visual anomalies. 
Based on a dig- ital print inspection scenario we show that psychophysical texture representations are able to outperform CNN-encoded features.", "keywords": "novelty detection;learnt texture representation;one-class neural network;human-vision-inspired anomaly detection", "primary_area": "", "supplementary_material": "", "author": "Michael Grunwald;Matthias Hermann;Fabian Freiberg;Matthias O. Franz", "authorids": "m.grunwald@htwg-konstanz.de;matthias.hermann@htwg-konstanz.de;f.freiberg@htwg-konstanz.de;mfanz@htwg-konstanz.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngrunwald2019psychophysical,\ntitle={Psychophysical vs. learnt texture representations in novelty detection},\nauthor={Michael Grunwald and Matthias Hermann and Fabian Freiberg and Matthias O. Franz},\nyear={2019},\nurl={https://openreview.net/forum?id=BJEOOsCqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=BJEOOsCqKm", "pdf_size": 0, "rating": "1;3;3;3", "confidence": "3;4;3;3", "wc_review": "122;434;173;228", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 2.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "wc_review_avg": [ 239.25, 118.52294081737932 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BYpzgpAChVEJ:scholar.google.com/&scioq=Psychophysical+vs.+learnt+texture+representations+in+novelty+detection&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Woulda, Coulda, Shoulda: Counterfactually-Guided Policy Search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1076", "id": "BJG0voC9YQ", "author_site": "Lars Buesing, Theophane Weber, Yori Zwols, Nicolas Heess, Sebastien Racaniere, Arthur Guez, Jean-Baptiste Lespiau", "tldr": "", "abstract": "Learning policies on data synthesized by models can in principle quench the thirst of reinforcement learning algorithms for large amounts of real experience, which is often costly to acquire. However, simulating plausible experience de novo is a hard problem for many complex environments, often resulting in biases for model-based policy evaluation and search. Instead of de novo synthesis of data, here we assume logged, real experience and model alternative outcomes of this experience under counterfactual actions, i.e. actions that were not actually taken. Based on this, we propose the Counterfactually-Guided Policy Search (CF-GPS) algorithm for learning policies in POMDPs from off-policy experience. It leverages structural causal models for counterfactual evaluation of arbitrary policies on individual off-policy episodes. CF-GPS can improve on vanilla model-based RL algorithms by making use of available logged data to de-bias model predictions. In contrast to off-policy algorithms based on Importance Sampling which re-weight data, CF-GPS leverages a model to explicitly consider alternative outcomes, allowing the algorithm to make better use of experience data. 
We find empirically that these advantages translate into improved policy evaluation and search results on a non-trivial grid-world task. Finally, we show that CF-GPS generalizes the previously proposed Guided Policy Search and that reparameterization-based algorithms such Stochastic Value Gradient can be interpreted as counterfactual methods.", "keywords": "reinforcement learning;generative models;model-based reinforcement learning;causal inference", "primary_area": "", "supplementary_material": "", "author": "Lars Buesing;Theophane Weber;Yori Zwols;Nicolas Heess;Sebastien Racaniere;Arthur Guez;Jean-Baptiste Lespiau", "authorids": "lbuesing@google.com;theophane@google.com;yori@google.com;heess@google.com;sracaniere@google.com;aguez@google.com;jblespiau@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nbuesing2018woulda,\ntitle={Woulda, Coulda, Shoulda: Counterfactually-Guided Policy Search},\nauthor={Lars Buesing and Theophane Weber and Yori Zwols and Nicolas Heess and Sebastien Racaniere and Arthur Guez and Jean-Baptiste Lespiau},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJG0voC9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;3;2", "wc_review": "392;431;326", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "101;136;158", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 383.0, 43.3358973600409 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 131.66666666666666, 23.471022323045258 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 166, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12981655284011501176&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BJG0voC9YQ", "pdf": "https://openreview.net/pdf?id=BJG0voC9YQ", "email": ";;;;;;", "author_num": 7 }, { "id": "BJGVX3CqYm", "title": "Mixed Precision Quantization of ConvNets via Differentiable Neural Architecture Search", "track": "main", "status": "Reject", "tldr": "A novel differentiable neural architecture search framework for mixed quantization of ConvNets.", "abstract": "Recent work in network quantization has substantially reduced the time and space complexity of neural network inference, enabling their deployment on embedded and mobile devices with limited computational and memory resources. However, existing quantization methods often represent all weights and activations with the same precision (bit-width). In this paper, we explore a new dimension of the design space: quantizing different layers with different bit-widths. We formulate this problem as a neural architecture search problem and propose a novel differentiable neural architecture search (DNAS) framework to efficiently explore its exponential search space with gradient-based optimization. Experiments show we surpass the state-of-the-art compression of ResNet on CIFAR-10 and ImageNet. 
Our quantized models with 21.1x smaller model size or 103.9x lower computational cost can still outperform baseline quantized or even full precision models.", "keywords": "Neural Net Quantization;Neural Architecture Search", "primary_area": "", "supplementary_material": "", "author": "Bichen Wu;Yanghan Wang;Peizhao Zhang;Yuandong Tian;Peter Vajda;Kurt Keutzer", "authorids": ";yanghan@instagram.com;;yuandong@fb.com;;", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nwu2019mixed,\ntitle={Mixed Precision Quantization of ConvNets via Differentiable Neural Architecture Search},\nauthor={Bichen Wu and Yanghan Wang and Peizhao Zhang and Yuandong Tian and Peter Vajda and Kurt Keutzer},\nyear={2019},\nurl={https://openreview.net/forum?id=BJGVX3CqYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=BJGVX3CqYm", "pdf_size": 0, "rating": "5;6;6;7", "confidence": "5;3;3;3", "wc_review": "441;262;221;667", "wc_reply_reviewers": "154;0;0;0", "wc_reply_authors": "1229;88;147;637", "reply_reviewers": "2;0;0;0", "reply_authors": "4;1;1;1", "rating_avg": [ 6.0, 0.7071067811865476 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "wc_review_avg": [ 397.75, 176.0928377305562 ], "wc_reply_reviewers_avg": [ 38.5, 66.68395609140178 ], "wc_reply_authors_avg": [ 525.25, 458.80626357973796 ], "reply_reviewers_avg": [ 0.5, 0.8660254037844386 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.816496580927726, "gs_citation": 336, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=750753856500160215&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJG__i0qF7", "title": "Learning to encode spatial relations from natural language", "track": "main", "status": "Reject", "tldr": "We introduce a system capable of capturing the semantics of spatial relations by grounding representation learning in vision.", "abstract": "Natural language processing has made significant inroads into learning the semantics of words through distributional approaches, however representations learnt via these methods fail to capture certain kinds of information implicit in the real world. In particular, spatial relations are encoded in a way that is inconsistent with human spatial reasoning and lacking invariance to viewpoint changes. We present a system capable of capturing the semantics of spatial relations such as behind, left of, etc from natural language. Our key contributions are a novel multi-modal objective based on generating images of scenes from their textual descriptions, and a new dataset on which to train it. We demonstrate that internal representations are robust to meaning preserving transformations of descriptions (paraphrase invariance), while viewpoint invariance is an emergent property of the system.", "keywords": "generative model;grounded language;scene understanding;natural language", "primary_area": "", "supplementary_material": "", "author": "Tiago Ramalho;Tomas Kocisky\u200e;Frederic Besse;S. M. 
Ali Eslami;Gabor Melis;Fabio Viola;Phil Blunsom;Karl Moritz Hermann", "authorids": "tiago.mpramalho@gmail.com;tkocisky@google.com;fbesse@google.com;aeslami@google.com;melisgl@google.com;fviola@google.com;pblunsom@google.com;kmh@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nramalho2019learning,\ntitle={Learning to encode spatial relations from natural language},\nauthor={Tiago Ramalho and Tomas Kocisky\u200e and Frederic Besse and S. M. Ali Eslami and Gabor Melis and Fabio Viola and Phil Blunsom and Karl Moritz Hermann},\nyear={2019},\nurl={https://openreview.net/forum?id=BJG__i0qF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJG__i0qF7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;5", "wc_review": "395;357;566", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "809;347;356", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 439.3333333333333, 90.90043393124638 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 504.0, 215.69886416019904 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14003421109776115958&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BJGfCjA5FX", "title": "PAIRWISE AUGMENTED GANS WITH ADVERSARIAL RECONSTRUCTION LOSS", "track": "main", "status": "Reject", "tldr": "We propose a novel autoencoding model with augmented adversarial reconstruction loss. We intoduce new metric for content-based assessment of reconstructions. ", "abstract": "We propose a novel autoencoding model called Pairwise Augmented GANs. We train a generator and an encoder jointly and in an adversarial manner. The generator network learns to sample realistic objects. In turn, the encoder network at the same time is trained to map the true data distribution to the prior in latent space. To ensure good reconstructions, we introduce an augmented adversarial reconstruction loss. Here we train a discriminator to distinguish two types of pairs: an object with its augmentation and the one with its reconstruction. We show that such adversarial loss compares objects based on the content rather than on the exact match. We experimentally demonstrate that our model generates samples and reconstructions of quality competitive with state-of-the-art on datasets MNIST, CIFAR10, CelebA and achieves good quantitative results on CIFAR10. 
", "keywords": "Computer vision;Deep learning;Unsupervised Learning;Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Aibek Alanov;Max Kochurov;Daniil Yashkov;Dmitry Vetrov", "authorids": "alanov.aibek@gmail.com;maxim.v.kochurov@gmail.com;daniil.yashkov@phystech.edu;vetrodim@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nalanov2019pairwise,\ntitle={{PAIRWISE} {AUGMENTED} {GANS} {WITH} {ADVERSARIAL} {RECONSTRUCTION} {LOSS}},\nauthor={Aibek Alanov and Max Kochurov and Daniil Yashkov and Dmitry Vetrov},\nyear={2019},\nurl={https://openreview.net/forum?id=BJGfCjA5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJGfCjA5FX", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;4", "wc_review": "133;523;299", "wc_reply_reviewers": "0;101;0", "wc_reply_authors": "562;363;697", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 318.3333333333333, 159.80265607582652 ], "wc_reply_reviewers_avg": [ 33.666666666666664, 47.6118565998942 ], "wc_reply_authors_avg": [ 540.6666666666666, 137.18681504511366 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12051026326801981421&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BJGjOi09t7", "title": "A Variational Autoencoder for Probabilistic Non-Negative Matrix Factorisation", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce and demonstrate the variational autoencoder (VAE) for probabilistic non-negative matrix factorisation (PAE-NMF). We design a network which can perform non-negative matrix factorisation (NMF) and add in aspects of a VAE to make the coefficients of the latent space probabilistic. By restricting the weights in the final layer of the network to be non-negative and using the non-negative Weibull distribution we produce a probabilistic form of NMF which allows us to generate new data and find a probability distribution that effectively links the latent and input variables. 
We demonstrate the effectiveness of PAE-NMF on three heterogeneous datasets: images, financial time series and genomic.", "keywords": "Non-negative matrix factorisation;Variational autoencoder;Probabilistic", "primary_area": "", "supplementary_material": "", "author": "Steven Squires;Adam Prugel-Bennett;Mahesan Niranjan", "authorids": "ses2g14@ecs.soton.ac.uk;apb@ecs.soton.ac.uk;mn@ecs.soton.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsquires2019a,\ntitle={A Variational Autoencoder for Probabilistic Non-Negative Matrix Factorisation},\nauthor={Steven Squires and Adam Prugel-Bennett and Mahesan Niranjan},\nyear={2019},\nurl={https://openreview.net/forum?id=BJGjOi09t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJGjOi09t7", "pdf_size": 0, "rating": "4;4;7", "confidence": "3;5;5", "wc_review": "286;165;285", "wc_reply_reviewers": "0;0;5", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;1", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 245.33333333333334, 56.805711762893075 ], "wc_reply_reviewers_avg": [ 1.6666666666666667, 2.357022603955158 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18243982250746394976&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BJMvBjC5YQ", "title": "Cutting Down Training Memory by Re-fowarding", "track": "main", "status": "Reject", "tldr": "This paper proposes fundamental theory and optimal algorithms for DNN training, which reduce up to 80% of training memory for popular DNNs.", "abstract": "Deep Neutral Networks(DNNs) require huge GPU memory when training on modern image/video databases. Unfortunately, the GPU memory as a hardware resource is always finite, which limits the image resolution, batch size, and learning rate that could be used for better DNN performance. In this paper, we propose a novel training approach, called Re-forwarding, that substantially reduces memory usage in training. Our approach automatically finds a subset of vertices in a DNN computation graph, and stores tensors only at these vertices during the first forward. During backward, extra local forwards (called the Re-forwarding process) are conducted to compute the missing tensors between the subset of vertices. The total memory cost becomes the sum of (1) the memory cost at the subset of vertices and (2) the maximum memory cost among local re-forwards. Re-forwarding trades training time overheads for memory and does not compromise any performance in testing. We propose theories and algorithms that achieve the optimal memory solutions for DNNs with either linear or arbitrary computation graphs. 
Experiments show that Re-forwarding cuts down up-to 80% of training memory on popular DNNs such as Alexnet, VGG, ResNet, Densenet and Inception net.", "keywords": "deep learning;training memory;computation-memory trade off;optimal solution", "primary_area": "", "supplementary_material": "", "author": "Jianwei Feng;Dong Huang", "authorids": "jfeng1@andrew.cmu.edu;donghuang@cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nfeng2019cutting,\ntitle={Cutting Down Training Memory by Re-fowarding},\nauthor={Jianwei Feng and Dong Huang},\nyear={2019},\nurl={https://openreview.net/forum?id=BJMvBjC5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer5;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=BJMvBjC5YQ", "pdf_size": 0, "rating": "4;4;6;6", "confidence": "2;3;3;3", "wc_review": "459;270;297;595", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "956;548;302;610", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "rating_avg": [ 5.0, 1.0 ], "confidence_avg": [ 2.75, 0.4330127018922193 ], "wc_review_avg": [ 405.25, 131.2485714207968 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 604.0, 233.6022260167912 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.5773502691896257, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9958628561569304794&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 0 }, { "id": "BJWfW2C9Y7", "title": "Predictive Local Smoothness for Stochastic Gradient Methods", "track": "main", "status": "Reject", "tldr": "", "abstract": "Stochastic gradient methods are dominant in nonconvex optimization especially for deep models but have low asymptotical convergence due to the fixed smoothness. To address this problem, we propose a simple yet effective method for improving stochastic gradient methods named predictive local smoothness (PLS). First, we create a convergence condition to build a learning rate varied adaptively with local smoothness. Second, the local smoothness can be predicted by the latest gradients. Third, we use the adaptive learning rate to update the stochastic gradients for exploring linear convergence rates. By applying the PLS method, we implement new variants of three popular algorithms: PLS-stochastic gradient descent (PLS-SGD), PLS-accelerated SGD (PLS-AccSGD), and PLS-AMSGrad. Moreover, we provide much simpler proofs to ensure their linear convergence. 
Empirical results show that our variants have better performance gains than the popular algorithms, such as, faster convergence and alleviating explosion and vanish of gradients.", "keywords": "stochastic gradient method;local smoothness;linear system;AMSGrad", "primary_area": "", "supplementary_material": "", "author": "Jun Li;Hongfu Liu;Bineng Zhong;Yue Wu;Yun Fu", "authorids": "junl.mldl@gmail.com;hongfuliu@brandeis.edu;bnzhong@gmail.com;wuyuebupt@gmail.com;yunfu@ece.neu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019predictive,\ntitle={Predictive Local Smoothness for Stochastic Gradient Methods},\nauthor={Jun Li and Hongfu Liu and Bineng Zhong and Yue Wu and Yun Fu},\nyear={2019},\nurl={https://openreview.net/forum?id=BJWfW2C9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJWfW2C9Y7", "pdf_size": 0, "rating": "2;2;3;4", "confidence": "5;4;3;5", "wc_review": "210;311;156;154", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 2.75, 0.82915619758885 ], "confidence_avg": [ 4.25, 0.82915619758885 ], "wc_review_avg": [ 207.75, 63.7039049038597 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0909090909090909, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3453353538623249540&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "signSGD via Zeroth-Order Oracle", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/871", "id": "BJe-DsC5Fm", "author_site": "Sijia Liu, Pin-Yu Chen, Xiangyi Chen, Mingyi Hong", "tldr": "We design and analyze a new zeroth-order stochastic optimization algorithm, ZO-signSGD, and demonstrate its connection and application to black-box adversarial attacks in robust deep learning", "abstract": "In this paper, we design and analyze a new zeroth-order (ZO) stochastic optimization algorithm, ZO-signSGD, which enjoys dual advantages of gradient-free operations and signSGD. The latter requires only the sign information of gradient estimates but is able to achieve a comparable or even better convergence speed than SGD-type algorithms. Our study shows that ZO signSGD requires $\\sqrt{d}$ times more iterations than signSGD, leading to a convergence rate of $O(\\sqrt{d}/\\sqrt{T})$ under mild conditions, where $d$ is the number of optimization variables, and $T$ is the number of iterations. In addition, we analyze the effects of different types of gradient estimators on the convergence of ZO-signSGD, and propose two variants of ZO-signSGD that at least achieve $O(\\sqrt{d}/\\sqrt{T})$ convergence rate. On the application side we explore the connection between ZO-signSGD and black-box adversarial attacks in robust deep learning. 
Our empirical evaluations on image classification datasets MNIST and CIFAR-10 demonstrate the superior performance of ZO-signSGD on the generation of adversarial examples from black-box neural networks.", "keywords": "nonconvex optimization;zeroth-order algorithm;black-box adversarial attack", "primary_area": "", "supplementary_material": "", "author": "Sijia Liu;Pin-Yu Chen;Xiangyi Chen;Mingyi Hong", "authorids": "sijia.liu@ibm.com;pin-yu.chen@ibm.com;chen5719@umn.edu;mhong@umn.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nliu2018signsgd,\ntitle={sign{SGD} via Zeroth-Order Oracle},\nauthor={Sijia Liu and Pin-Yu Chen and Xiangyi Chen and Mingyi Hong},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJe-DsC5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "2;5;3", "wc_review": "222;534;148", "wc_reply_reviewers": "0;103;0", "wc_reply_authors": "588;2285;371", "reply_reviewers": "0;1;0", "reply_authors": "1;5;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 301.3333333333333, 167.27090468923626 ], "wc_reply_reviewers_avg": [ 34.333333333333336, 48.554665641476255 ], "wc_reply_authors_avg": [ 1081.3333333333333, 855.7189310099952 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.32732683535398854, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4622909249384970113&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BJe-DsC5Fm", "pdf": "https://openreview.net/pdf?id=BJe-DsC5Fm", "email": ";;;", "author_num": 4 }, { "id": "BJe-Sn0ctm", "title": "Ain't Nobody Got Time for Coding: Structure-Aware Program Synthesis from Natural Language", "track": "main", "status": "Reject", "tldr": "We generate source code based on short descriptions in natural language, using deep neural networks.", "abstract": "Program synthesis from natural language (NL) is practical for humans and, once technically feasible, would significantly facilitate software development and revolutionize end-user programming. We present SAPS, an end-to-end neural network capable of mapping relatively complex, multi-sentence NL specifications to snippets of executable code. The proposed architecture relies exclusively on neural components, and is built upon a tree2tree autoencoder trained on abstract syntax trees, combined with a pretrained word embedding and a bi-directional multi-layer LSTM for NL processing. The decoder features a doubly-recurrent LSTM with a novel signal propagation scheme and soft attention mechanism. When applied to a large dataset of problems proposed in a previous study, SAPS performs on par with or better than the method proposed there, producing correct programs in over 90% of cases. In contrast to other methods, it does not involve any non-neural components to post-process the resulting programs, and uses a fixed-dimensional latent representation as the only link between the NL analyzer and source code generator. 
", "keywords": "Program synthesis;tree2tree autoencoders;soft attention;doubly-recurrent neural networks;LSTM;nlp2tree", "primary_area": "", "supplementary_material": "", "author": "Jakub Bednarek;Karol Piaskowski;Krzysztof Krawiec", "authorids": "jakub.bednarek@put.poznan.pl;kar.piaskowski@gmail.com;krawiec@cs.put.poznan.pl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbednarek2019aint,\ntitle={Ain't Nobody Got Time for Coding: Structure-Aware Program Synthesis from Natural Language},\nauthor={Jakub Bednarek and Karol Piaskowski and Krzysztof Krawiec},\nyear={2019},\nurl={https://openreview.net/forum?id=BJe-Sn0ctm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJe-Sn0ctm", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;3;4", "wc_review": "812;1265;432", "wc_reply_reviewers": "304;119;93", "wc_reply_authors": "740;797;769", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 836.3333333333334, 340.5058328754769 ], "wc_reply_reviewers_avg": [ 172.0, 93.93969696920821 ], "wc_reply_authors_avg": [ 768.6666666666666, 23.27134623427608 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17512724718309247784&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Preventing Posterior Collapse with delta-VAEs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1047", "id": "BJe0Gn0cY7", "author_site": "Ali Razavi, Aaron van den Oord, Ben Poole, Oriol Vinyals", "tldr": " Avoid posterior collapse by lower bounding the rate.", "abstract": "Due to the phenomenon of \u201cposterior collapse,\u201d current latent variable generative models pose a challenging design choice that either weakens the capacity of the decoder or requires altering the training objective. We develop an alternative that utilizes the most powerful generative models as decoders, optimize the variational lower bound, and ensures that the latent variables preserve and encode useful information. Our proposed \u03b4-VAEs achieve this by constraining the variational family for the posterior to have a minimum distance to the prior. For sequential latent variable models, our approach resembles the classic representation learning approach of slow feature analysis. 
We demonstrate our method\u2019s efficacy at modeling text on LM1B and modeling images: learning representations, improving sample quality, and achieving state of the art log-likelihood on CIFAR-10 and ImageNet 32 \u00d7 32.", "keywords": "Posterior Collapse;VAE;Autoregressive Models", "primary_area": "", "supplementary_material": "", "author": "Ali Razavi;Aaron van den Oord;Ben Poole;Oriol Vinyals", "authorids": "alirazavi@google.com;avdnoord@google.com;pooleb@google.com;vinyals@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nrazavi2018preventing,\ntitle={Preventing Posterior Collapse with delta-{VAE}s},\nauthor={Ali Razavi and Aaron van den Oord and Ben Poole and Oriol Vinyals},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJe0Gn0cY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;4", "wc_review": "334;207;379", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "224;297;263", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 306.6666666666667, 72.83009146103156 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 261.3333333333333, 29.825417944356715 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 214, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11040116821853696419&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BJe0Gn0cY7", "pdf": "https://openreview.net/pdf?id=BJe0Gn0cY7", "email": ";;;", "author_num": 4 }, { "title": "Algorithmic Framework for Model-based Deep Reinforcement Learning with Theoretical Guarantees", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/887", "id": "BJe1E2R5KX", "author_site": "Yuping Luo, Huazhe Xu, Yuanzhi Li, Yuandong Tian, Trevor Darrell, Tengyu Ma", "tldr": "We design model-based reinforcement learning algorithms with theoretical guarantees and achieve state-of-the-art results on Mujuco benchmark tasks when one million or fewer samples are permitted.", "abstract": "Model-based reinforcement learning (RL) is considered to be a promising approach to reduce the sample complexity that hinders model-free RL. However, the theoretical understanding of such methods has been rather limited. This paper introduces a novel algorithmic framework for designing and analyzing model-based RL algorithms with theoretical guarantees. We design a meta-algorithm with a theoretical guarantee of monotone improvement to a local maximum of the expected reward. The meta-algorithm iteratively builds a lower bound of the expected reward based on the estimated dynamical model and sample trajectories, and then maximizes the lower bound jointly over the policy and the model. The framework extends the optimism-in-face-of-uncertainty principle to non-linear dynamical models in a way that requires no explicit uncertainty quantification. 
Instantiating our framework with simplification gives a variant of model-based RL algorithms Stochastic Lower Bounds Optimization (SLBO). Experiments demonstrate that SLBO achieves the state-of-the-art performance when only 1M or fewer samples are permitted on a range of continuous control benchmark tasks.", "keywords": "model-based reinforcement learning;sample efficiency;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Yuping Luo;Huazhe Xu;Yuanzhi Li;Yuandong Tian;Trevor Darrell;Tengyu Ma", "authorids": "yupingl@cs.princeton.edu;huazhe_xu@eecs.berkeley.edu;yuanzhili92@gmail.com;yuandong@fb.com;trevor@eecs.berkeley.edu;tengyuma@stanford.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nluo2018algorithmic,\ntitle={Algorithmic Framework for Model-based Deep Reinforcement Learning with Theoretical Guarantees},\nauthor={Yuping Luo and Huazhe Xu and Yuanzhi Li and Yuandong Tian and Trevor Darrell and Tengyu Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJe1E2R5KX},\n}", "github": "[![github](/images/github_icon.svg) roosephu/slbo](https://github.com/roosephu/slbo) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BJe1E2R5KX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "pdf_size": 0, "rating": "6;6;7", "confidence": "2;4;4", "wc_review": "321;407;462", "wc_reply_reviewers": "0;0;12", "wc_reply_authors": "881;1274;999", "reply_reviewers": "0;0;1", "reply_authors": "2;2;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 396.6666666666667, 58.02489887013065 ], "wc_reply_reviewers_avg": [ 4.0, 5.656854249492381 ], "wc_reply_authors_avg": [ 1051.3333333333333, 164.65384565472164 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 270, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3175696566467828309&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BJe1E2R5KX", "pdf": "https://openreview.net/pdf?id=BJe1E2R5KX", "email": ";;;;;", "author_num": 6 }, { "id": "BJe1hsCcYQ", "title": "Lorentzian Distance Learning", "track": "main", "status": "Reject", "tldr": "A distance learning approach to learn hyperbolic representations", "abstract": "This paper introduces an approach to learn representations based on the Lorentzian distance in hyperbolic geometry. Hyperbolic geometry is especially suited to hierarchically-structured datasets, which are prevalent in the real world. Current hyperbolic representation learning methods compare examples with the Poincar\\'e distance metric. They formulate the problem as minimizing the distance of each node in a hierarchy with its descendants while maximizing its distance with other nodes. This formulation produces node representations close to the centroid of their descendants. We exploit the fact that the centroid w.r.t the squared Lorentzian distance can be written in closed-form. 
We show that the Euclidean norm of such a centroid decreases as the curvature of the hyperbolic space decreases. This property makes it appropriate to represent hierarchies where parent nodes minimize the distances to their descendants and have smaller Euclidean norm than their children. Our approach obtains state-of-the-art results in retrieval and classification tasks on different datasets. ", "keywords": "distance learning;metric learning;hyperbolic geometry;hierarchy tree", "primary_area": "", "supplementary_material": "", "author": "Marc T Law;Jake Snell;Richard S Zemel", "authorids": "law@cs.toronto.edu;jsnell@cs.toronto.edu;zemel@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlaw2019lorentzian,\ntitle={Lorentzian Distance Learning},\nauthor={Marc T Law and Jake Snell and Richard S Zemel},\nyear={2019},\nurl={https://openreview.net/forum?id=BJe1hsCcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJe1hsCcYQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "wc_review": "451;503;506", "wc_reply_reviewers": "163;0;0", "wc_reply_authors": "1126;792;809", "reply_reviewers": "3;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 486.6666666666667, 25.249862485874168 ], "wc_reply_reviewers_avg": [ 54.333333333333336, 76.83893688893816 ], "wc_reply_authors_avg": [ 909.0, 153.5990451359209 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4828230598624626855&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "BJe8niAqKX", "title": "Learning Grounded Sentence Representations by Jointly Using Video and Text Information", "track": "main", "status": "Withdraw", "tldr": "We propose a joint model to incorporate visual knowledge in sentence representations", "abstract": "Visual grounding of language is an active research field aiming at enriching text-based representations with visual information. In this paper, we propose a new way to leverage visual knowledge for sentence representations. Our approach transfers the structure of a visual representation space to the textual space by using two complementary sources of information: (1) the cluster information: the implicit knowledge that two sentences associated with the same visual content describe the same underlying reality and (2) the perceptual information contained within the structure of the visual space. We use a joint approach to encourage beneficial interactions during training between textual, perceptual, and cluster information. 
We demonstrate the quality of the learned representations on semantic relatedness, classification, and cross-modal retrieval tasks.", "keywords": "multimodal;sentence;representation;embedding;grounding", "primary_area": "", "supplementary_material": "", "author": "Patrick Bordes;Eloi Zablocki;Laure Soulier;Benjamin Piwowarski;Patrick Gallinari", "authorids": "patrick.bordes@lip6.fr;eloi.zablocki@gmail.com;laure.soulier@lip6.fr;benjamin.piwowarski@lip6.fr;patrick.gallinari@lip6.fr", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJe8niAqKX", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;4;4", "wc_review": "428;77;568", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 357.6666666666667, 206.52737241236466 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nZUZ8Df6nMUJ:scholar.google.com/&scioq=Learning+Grounded+Sentence+Representations+by+Jointly+Using+Video+and+Text+Information&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Knowledge Flow: Improve Upon Your Teachers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/929", "id": "BJeOioA9Y7", "author_site": "Iou-Jen Liu, Jian Peng, Alex Schwing", "tldr": "\u2018Knowledge Flow\u2019 trains a deep net (student) by injecting information from multiple nets (teachers). The student is independent upon training and performs very well on learned tasks irrespective of the setting (reinforcement or supervised learning).", "abstract": "A zoo of deep nets is available these days for almost any given task, and it is increasingly unclear which net to start with when addressing a new task, or which net to use as an initialization for fine-tuning a new model. To address this issue, in this paper, we develop knowledge flow which moves \u2018knowledge\u2019 from multiple deep nets, referred to as teachers, to a new deep net model, called the student. The structure of the teachers and the student can differ arbitrarily and they can be trained on entirely different tasks with different output spaces too. Upon training with knowledge flow the student is independent of the teachers. 
We demonstrate our approach on a variety of supervised and reinforcement learning tasks, outperforming fine-tuning and other \u2018knowledge exchange\u2019 methods.\n\n", "keywords": "Transfer Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Iou-Jen Liu;Jian Peng;Alexander Schwing", "authorids": "iliu3@illinois.edu;jianpeng@illinois.edu;aschwing@illinois.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nliu2018knowledge,\ntitle={Knowledge Flow: Improve Upon Your Teachers},\nauthor={Iou-Jen Liu and Jian Peng and Alexander Schwing},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJeOioA9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;5", "wc_review": "90;325;385", "wc_reply_reviewers": "153;0;106", "wc_reply_authors": "288;507;554", "reply_reviewers": "1;0;1", "reply_authors": "2;1;2", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 266.6666666666667, 127.30104302618874 ], "wc_reply_reviewers_avg": [ 86.33333333333333, 63.99131885567673 ], "wc_reply_authors_avg": [ 449.6666666666667, 115.91471960981582 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12134336518601639221&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BJeOioA9Y7", "pdf": "https://openreview.net/pdf?id=BJeOioA9Y7", "email": ";;", "author_num": 3 }, { "id": "BJeRg205Fm", "title": "Neural Network Regression with Beta, Dirichlet, and Dirichlet-Multinomial Outputs", "track": "main", "status": "Reject", "tldr": "Neural network regression should use Dirichlet output distribution when targets are probabilities in order to quantify uncertainty of predictions.", "abstract": "We propose a method for quantifying uncertainty in neural network regression models when the targets are real values on a $d$-dimensional simplex, such as probabilities. We show that each target can be modeled as a sample from a Dirichlet distribution, where the parameters of the Dirichlet are provided by the output of a neural network, and that the combined model can be trained using the gradient of the data likelihood. This approach provides interpretable predictions in the form of multidimensional distributions, rather than point estimates, from which one can obtain confidence intervals or quantify risk in decision making. Furthermore, we show that the same approach can be used to model targets in the form of empirical counts as samples from the Dirichlet-multinomial compound distribution. 
In experiments, we verify that our approach provides these benefits without harming the performance of the point estimate predictions on two diverse applications: (1) distilling deep convolutional networks trained on CIFAR-100, and (2) predicting the location of particle collisions in the XENON1T Dark Matter detector.", "keywords": "regression;uncertainty;deep learning", "primary_area": "", "supplementary_material": "", "author": "Peter Sadowski;Pierre Baldi", "authorids": "peter.sadowski@hawaii.edu;pfbaldi@ics.uci.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsadowski2019neural,\ntitle={Neural Network Regression with Beta, Dirichlet, and Dirichlet-Multinomial Outputs},\nauthor={Peter Sadowski and Pierre Baldi},\nyear={2019},\nurl={https://openreview.net/forum?id=BJeRg205Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJeRg205Fm", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;4", "wc_review": "176;132;702", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "128;0;0", "reply_reviewers": "0;0;0", "reply_authors": "1;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 336.6666666666667, 258.9534492701128 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 42.666666666666664, 60.339778661252055 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12397387136810848392&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BJeWOi09FQ", "title": "SHAMANN: Shared Memory Augmented Neural Networks", "track": "main", "status": "Reject", "tldr": "Multiple virtual actors cooperating through shared memory solve medical image segmentation.", "abstract": "Current state-of-the-art methods for semantic segmentation use deep neural networks to learn the segmentation mask from the input image signal as an image-to-image mapping. While these methods effectively exploit global image context, the learning and computational complexities are high. We propose shared memory augmented neural network actors as a dynamically scalable alternative. Based on a decomposition of the image into a sequence of local patches, we train such actors to sequentially segment each patch. To further increase the robustness and better capture shape priors, an external memory module is shared between different actors, providing an implicit mechanism for image information exchange. Finally, the patch-wise predictions are aggregated to a complete segmentation mask. We demonstrate the benefits of the new paradigm on a challenging lung segmentation problem based on chest X-Ray images, as well as on two synthetic tasks based on the MNIST dataset. On the X-Ray data, our method achieves state-of-the-art accuracy with a significantly reduced model size compared to reference methods. In addition, we reduce the number of failure cases by at least half.", "keywords": "memory networks;deep learning;medical image segmentation", "primary_area": "", "supplementary_material": "", "author": "Cosmin I. Bercea;Olivier Pauly;Andreas K. Maier;Florin C. 
Ghesu", "authorids": "cosmin.bercea@fau.de;olivier.pauly@gmail.com;andreas.maier@fau.de;florin.ghesu@siemens-healthineers.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbercea2019shamann,\ntitle={{SHAMANN}: Shared Memory Augmented Neural Networks},\nauthor={Cosmin I. Bercea and Olivier Pauly and Andreas K. Maier and Florin C. Ghesu},\nyear={2019},\nurl={https://openreview.net/forum?id=BJeWOi09FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJeWOi09FQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;5;3", "wc_review": "369;362;296", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 342.3333333333333, 32.8870119584549 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14350144751362110462&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Directed-Info GAIL: Learning Hierarchical Policies from Unsegmented Demonstrations using Directed Information", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1072", "id": "BJeWUs05KQ", "author_site": "Mohit Sharma, Arjun Sharma, Nicholas Rhinehart, Kris M Kitani", "tldr": "Learning Hierarchical Policies from Unsegmented Demonstrations using Directed Information", "abstract": "The use of imitation learning to learn a single policy for a complex task that has multiple modes or hierarchical structure can be challenging. In fact, previous work has shown that when the modes are known, learning separate policies for each mode or sub-task can greatly improve the performance of imitation learning. In this work, we discover the interaction between sub-tasks from their resulting state-action trajectory sequences using a directed graphical model. We propose a new algorithm based on the generative adversarial imitation learning framework which automatically learns sub-task policies from unsegmented demonstrations. Our approach maximizes the directed information flow in the graphical model between sub-task latent variables and their generated trajectories. We also show how our approach connects with the existing Options framework, which is commonly used to learn hierarchical policies.", "keywords": "Imitation Learning;Reinforcement Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Mohit Sharma;Arjun Sharma;Nicholas Rhinehart;Kris M. Kitani", "authorids": "mohits1@andrew.cmu.edu;arjuns2@andrew.cmu.edu;nrhineha@cs.cmu.edu;kkitani@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsharma2018directedinfo,\ntitle={Directed-Info {GAIL}: Learning Hierarchical Policies from Unsegmented Demonstrations using Directed Information},\nauthor={Mohit Sharma and Arjun Sharma and Nicholas Rhinehart and Kris M. 
Kitani},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJeWUs05KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;4;4", "wc_review": "319;493;191", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1307;791;177", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 334.3333333333333, 123.7668058173201 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 758.3333333333334, 461.898497748393 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5359860145732970662&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJeWUs05KQ", "pdf": "https://openreview.net/pdf?id=BJeWUs05KQ", "email": ";;;", "author_num": 4 }, { "id": "BJeY6sR9KX", "title": "Aligning Artificial Neural Networks to the Brain yields Shallow Recurrent Architectures", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep artificial neural networks with spatially repeated processing (a.k.a., deep convolutional ANNs) have been established as the best class of candidate models of visual processing in the primate ventral visual processing stream. Over the past five years, these ANNs have evolved from a simple feedforward eight-layer architecture in AlexNet to extremely deep and branching NASNet architectures, demonstrating increasingly better object categorization performance. Here we ask, as ANNs have continued to evolve in performance, are they also strong candidate models for the brain? To answer this question, we developed Brain-Score, a composite of neural and behavioral benchmarks for determining how brain-like a model is, together with an online platform where models can receive a Brain-Score and compare against other models. \nDespite high scores, typical deep models from the machine learning community are often hard to map onto the brain's anatomy due to their vast number of layers and missing biologically-important connections, such as recurrence. To further map onto anatomy and validate our approach, we built CORnet-S: an ANN guided by Brain-Score with the anatomical constraints of compactness and recurrence. Although a shallow model with four anatomically mapped areas and recurrent connectivity, CORnet-S is a top model on Brain-Score and outperforms similarly compact models on ImageNet. Analyzing CORnet-S circuitry variants revealed recurrence as the main predictive factor of both Brain-Score and ImageNet top-1 performance.\n", "keywords": "Computational Neuroscience;Brain-Inspired;Neural Networks;Simplified Models;Recurrent Neural Networks;Computer Vision", "primary_area": "", "supplementary_material": "", "author": "Jonas Kubilius;Martin Schrimpf;Ha Hong;Najib J. Majaj;Rishi Rajalingham;Elias B. Issa;Kohitij Kar;Pouya Bashivan;Jonathan Prescott-Roy;Kailyn Schmidt;Aran Nayebi;Daniel Bear;Daniel L. K. Yamins;James J. 
DiCarlo", "authorids": ";;;;;;;;;;;;;", "gender": ";;;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;;", "aff": ";;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;", "position": ";;;;;;;;;;;;;", "bibtex": "@misc{\nkubilius2019aligning,\ntitle={Aligning Artificial Neural Networks to the Brain yields Shallow Recurrent Architectures},\nauthor={Jonas Kubilius and Martin Schrimpf and Ha Hong and Najib J. Majaj and Rishi Rajalingham and Elias B. Issa and Kohitij Kar and Pouya Bashivan and Jonathan Prescott-Roy and Kailyn Schmidt and Aran Nayebi and Daniel Bear and Daniel L. K. Yamins and James J. DiCarlo},\nyear={2019},\nurl={https://openreview.net/forum?id=BJeY6sR9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJeY6sR9KX", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;4", "wc_review": "412;415;363", "wc_reply_reviewers": "88;12;7", "wc_reply_authors": "654;475;913", "reply_reviewers": "1;1;1", "reply_authors": "4;2;2", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 396.6666666666667, 23.83741223837483 ], "wc_reply_reviewers_avg": [ 35.666666666666664, 37.06151043273271 ], "wc_reply_authors_avg": [ 680.6666666666666, 179.80421450999293 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 14, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pmYbHnMfUxAJ:scholar.google.com/&scioq=Aligning+Artificial+Neural+Networks+to+the+Brain+yields+Shallow+Recurrent+Architectures&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "BJeapjA5FX", "title": "GEOMETRIC AUGMENTATION FOR ROBUST NEURAL NETWORK CLASSIFIERS", "track": "main", "status": "Reject", "tldr": "We develop a statistical-geometric unsupervised learning augmentation framework for deep neural networks to make them robust to adversarial attacks.", "abstract": "We introduce a novel geometric perspective and unsupervised model augmentation framework for transforming traditional deep (convolutional) neural networks into adversarially robust classifiers. Class-conditional probability densities based on Bayesian nonparametric mixtures of factor analyzers (BNP-MFA) over the input space are used to design soft decision labels for feature to label isometry. Classconditional distributions over features are also learned using BNP-MFA to develop plug-in maximum a posterior (MAP) classifiers to replace the traditional multinomial logistic softmax classification layers. This novel unsupervised augmented framework, which we call geometrically robust networks (GRN), is applied to CIFAR-10, CIFAR-100, and to Radio-ML (a time series dataset for radio modulation recognition). We demonstrate the robustness of GRN models to adversarial attacks from fast gradient sign method, Carlini-Wagner, and projected gradient descent.", "keywords": "Bayesian nonparametric;robust;deep neural network;classifier;unsupervised learning;geometric", "primary_area": "", "supplementary_material": "", "author": "Robert M. 
Taylor;Yusong Tan", "authorids": "rtaylor@mitre.org;ytan@mitre.org", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntaylor2019geometric,\ntitle={{GEOMETRIC} {AUGMENTATION} {FOR} {ROBUST} {NEURAL} {NETWORK} {CLASSIFIERS}},\nauthor={Robert M. Taylor and Yusong Tan},\nyear={2019},\nurl={https://openreview.net/forum?id=BJeapjA5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=BJeapjA5FX", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;3", "wc_review": "147;246;292", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 228.33333333333334, 60.49977043115306 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:C7hdPe1wC3kJ:scholar.google.com/&scioq=GEOMETRIC+AUGMENTATION+FOR+ROBUST+NEURAL+NETWORK+CLASSIFIERS&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "BJeem3C9F7", "title": "Pix2Scene: Learning Implicit 3D Representations from Images", "track": "main", "status": "Reject", "tldr": "pix2scene: a deep generative based approach for implicitly modelling the geometrical properties of a 3D scene from images", "abstract": "Modelling 3D scenes from 2D images is a long-standing problem in computer vision with implications in, e.g., simulation and robotics. We propose pix2scene, a deep generative-based approach that implicitly models the geometric properties of a scene from images. Our method learns the depth and orientation of scene points visible in images. Our model can then predict the structure of a scene from various, previously unseen view points. It relies on a bi-directional adversarial learning mechanism to generate scene representations from a latent code, inferring the 3D representation of the underlying scene geometry. We showcase a novel differentiable renderer to train the 3D model in an end-to-end fashion, using only images. We demonstrate the generative ability of our model qualitatively on both a custom dataset and on ShapeNet. 
Finally, we evaluate the effectiveness of the learned 3D scene representation in supporting a 3D spatial reasoning.", "keywords": "Representation learning;generative model;adversarial learning;implicit 3D generation;scene generation", "primary_area": "", "supplementary_material": "", "author": "Sai Rajeswar;Fahim Mannan;Florian Golemo;David Vazquez;Derek Nowrouzezahrai;Aaron Courville", "authorids": "rajsai24@gmail.com;fmannan@gmail.com;florian.golemo@inria.fr;dvazquez@cvc.uab.es;dereknow@gmail.com;aaron.courville@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nrajeswar2019pixscene,\ntitle={Pix2Scene: Learning Implicit 3D Representations from Images},\nauthor={Sai Rajeswar and Fahim Mannan and Florian Golemo and David Vazquez and Derek Nowrouzezahrai and Aaron Courville},\nyear={2019},\nurl={https://openreview.net/forum?id=BJeem3C9F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJeem3C9F7", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;1;4", "wc_review": "271;227;630", "wc_reply_reviewers": "100;0;0", "wc_reply_authors": "1188;682;1017", "reply_reviewers": "1;0;0", "reply_authors": "3;1;2", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "wc_review_avg": [ 376.0, 180.5011541976025 ], "wc_reply_reviewers_avg": [ 33.333333333333336, 47.14045207910317 ], "wc_reply_authors_avg": [ 962.3333333333334, 210.1591989157003 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.49999999999999994, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=837692863319403572&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "A Max-Affine Spline Perspective of Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/830", "id": "BJej72AqF7", "author_site": "Richard Baraniuk, Jack Wang, Randall Balestriero", "tldr": "We provide new insights and interpretations of RNNs from a max-affine spline operators perspective.", "abstract": "We develop a framework for understanding and improving recurrent neural networks (RNNs) using max-affine spline operators (MASOs). We prove that RNNs using piecewise affine and convex nonlinearities can be written as a simple piecewise affine spline operator. The resulting representation provides several new perspectives for analyzing RNNs, three of which we study in this paper. First, we show that an RNN internally partitions the input space during training and that it builds up the partition through time. Second, we show that the affine slope parameter of an RNN corresponds to an input-specific template, from which we can interpret an RNN as performing a simple template matching (matched filtering) given the input. Third, by carefully examining the MASO RNN affine mapping, we prove that using a random initial hidden state corresponds to an explicit L2 regularization of the affine parameters, which can mollify exploding gradients and improve generalization. Extensive experiments on several datasets of various modalities demonstrate and validate each of the above conclusions. 
In particular, using a random initial hidden states elevates simple RNNs to near state-of-the-art performers on these datasets. ", "keywords": "RNN;max-affine spline operators", "primary_area": "", "supplementary_material": "", "author": "Zichao Wang;Randall Balestriero;Richard Baraniuk", "authorids": "richb@rice.edu;zw16@rice.edu;randallbalestriero@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nwang2018a,\ntitle={A {MAX}-{AFFINE} {SPLINE} {PERSPECTIVE} {OF} {RECURRENT} {NEURAL} {NETWORKS}},\nauthor={Zichao Wang and Randall Balestriero and Richard Baraniuk},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJej72AqF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;3", "wc_review": "445;153;141", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "626;435;307", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 246.33333333333334, 140.5639435353968 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 456.0, 131.07504211964485 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8468021192773155169&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BJej72AqF7", "pdf": "https://openreview.net/pdf?id=BJej72AqF7", "email": ";;", "author_num": 3 }, { "title": "Learning to Navigate the Web", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/862", "id": "BJemQ209FQ", "author_site": "Izzeddin Gur, Ulrich Rueckert, Aleksandra Faust, Dilek Hakkani-Tur", "tldr": "We train reinforcement learning policies using reward augmentation, curriculum learning, and meta-learning to successfully navigate web pages.", "abstract": "Learning in environments with large state and action spaces, and sparse rewards, can hinder a Reinforcement Learning (RL) agent\u2019s learning through trial-and-error. For instance, following natural language instructions on the Web (such as booking a flight ticket) leads to RL settings where input vocabulary and number of actionable elements on a page can grow very large. Even though recent approaches improve the success rate on relatively simple environments with the help of human demonstrations to guide the exploration, they still fail in environments where the set of possible instructions can reach millions. We approach the aforementioned problems from a different perspective and propose guided RL approaches that can generate unbounded amount of experience for an agent to learn from. Instead of learning from a complicated instruction with a large vocabulary, we decompose it into multiple sub-instructions and schedule a curriculum in which an agent is tasked with a gradually increasing subset of these relatively easier sub-instructions. In addition, when the expert demonstrations are not available, we propose a novel meta-learning framework that generates new instruction following tasks and trains the agent more effectively. 
We train DQN, deep reinforcement learning agent, with Q-value function approximated with a novel QWeb neural network architecture on these smaller, synthetic instructions. We evaluate the ability of our agent to generalize to new instructions onWorld of Bits benchmark, on forms with up to 100 elements, supporting 14 million possible instructions. The QWeb agent outperforms the baseline without using any human demonstration achieving 100% success rate on several difficult environments.", "keywords": "navigating web pages;reinforcement learning;q learning;curriculum learning;meta training", "primary_area": "", "supplementary_material": "", "author": "Izzeddin Gur;Ulrich Rueckert;Aleksandra Faust;Dilek Hakkani-Tur", "authorids": "izzeddingur@gmail.com;rueckert@google.com;sandrafaust@google.com;dilek@ieee.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ngur2018learning,\ntitle={Learning to Navigate the Web},\nauthor={Izzeddin Gur and Ulrich Rueckert and Aleksandra Faust and Dilek Hakkani-Tur},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJemQ209FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;3;3", "wc_review": "235;176;671", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "536;572;904", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 360.6666666666667, 220.75677918368794 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 670.6666666666666, 165.64486778111242 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7234609565333107792&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BJemQ209FQ", "pdf": "https://openreview.net/pdf?id=BJemQ209FQ", "email": ";;;", "author_num": 4 }, { "id": "BJepX2A9tX", "title": "Rotation Equivariant Networks via Conic Convolution and the DFT", "track": "main", "status": "Withdraw", "tldr": "We propose conic convolution and the 2D-DFT to encode rotation equivariance into an neural network.", "abstract": "Performance of neural networks can be significantly improved by encoding known invariance for particular tasks. Many image classification tasks, such as those related to cellular imaging, exhibit invariance to rotation. In particular, to aid convolutional neural networks in learning rotation invariance, we consider a simple, efficient conic convolutional scheme that encodes rotational equivariance, along with a method for integrating the magnitude response of the 2D-discrete-Fourier transform (2D-DFT) to encode global rotational invariance. We call our new method the Conic Convolution and DFT Network (CFNet). We evaluated the efficacy of CFNet as compared to a standard CNN and group-equivariant CNN (G-CNN) for several different image classification tasks and demonstrated improved performance, including classification accuracy, computational efficiency, and its robustness to hyperparameter selection. 
Taken together, we believe CFNet represents a new scheme that has the potential to improve many imaging analysis applications.", "keywords": "deep learning;rotation equivariance;bioimaging analysis", "primary_area": "", "supplementary_material": "", "author": "Benjamin Chidester;Minh N. Do;Jian Ma", "authorids": "bchidest@andrew.cmu.edu;minhdo@illinois.edu;jianma@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJepX2A9tX", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;3;2", "wc_review": "691;237;236", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 388.0, 214.25374364679527 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:J8kfHhZdcGUJ:scholar.google.com/&scioq=Rotation+Equivariant+Networks+via+Conic+Convolution+and+the+DFT&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BJesDsA9t7", "title": "Better Accuracy with Quantified Privacy: Representations Learned via Reconstructive Adversarial Network", "track": "main", "status": "Reject", "tldr": "", "abstract": "The remarkable success of machine learning, especially deep learning, has produced a variety of cloud-based services for mobile users. Such services require an end user to send data to the service provider, which presents a serious challenge to end-user privacy. To address this concern, prior works either add noise to the data or send features extracted from the raw data. They struggle to balance between the utility and privacy because added noise reduces utility and raw data can be reconstructed from extracted features.\n\nThis work represents a methodical departure from prior works: we balance between a measure of privacy and another of utility by leveraging adversarial learning to find a sweeter tradeoff. We design an encoder that optimizes against the reconstruction error (a measure of privacy), adversarially by a Decoder, and the inference accuracy (a measure of utility) by a Classifier. The result is RAN, a novel deep model with a new training algorithm that automatically extracts features for classification that are both private and useful. \n\nIt turns out that adversarially forcing the extracted features to only conveys the intended information required by classification leads to an implicit regularization leading to better classification accuracy than the original model which completely ignores privacy. Thus, we achieve better privacy with better utility, a surprising possibility in machine learning! 
We conducted extensive experiments on five popular datasets over four training schemes, and demonstrate the superiority of RAN compared with existing alternatives.", "keywords": "end-user privacy;utility;feature learning;adversarial training", "primary_area": "", "supplementary_material": "", "author": "Sicong Liu;Anshumali Shrivastava;Junzhao Du;Lin Zhong", "authorids": "scliu007@gmail.com;anshumali@rice.edu;dujz@xidian.edu.cn;lzhong@rice.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nliu2019better,\ntitle={Better Accuracy with Quantified Privacy: Representations Learned via Reconstructive Adversarial Network},\nauthor={Sicong Liu and Anshumali Shrivastava and Junzhao Du and Lin Zhong},\nyear={2019},\nurl={https://openreview.net/forum?id=BJesDsA9t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJesDsA9t7", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "wc_review": "331;500;266", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "666;755;298", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 365.6666666666667, 98.62499119842236 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 573.0, 197.81978330456906 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2633071763160364470&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJf9k305Fm", "title": "Visualizing and Discovering Behavioural Weaknesses in Deep Reinforcement Learning", "track": "main", "status": "Withdraw", "tldr": "We present a method to synthesize states of interest for reinforcement learning agents in order to analyze their behavior. ", "abstract": "As deep reinforcement learning is being applied to more and more tasks, there is a growing need to better understand and probe the learned agents. Visualizing and understanding the decision making process can be very valuable to comprehend and identify problems in the learned behavior. However, this topic has been relatively under-explored in the reinforcement learning community. In this work we present a method for synthesizing states of interest for a trained agent. Such states could be situations (e.g. crashing or damaging a car) in which specific actions are necessary. Further, critical states in which a very high or a very low reward can be achieved (e.g. risky states) are often interesting to understand the situational awareness of the system. To this end, we learn a generative model over the state space of the environment and use its latent space to optimize a target function for the state of interest. In our experiments we show that this method can generate insightful visualizations for a variety of environments and reinforcement learning methods. We explore these issues in the standard Atari benchmark games as well as in an autonomous driving simulator. 
Based on the efficiency with which we have been able to identify significant decision scenarios with this technique, we believe this general approach could serve as an important tool for AI safety applications.", "keywords": "Visualization;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Christian Rupprecht;Cyril Ibrahim;Chris Pal", "authorids": "christian.rupprecht@in.tum.de;cyril.ibrahim@elementai.com;christopher.pal@polymtl.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJf9k305Fm", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;4", "wc_review": "588;352;655", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 531.6666666666666, 129.9546929595935 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2446145440558948882&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Training for Faster Adversarial Robustness Verification via Inducing ReLU Stability", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1028", "id": "BJfIVjAcKm", "author_site": "Kai Xiao, Vincent Tjeng, Nur Muhammad Shafiullah, Aleksander Madry", "tldr": "We develop methods to train deep neural models that are both robust to adversarial perturbations and whose robustness is significantly easier to verify.", "abstract": "We explore the concept of co-design in the context of neural network verification. Specifically, we aim to train deep neural networks that not only are robust to adversarial perturbations but also whose robustness can be verified more easily. To this end, we identify two properties of network models - weight sparsity and so-called ReLU stability - that turn out to significantly impact the complexity of the corresponding verification task. We demonstrate that improving weight sparsity alone already enables us to turn computationally intractable verification problems into tractable ones. Then, improving ReLU stability leads to an additional 4-13x speedup in verification times. An important feature of our methodology is its \"universality,\" in the sense that it can be used with a broad range of training procedures and verification approaches.\n", "keywords": "verification;adversarial robustness;adversarial examples;stability;deep learning;regularization", "primary_area": "", "supplementary_material": "", "author": "Kai Y. 
Xiao;Vincent Tjeng;Nur Muhammad (Mahi) Shafiullah;Aleksander Madry", "authorids": "kaix@mit.edu;vtjeng@mit.edu;nshafiul@mit.edu;madry@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nxiao2018training,\ntitle={Training for Faster Adversarial Robustness Verification via Inducing Re{LU} Stability},\nauthor={Kai Y. Xiao and Vincent Tjeng and Nur Muhammad (Mahi) Shafiullah and Aleksander Madry},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJfIVjAcKm},\n}", "github": "[![github](/images/github_icon.svg) MadryLab/relu_stable](https://github.com/MadryLab/relu_stable)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;7;8", "confidence": "3;3;2", "wc_review": "852;261;433", "wc_reply_reviewers": "310;0;0", "wc_reply_authors": "3178;317;393", "reply_reviewers": "1;0;0", "reply_authors": "7;1;1", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 515.3333333333334, 248.19929268410272 ], "wc_reply_reviewers_avg": [ 103.33333333333333, 146.13540144521983 ], "wc_reply_authors_avg": [ 1296.0, 1331.136607064304 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 2.8284271247461903 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 234, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11696009804149879522&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BJfIVjAcKm", "pdf": "https://openreview.net/pdf?id=BJfIVjAcKm", "email": ";;;", "author_num": 4 }, { "title": "Learning to Learn with Conditional Class Dependencies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1038", "id": "BJfOXnActQ", "author_site": "Xiang Jiang, Seyed Mohammad Havaei, Farshid Varno, Gabriel Chartrand, Nicolas Chapados, Stan Matwin", "tldr": "CAML is an instance of MAML with conditional class dependencies.", "abstract": "Neural networks can learn to extract statistical properties from data, but they seldom make use of structured information from the label space to help representation learning. Although some label structure can implicitly be obtained when training on huge amounts of data, in a few-shot learning context where little data is available, making explicit use of the label structure can inform the model to reshape the representation space to reflect a global sense of class dependencies. We propose a meta-learning framework, Conditional class-Aware Meta-Learning (CAML), that conditionally transforms feature representations based on a metric space that is trained to capture inter-class dependencies. This enables a conditional modulation of the feature representations of the base-learner to impose regularities informed by the label space. 
Experiments show that the conditional transformation in CAML leads to more disentangled representations and achieves competitive results on the miniImageNet benchmark.", "keywords": "meta-learning;learning to learn;few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Xiang Jiang;Mohammad Havaei;Farshid Varno;Gabriel Chartrand;Nicolas Chapados;Stan Matwin", "authorids": "xiang.jiang@dal.ca;mohammad@imagia.com;f.varno@dal.ca;gabriel@imagia.com;nic@imagia.com;stan@cs.dal.ca", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\njiang2018learning,\ntitle={Learning to Learn with Conditional Class Dependencies},\nauthor={Xiang Jiang and Mohammad Havaei and Farshid Varno and Gabriel Chartrand and Nicolas Chapados and Stan Matwin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJfOXnActQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;6;8", "confidence": "5;3;3", "wc_review": "453;549;256", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "605;311;72", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 1.632993161855452 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 419.3333333333333, 121.96265366450866 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 329.3333333333333, 217.98216033020276 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4241227563688250638&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BJfOXnActQ", "pdf": "https://openreview.net/pdf?id=BJfOXnActQ", "email": ";;;;;", "author_num": 6 }, { "id": "BJfRpoA9YX", "title": "Adversarial Information Factorization", "track": "main", "status": "Reject", "tldr": "Learn representations for images that factor out a single attribute.", "abstract": "We propose a novel generative model architecture designed to learn representations for images that factor out a single attribute from the rest of the representation. A single object may have many attributes which when altered do not change the identity of the object itself. Consider the human face; the identity of a particular person is independent of whether or not they happen to be wearing glasses. The attribute of wearing glasses can be changed without changing the identity of the person. However, the ability to manipulate and alter image attributes without altering the object identity is not a trivial task. Here, we are interested in learning a representation of the image that separates the identity of an object (such as a human face) from an attribute (such as 'wearing glasses'). We demonstrate the success of our factorization approach by using the learned representation to synthesize the same face with and without a chosen attribute. We refer to this specific synthesis process as image attribute manipulation. 
We further demonstrate that our model achieves competitive scores, with state of the art, on a facial attribute classification task.", "keywords": "disentangled representations;factored representations;generative adversarial networks;variational auto encoders;generative models", "primary_area": "", "supplementary_material": "", "author": "Antonia Creswell;Yumnah Mohamied;Biswa Sengupta;Anil Bharath", "authorids": "ac2211@ic.ac.uk;ym1008@ic.ac.uk;biswasengupta@gmail.com;aab01@ic.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ncreswell2019adversarial,\ntitle={Adversarial Information Factorization},\nauthor={Antonia Creswell and Yumnah Mohamied and Biswa Sengupta and Anil Bharath},\nyear={2019},\nurl={https://openreview.net/forum?id=BJfRpoA9YX},\n}", "github": "[![github](/images/github_icon.svg) ToniCreswell/attribute-cVAEGAN](https://github.com/ToniCreswell/attribute-cVAEGAN)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJfRpoA9YX", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "wc_review": "249;359;800", "wc_reply_reviewers": "117;219;599", "wc_reply_authors": "1667;1849;3489", "reply_reviewers": "1;1;4", "reply_authors": "5;6;9", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 469.3333333333333, 238.09008565853574 ], "wc_reply_reviewers_avg": [ 311.6666666666667, 207.39870352107368 ], "wc_reply_authors_avg": [ 2335.0, 819.3769991076554 ], "reply_reviewers_avg": [ 2.0, 1.4142135623730951 ], "reply_authors_avg": [ 6.666666666666667, 1.699673171197595 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10756122077129776369&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Hierarchical Visuomotor Control of Humanoids", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/685", "id": "BJfYvo09Y7", "author_site": "Josh Merel, Arun Ahuja, Vu Pham, Saran Tunyasuvunakool, SIQI LIU, Dhruva Tirumala, Nicolas Heess, Greg Wayne", "tldr": "Solve tasks involving vision-guided humanoid locomotion, reusing locomotion behavior from motion capture data.", "abstract": "We aim to build complex humanoid agents that integrate perception, motor control, and memory. In this work, we partly factor this problem into low-level motor control from proprioception and high-level coordination of the low-level skills informed by vision. We develop an architecture capable of surprisingly flexible, task-directed motor control of a relatively high-DoF humanoid body by combining pre-training of low-level motor controllers with a high-level, task-focused controller that switches among low-level sub-policies. The resulting system is able to control a physically-simulated humanoid body to solve tasks that require coupling visual perception from an unstabilized egocentric RGB camera during locomotion in the environment. 
Supplementary video link: https://youtu.be/fBoir7PNxPk", "keywords": "hierarchical reinforcement learning;motor control;motion capture", "primary_area": "", "supplementary_material": "", "author": "Josh Merel;Arun Ahuja;Vu Pham;Saran Tunyasuvunakool;Siqi Liu;Dhruva Tirumala;Nicolas Heess;Greg Wayne", "authorids": "jsmerel@google.com;arahuja@google.com;vuph@google.com;stunya@google.com;liusiqi@google.com;dhruvat@google.com;heess@google.com;gregwayne@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nmerel2018hierarchical,\ntitle={Hierarchical Visuomotor Control of Humanoids},\nauthor={Josh Merel and Arun Ahuja and Vu Pham and Saran Tunyasuvunakool and Siqi Liu and Dhruva Tirumala and Nicolas Heess and Greg Wayne},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJfYvo09Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;8", "confidence": "3;4;3", "wc_review": "668;399;291", "wc_reply_reviewers": "164;0;0", "wc_reply_authors": "1042;354;589", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 452.6666666666667, 158.51883869818823 ], "wc_reply_reviewers_avg": [ 54.666666666666664, 77.3103414097292 ], "wc_reply_authors_avg": [ 661.6666666666666, 285.5361428766749 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16014664282742845985&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJfYvo09Y7", "pdf": "https://openreview.net/pdf?id=BJfYvo09Y7", "email": ";;;;;;;", "author_num": 8 }, { "id": "BJf_YjCqYX", "title": "Identifying Bias in AI using Simulation", "track": "main", "status": "Reject", "tldr": "We present a framework that leverages high-fidelity computer simulations to interrogate and diagnose biases within ML classifiers. ", "abstract": "Machine learned models exhibit bias, often because the datasets used to train them are biased. This presents a serious problem for the deployment of such technology, as the resulting models might perform poorly on populations that are minorities within the training set and ultimately present higher risks to them. We propose to use high-fidelity computer simulations to interrogate and diagnose biases within ML classifiers. We present a framework that leverages Bayesian parameter search to efficiently characterize the high dimensional feature space and more quickly identify weakness in performance. 
We apply our approach to an example domain, face detection, and show that it can be used to help identify demographic biases in commercial face application programming interfaces (APIs).", "keywords": "Bias;Simulation;Optimization;Face Detection", "primary_area": "", "supplementary_material": "", "author": "Daniel McDuff;Roger Cheng;Ashish Kapoor", "authorids": "damcduff@microsoft.com;rocheng@microsoft.com;akapoor@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmcduff2019identifying,\ntitle={Identifying Bias in {AI} using Simulation},\nauthor={Daniel McDuff and Roger Cheng and Ashish Kapoor},\nyear={2019},\nurl={https://openreview.net/forum?id=BJf_YjCqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJf_YjCqYX", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;2;5", "wc_review": "174;221;473", "wc_reply_reviewers": "0;0;334", "wc_reply_authors": "285;187;631", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 289.3333333333333, 131.28171574476352 ], "wc_reply_reviewers_avg": [ 111.33333333333333, 157.4491099442046 ], "wc_reply_authors_avg": [ 367.6666666666667, 190.4544273281377 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.3273268353539886, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9776904038633688478&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "BJfguoAcFm", "title": "Learning Kolmogorov Models for Binary Random Variables", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a framework for learning a Kolmogorov model, for a collection of binary random variables. More specifically, we derive conditions that link (in the sense of implications in mathematical logic) outcomes of specific random variables and extract valuable relations from the data. We also propose an efficient algorithm for computing the model and show its first-order optimality, despite the combinatorial nature of the learning problem. We exemplify our general framework to recommendation systems and gene expression data. We believe that the work is a significant step toward interpretable machine learning. ", "keywords": "Kolmogorov model;interpretable models;causal relations mining;non-convex optimization;relaxations", "primary_area": "", "supplementary_material": "", "author": "Hadi Ghauch;Hossein S. Ghadikolaei;Mikael Skoglund;Carlo Fischione", "authorids": "ghauch@kth.se;hshokri@kth.se;skoglund@kth.se;carlofi@kth.se", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nghauch2019learning,\ntitle={Learning Kolmogorov Models for Binary Random Variables},\nauthor={Hadi Ghauch and Hossein S. 
Ghadikolaei and Mikael Skoglund and Carlo Fischione},\nyear={2019},\nurl={https://openreview.net/forum?id=BJfguoAcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJfguoAcFm", "pdf_size": 0, "rating": "5;5;8", "confidence": "2;4;4", "wc_review": "380;156;243", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "882;609;1009", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 259.6666666666667, 92.20388037146569 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 833.3333333333334, 166.88585586828168 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.5000000000000001, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7566112562389722124&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "BJfvAoC9YQ", "title": "Feature Transformers: A Unified Representation Learning Framework for Lifelong Learning", "track": "main", "status": "Reject", "tldr": "Single generic mathematical framework for lifelong learning paradigms with data privacy", "abstract": "Despite the recent advances in representation learning, lifelong learning continues\nto be one of the most challenging and unconquered problems. Catastrophic forgetting\nand data privacy constitute two of the important challenges for a successful\nlifelong learner. Further, existing techniques are designed to handle only specific\nmanifestations of lifelong learning, whereas a practical lifelong learner is expected\nto switch and adapt seamlessly to different scenarios. In this paper, we present a\nsingle, unified mathematical framework for handling the myriad variants of lifelong\nlearning, while alleviating these two challenges. We utilize an external memory\nto store only the features representing past data and learn richer and newer\nrepresentations incrementally through transformation neural networks - feature\ntransformers. We define, simulate and demonstrate exemplary performance on a\nrealistic lifelong experimental setting using the MNIST rotations dataset, paving\nthe way for practical lifelong learners. 
To illustrate the applicability of our method\nin data sensitive domains like healthcare, we study the pneumothorax classification\nproblem from X-ray images, achieving near gold standard performance.\nWe also benchmark our approach with a number of state-of-the art methods on\nMNIST rotations and iCIFAR100 datasets demonstrating superior performance.", "keywords": "continual learning;deep learning;lifelong learning;new task learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Hariharan Ravishankar;Rahul Venkataramani;Saihareesh Anamandra;Prasad Sudhakar", "authorids": "hariharan.ravishankar@ge.com;rahul.venkataramani@ge.com;saihareesh.anamandra@ge.com;prasad.sudhakar@ge.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nravishankar2019feature,\ntitle={Feature Transformers: A Unified Representation Learning Framework for Lifelong Learning},\nauthor={Hariharan Ravishankar and Rahul Venkataramani and Saihareesh Anamandra and Prasad Sudhakar},\nyear={2019},\nurl={https://openreview.net/forum?id=BJfvAoC9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJfvAoC9YQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;3", "wc_review": "826;93;218", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "376;180;159", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 379.0, 320.16974664491124 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 238.33333333333334, 97.72182742640231 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eT7vwztO26IJ:scholar.google.com/&scioq=Feature+Transformers:+A+Unified+Representation+Learning+Framework+for+Lifelong+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "BJfvknCqFQ", "title": "A Rotation and a Translation Suffice: Fooling CNNs with Simple Transformations", "track": "main", "status": "Reject", "tldr": "We show that CNNs are not robust to simple rotations and translation and explore methods of improving this.", "abstract": "We show that simple spatial transformations, namely translations and rotations alone, suffice to fool neural networks on a significant fraction of their inputs in multiple image classification tasks. Our results are in sharp contrast to previous work in adversarial robustness that relied on more complicated optimization ap- proaches unlikely to appear outside a truly adversarial context. Moreover, the misclassifying rotations and translations are easy to find and require only a few black-box queries to the target model. 
Overall, our findings emphasize the need to design robust classifiers even for natural input transformations in benign settings.\n", "keywords": "robustness;spatial transformations;invariance;rotations;data augmentation;robust optimization", "primary_area": "", "supplementary_material": "", "author": "Logan Engstrom;Brandon Tran;Dimitris Tsipras;Ludwig Schmidt;Aleksander Madry", "authorids": "engstrom@mit.edu;btran115@mit.edu;tsipras@mit.edu;ludwigs@mit.edu;madry@mit.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nengstrom2019a,\ntitle={A Rotation and a Translation Suffice: Fooling {CNN}s with Simple Transformations},\nauthor={Logan Engstrom and Brandon Tran and Dimitris Tsipras and Ludwig Schmidt and Aleksander Madry},\nyear={2019},\nurl={https://openreview.net/forum?id=BJfvknCqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJfvknCqFQ", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;2;3", "wc_review": "670;295;371", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "920;485;151", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 445.3333333333333, 161.8648270076678 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 518.6666666666666, 314.8442295626345 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.3273268353539886, "gs_citation": 420, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14753130787342863705&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Unsupervised Adversarial Image Reconstruction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/906", "id": "BJg4Z3RqF7", "author_site": "Arthur Pajot, Emmanuel de B\u00e9zenac, patrick Gallinari", "tldr": "", "abstract": "We address the problem of recovering an underlying signal from lossy, inaccurate observations in an unsupervised setting. Typically, we consider situations where there is little to no background knowledge on the structure of the underlying signal, no access to signal-measurement pairs, nor even unpaired signal-measurement data. The only available information is provided by the observations and the measurement process statistics. We cast the problem as finding the \\textit{maximum a posteriori} estimate of the signal given each measurement, and propose a general framework for the reconstruction problem. We use a formulation of generative adversarial networks, where the generator takes as input a corrupted observation in order to produce realistic reconstructions, and add a penalty term tying the reconstruction to the associated observation. We evaluate our reconstructions on several image datasets with different types of corruptions. 
The proposed approach yields better results than alternative baselines, and comparable performance with model variants trained with additional supervision.", "keywords": "Deep Learning;Adversarial;MAP;GAN;neural networks", "primary_area": "", "supplementary_material": "", "author": "Arthur Pajot;Emmanuel de Bezenac;Patrick Gallinari", "authorids": "arthur.pajot@lip6.fr;emmanuel.de-bezenac@lip6.fr;patrick.gallinari@lip6.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\npajot2018unsupervised,\ntitle={Unsupervised Adversarial Image Reconstruction},\nauthor={Arthur Pajot and Emmanuel de Bezenac and Patrick Gallinari},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJg4Z3RqF7},\n}", "github": "[![github](/images/github_icon.svg) UNIR-Anonymous/UNIR](https://github.com/UNIR-Anonymous/UNIR)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;6;8", "confidence": "3;3;4", "wc_review": "257;69;360", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "691;346;679", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 1.632993161855452 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 228.66666666666666, 120.47775267197214 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 572.0, 159.88120589988054 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=552778780795437052&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BJg4Z3RqF7", "pdf": "https://openreview.net/pdf?id=BJg4Z3RqF7", "email": ";;", "author_num": 3 }, { "title": "Max-MIG: an Information Theoretic Approach for Joint Learning from Crowds", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/665", "id": "BJg9DoR9t7", "author_site": "Peng Cao, Yilun Xu, Yuqing Kong, Yizhou Wang", "tldr": "", "abstract": "Eliciting labels from crowds is a potential way to obtain large labeled data. Despite a variety of methods developed for learning from crowds, a key challenge remains unsolved: \\emph{learning from crowds without knowing the information structure among the crowds a priori, when some people of the crowds make highly correlated mistakes and some of them label effortlessly (e.g. randomly)}. We propose an information theoretic approach, Max-MIG, for joint learning from crowds, with a common assumption: the crowdsourced labels and the data are independent conditioning on the ground truth. Max-MIG simultaneously aggregates the crowdsourced labels and learns an accurate data classifier. Furthermore, we devise an accurate data-crowds forecaster that employs both the data and the crowdsourced labels to forecast the ground truth. To the best of our knowledge, this is the first algorithm that solves the aforementioned challenge of learning from crowds. In addition to the theoretical validation, we also empirically show that our algorithm achieves the new state-of-the-art results in most settings, including the real-world data, and is the first algorithm that is robust to various information structures. 
Codes are available at https://github.com/Newbeeer/Max-MIG .\n", "keywords": "crowdsourcing;information theory", "primary_area": "", "supplementary_material": "", "author": "Peng Cao*;Yilun Xu*;Yuqing Kong;Yizhou Wang", "authorids": "caopeng2016@pku.edu.cn;xuyilun@pku.edu.cn;yuqing.kong@pku.edu.cn;yizhou.wang@pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ncao2018maxmig,\ntitle={Max-{MIG}: an Information Theoretic Approach for Joint Learning from Crowds},\nauthor={Peng Cao and Yilun Xu and Yuqing Kong and Yizhou Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJg9DoR9t7},\n}", "github": "[![github](/images/github_icon.svg) Newbeeer/Max-MIG](https://github.com/Newbeeer/Max-MIG)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "wc_review": "1535;374;535", "wc_reply_reviewers": "0;0;209", "wc_reply_authors": "782;518;1270", "reply_reviewers": "0;0;1", "reply_authors": "2;1;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 814.6666666666666, 513.5759167077659 ], "wc_reply_reviewers_avg": [ 69.66666666666667, 98.52354484532562 ], "wc_reply_authors_avg": [ 856.6666666666666, 311.5095860839955 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14993809510724823282&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJg9DoR9t7", "pdf": "https://openreview.net/pdf?id=BJg9DoR9t7", "email": ";;;", "author_num": 4 }, { "id": "BJgAfh09tm", "title": "Bilingual-GAN: Neural Text Generation and Neural Machine Translation as Two Sides of the Same Coin", "track": "main", "status": "Withdraw", "tldr": "We present a novel method for Bilingual Text Generation producing parallel concurrent sentences in two languages.", "abstract": "Latent space based GAN methods and attention based encoder-decoder architectures have achieved impressive results in text generation and Unsupervised NMT respectively. Leveraging the two domains, we propose an adversarial latent space based architecture capable of generating parallel sentences in two languages concurrently and translating bidirectionally. The bilingual generation goal is achieved by sampling from the latent space that is adversarially constrained to be shared between both languages. First an NMT model is trained, with back-translation and an adversarial setup, to enforce a latent state between the two languages. The encoder and decoder are shared for the two translation directions. Next, a GAN is trained to generate \u2018synthetic\u2019 code mimicking the languages\u2019 shared latent space. This code is then fed into the decoder to generate text in either language. 
We perform our experiments on Europarl and Multi30k datasets, on the English-French language pair, and document our performance using both Supervised and Unsupervised NMT.", "keywords": "Text Generation;Machine Translation;Deep Learning;GAN", "primary_area": "", "supplementary_material": "", "author": "Ahmad Rashid;Alan Do-Omri;Mehdi Rezagholizadeh;Md. Akmal Haidar;Hamed Sadeghi", "authorids": "ahmadrash@gmail.com;alan.do-omri@mail.mcgill.ca;mehdi.rezagholizadeh@gmail.com;md.akmal.haidar@huawei.com;haamed.sadeghi@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJgAfh09tm", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;5;3", "wc_review": "396;491;337", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 408.0, 63.44026061316793 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:P6wNVLkOTX8J:scholar.google.com/&scioq=Bilingual-GAN:+Neural+Text+Generation+and+Neural+Machine+Translation+as+Two+Sides+of+the+Same+Coin&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "BJgEjiRqYX", "title": "A Case for Object Compositionality in Deep Generative Models of Images", "track": "main", "status": "Reject", "tldr": "We propose to structure the generator of a GAN to consider objects and their relations explicitly, and generate images by means of composition", "abstract": "Deep generative models seek to recover the process with which the observed data was generated. They may be used to synthesize new samples or to subsequently extract representations. Successful approaches in the domain of images are driven by several core inductive biases. However, a bias to account for the compositional way in which humans structure a visual scene in terms of objects has frequently been overlooked. In this work we propose to structure the generator of a GAN to consider objects and their relations explicitly, and generate images by means of composition. This provides a way to efficiently learn a more accurate generative model of real-world images, and serves as an initial step towards learning corresponding object representations. We evaluate our approach on several multi-object image datasets, and find that the generator learns to identify and disentangle information corresponding to different objects at a representational level. 
A human study reveals that the resulting generative model is better at generating images that are more faithful to the reference distribution.", "keywords": "Objects;Compositionality;Generative Models;GAN;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Sjoerd van Steenkiste;Karol Kurach;Sylvain Gelly", "authorids": "sjoerd@idsia.ch;kkurach@gmail.com;sylvain.gelly@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsteenkiste2019a,\ntitle={A Case for Object Compositionality in Deep Generative Models of Images},\nauthor={Sjoerd van Steenkiste and Karol Kurach and Sylvain Gelly},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgEjiRqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJgEjiRqYX", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;5;4", "wc_review": "425;789;231", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1749;1120;662", "reply_reviewers": "0;0;0", "reply_authors": "3;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 481.6666666666667, 231.29970937196518 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1177.0, 445.5924894639346 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8007127882847041486&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "BJgFcj0qKX", "title": "Stacked U-Nets: A No-Frills Approach to Natural Image Segmentation", "track": "main", "status": "Withdraw", "tldr": "Presents new architecture which leverages information globalization power of u-nets in a deeper networks and performs well across tasks without any bells and whistles.", "abstract": "Many imaging tasks require global information about all pixels in an image. Conventional bottom-up classification networks globalize information by decreasing resolution; features are pooled and down-sampled into a single output. But for semantic segmentation and object detection tasks, a network must provide higher-resolution pixel-level outputs. To globalize information while preserving resolution, many researchers propose the inclusion of sophisticated auxiliary blocks, but these come at the cost of a considerable increase in network size and computational cost. This paper proposes stacked u-nets (SUNets), which iteratively combine features from different resolution scales while maintaining resolution. SUNets leverage the information globalization power of u-nets in a deeper net- work architectures that is capable of handling the complexity of natural images. 
SUNets perform extremely well on semantic segmentation tasks using a small number of parameters.", "keywords": "semantic segmentation;stacked u-nets;classification", "primary_area": "", "supplementary_material": "", "author": "Sohil Shah;Pallabi Ghosh;Larry S Davis;Tom Goldstein", "authorids": "sohilas@umd.edu;tomg@cs.umd.edu;pallabig@umd.edu;lsd@umiacs.umd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJgFcj0qKX", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;5;5", "wc_review": "99;145;195", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;284;0", "reply_reviewers": "0;0;0", "reply_authors": "0;1;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 146.33333333333334, 39.20317447463775 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 94.66666666666667, 133.878883904653 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18042424259237851664&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJgGhiR5KX", "title": "Learning Cross-Lingual Sentence Representations via a Multi-task Dual-Encoder Model", "track": "main", "status": "Reject", "tldr": "State-of-the-art zero-shot learning performance by using a translation task to bridge multi-task training across languages.", "abstract": "A significant roadblock in multilingual neural language modeling is the lack of labeled non-English data. One potential method for overcoming this issue is learning cross-lingual text representations that can be used to transfer the performance from training on English tasks to non-English tasks, despite little to no task-specific non-English data. In this paper, we explore a natural setup for learning crosslingual sentence representations: the dual-encoder. 
We provide a comprehensive evaluation of our cross-lingual representations on a number of monolingual, crosslingual, and zero-shot/few-shot learning tasks, and also give an analysis of different learned cross-lingual embedding spaces.", "keywords": "sentence;embeddings;zero-shot;multilingual;multi-task;cross-lingual", "primary_area": "", "supplementary_material": "", "author": "Muthuraman Chidambaram;Yinfei Yang;Daniel Cer;Steve Yuan;Yun-Hsuan Sung;Brian Strope;Ray Kurzweil", "authorids": "mc4xf@virginia.edu;yinfeiy@google.com;cer@google.com;steveyuan@google.com;yhsung@google.com;bps@google.com;raykurzweil@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nchidambaram2019learning,\ntitle={Learning Cross-Lingual Sentence Representations via a Multi-task Dual-Encoder Model},\nauthor={Muthuraman Chidambaram and Yinfei Yang and Daniel Cer and Steve Yuan and Yun-Hsuan Sung and Brian Strope and Ray Kurzweil},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgGhiR5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJgGhiR5KX", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;5;4", "wc_review": "450;345;724", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "721;445;590", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 506.3333333333333, 159.77136442081508 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 585.3333333333334, 112.72483705416873 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.18898223650461357, "gs_citation": 182, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4506216963294452513&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "title": "AutoLoss: Learning Discrete Schedule for Alternate Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1063", "id": "BJgK6iA5KX", "author_site": "Haowen Xu, Hao Zhang, Zhiting Hu, Xiaodan Liang, Ruslan Salakhutdinov, Eric Xing", "tldr": "We propose a unified formulation for iterative alternate optimization and develop AutoLoss, a framework to automatically learn and generate optimization schedules.", "abstract": "Many machine learning problems involve iteratively and alternately optimizing different task objectives with respect to different sets of parameters. Appropriately scheduling the optimization of a task objective or a set of parameters is usually crucial to the quality of convergence. In this paper, we present AutoLoss, a meta-learning framework that automatically learns and determines the optimization schedule. AutoLoss provides a generic way to represent and learn the discrete optimization schedule from metadata, allows for a dynamic and data-driven schedule in ML problems that involve alternating updates of different parameters or from different loss objectives.\n\nWe apply AutoLoss on four ML tasks: d-ary quadratic regression, classification using a multi-layer perceptron (MLP), image generation using GANs, and multi-task neural machine translation (NMT). 
We show that the AutoLoss controller is able to capture the distribution of better optimization schedules that result in higher quality of convergence on all four tasks. The trained AutoLoss controller is generalizable -- it can guide and improve the learning of a new task model with different specifications, or on different datasets.", "keywords": "Meta Learning;AutoML;Optimization Schedule", "primary_area": "", "supplementary_material": "", "author": "Haowen Xu;Hao Zhang;Zhiting Hu;Xiaodan Liang;Ruslan Salakhutdinov;Eric Xing", "authorids": "haowen.will.xu@gmail.com;hao@cs.cmu.edu;zhitingh@cs.cmu.edu;xiaodan1@cs.cmu.edu;rsalakhu@cs.cmu.edu;eric.xing@petuum.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nxu2018autoloss,\ntitle={AutoLoss: Learning Discrete Schedule for Alternate Optimization},\nauthor={Haowen Xu and Hao Zhang and Zhiting Hu and Xiaodan Liang and Ruslan Salakhutdinov and Eric Xing},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgK6iA5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;4", "wc_review": "222;170;1286", "wc_reply_reviewers": "124;0;51", "wc_reply_authors": "799;557;1098", "reply_reviewers": "1;0;1", "reply_authors": "2;1;3", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 559.3333333333334, 514.2692766332526 ], "wc_reply_reviewers_avg": [ 58.333333333333336, 50.88767587103536 ], "wc_reply_authors_avg": [ 818.0, 221.27057343141374 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15351780571596212720&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJgK6iA5KX", "pdf": "https://openreview.net/pdf?id=BJgK6iA5KX", "email": ";;;;;", "author_num": 6 }, { "title": "Learning what and where to attend", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1007", "id": "BJgLg3R9KQ", "author_site": "Drew Linsley, Dan Shiebler, Sven Eberhardt, Thomas Serre", "tldr": "A large-scale dataset for training attention models for object recognition leads to more accurate, interpretable, and human-like object recognition.", "abstract": "Most recent gains in visual recognition have originated from the inclusion of attention mechanisms in deep convolutional networks (DCNs). Because these networks are optimized for object recognition, they learn where to attend using only a weak form of supervision derived from image class labels. Here, we demonstrate the benefit of using stronger supervisory signals by teaching DCNs to attend to image regions that humans deem important for object recognition. We first describe a large-scale online experiment (ClickMe) used to supplement ImageNet with nearly half a million human-derived \"top-down\" attention maps. 
Using human psychophysics, we confirm that the identified top-down features from ClickMe are more diagnostic than \"bottom-up\" saliency features for rapid image categorization. As a proof of concept, we extend a state-of-the-art attention network and demonstrate that adding ClickMe supervision significantly improves its accuracy and yields visual features that are more interpretable and more similar to those used by human observers.", "keywords": "Attention models;human feature importance;object recognition;cognitive science", "primary_area": "", "supplementary_material": "", "author": "Drew Linsley;Dan Shiebler;Sven Eberhardt;Thomas Serre", "authorids": "drewlinsley@gmail.com;danshiebler@gmail.com;sven2sven2sven2@gmail.com;thomas_serre@brown.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlinsley2018learning,\ntitle={Learning what and where to attend with humans in the loop},\nauthor={Drew Linsley and Dan Shiebler and Sven Eberhardt and Thomas Serre},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgLg3R9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;3;3", "wc_review": "392;242;509", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "408;324;536", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 381.0, 109.27945827098522 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 422.6666666666667, 87.16778201963281 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 176, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6480644992642234324&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BJgLg3R9KQ", "pdf": "https://openreview.net/pdf?id=BJgLg3R9KQ", "email": ";;;", "author_num": 4 }, { "id": "BJgQB20qFQ", "title": "Learning to Progressively Plan", "track": "main", "status": "Reject", "tldr": "", "abstract": "For problem solving, making reactive decisions based on problem description is fast but inaccurate, while search-based planning using heuristics gives better solutions but could be exponentially slow. In this paper, we propose a new approach that improves an existing solution by iteratively picking and rewriting its local components until convergence. The rewriting policy employs a neural network trained with reinforcement learning. We evaluate our approach in two domains: job scheduling and expression simplification. 
Compared to common effective heuristics, baseline deep models and search algorithms, our approach efficiently gives solutions with higher quality.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyun Chen;Yuandong Tian", "authorids": "xinyun.chen@berkeley.edu;yuandong@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nchen2019learning,\ntitle={Learning to Progressively Plan},\nauthor={Xinyun Chen and Yuandong Tian},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgQB20qFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=BJgQB20qFQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "3;3;3", "wc_review": "125;582;394", "wc_reply_reviewers": "0;1;0", "wc_reply_authors": "241;76;787", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 367.0, 187.54377266832046 ], "wc_reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "wc_reply_authors_avg": [ 368.0, 303.8387730359639 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5083355510288481687&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "ROBUST ESTIMATION VIA GENERATIVE ADVERSARIAL NETWORKS", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1133", "id": "BJgRDjR9tQ", "author_site": "Chao Gao, Jiyi Liu, Yuan Yao, Weizhi ZHU", "tldr": "GANs are shown to provide us a new effective robust mean estimate against agnostic contaminations with both statistical optimality and practical tractability.", "abstract": "Robust estimation under Huber's $\\epsilon$-contamination model has become an important topic in statistics and theoretical computer science. Rate-optimal procedures such as Tukey's median and other estimators based on statistical depth functions are impractical because of their computational intractability. In this paper, we establish an intriguing connection between f-GANs and various depth functions through the lens of f-Learning. Similar to the derivation of f-GAN, we show that these depth functions that lead to rate-optimal robust estimators can all be viewed as variational lower bounds of the total variation distance in the framework of f-Learning. This connection opens the door of computing robust estimators using tools developed for training GANs. In particular, we show that a JS-GAN that uses a neural network discriminator with at least one hidden layer is able to achieve the minimax rate of robust mean estimation under Huber's $\\epsilon$-contamination model. 
Interestingly, the hidden layers of the neural net structure in the discriminator class are shown to be necessary for robust estimation.", "keywords": "robust statistics;neural networks;minimax rate;data depth;contamination model;Tukey median;GAN", "primary_area": "", "supplementary_material": "", "author": "Chao GAO;jiyi LIU;Yuan YAO;Weizhi ZHU", "authorids": "chaogao@galton.uchicago.edu;jiyi.liu@yale.edu;yuany@ust.hk;wzhuai@connect.ust.hk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ngao2018robust,\ntitle={{ROBUST} {ESTIMATION} {VIA} {GENERATIVE} {ADVERSARIAL} {NETWORKS}},\nauthor={Chao GAO and jiyi LIU and Yuan YAO and Weizhi ZHU},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgRDjR9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;7;7", "confidence": "5;5;4", "wc_review": "779;361;267", "wc_reply_reviewers": "349;0;11", "wc_reply_authors": "1877;212;125", "reply_reviewers": "1;0;1", "reply_authors": "3;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 469.0, 222.53688832790544 ], "wc_reply_reviewers_avg": [ 120.0, 161.9897116074557 ], "wc_reply_authors_avg": [ 738.0, 806.1773998320717 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15221755775674009849&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "openreview": "https://openreview.net/forum?id=BJgRDjR9tQ", "pdf": "https://openreview.net/pdf?id=BJgRDjR9tQ", "email": ";;;", "author_num": 4 }, { "id": "BJgTZ3C5FX", "title": "Generative model based on minimizing exact empirical Wasserstein distance", "track": "main", "status": "Reject", "tldr": "We have proposed a flexible generative model that learns stably by directly minimizing exact empirical Wasserstein distance.", "abstract": "Generative Adversarial Networks (GANs) are a very powerful framework for generative modeling. However, they are often hard to train, and learning of GANs often becomes unstable. Wasserstein GAN (WGAN) is a promising framework to deal with the instability problem as it has a good convergence property. One drawback of the WGAN is that it evaluates the Wasserstein distance in the dual domain, which requires some approximation, so that it may fail to optimize the true Wasserstein distance. In this paper, we propose evaluating the exact empirical optimal transport cost efficiently in the primal domain and performing gradient descent with respect to its derivative to train the generator network. Experiments on the MNIST dataset show that our method is significantly stable to converge, and achieves the lowest Wasserstein distance among the WGAN variants at the cost of some sharpness of generated images. Experiments on the 8-Gaussian toy dataset show that better gradients for the generator are obtained in our method. 
In addition, the proposed method enables more flexible generative modeling than WGAN.", "keywords": "Generative modeling;Generative Adversarial Networks (GANs);Wasserstein GAN;Optimal transport", "primary_area": "", "supplementary_material": "", "author": "Akihiro Iohara;Takahito Ogawa;Toshiyuki Tanaka", "authorids": "iohara@sys.i.kyoto-u.ac.jp;takahito.ogawa@datagrid.co.jp;tt@i.kyoto-u.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\niohara2019generative,\ntitle={Generative model based on minimizing exact empirical Wasserstein distance},\nauthor={Akihiro Iohara and Takahito Ogawa and Toshiyuki Tanaka},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgTZ3C5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJgTZ3C5FX", "pdf_size": 0, "rating": "2;3;5", "confidence": "5;4;2", "wc_review": "174;249;191", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 204.66666666666666, 32.10745846199741 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18180330604559919632&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BJgWl3A5YX", "title": "The Body is not a Given: Joint Agent Policy Learning and Morphology Evolution", "track": "main", "status": "Withdraw", "tldr": "Evolving the shape of the body in RL controlled agents improves their performance (and help learning)", "abstract": "Reinforcement learning (RL) has proven to be a powerful paradigm for deriving complex behaviors from simple reward signals in a wide range of environments. When applying RL to continuous control agents in simulated physics environments, the body is usually considered to be part of the environment. However, during evolution the physical body of biological organisms and their controlling brains are co-evolved, thus exploring a much larger space of actuator/controller configurations. Put differently, the intelligence does not reside only in the agent's mind, but also in the design of their body. \nWe propose a method for uncovering strong agents, consisting of a good combination of a body and policy, based on combining RL with an evolutionary procedure. Given the resulting agent, we also propose an approach for identifying the body changes that contributed the most to the agent performance. We use the Shapley value from cooperative game theory to find the fair contribution of individual components, taking into account synergies between components. \nWe evaluate our methods in an environment similar to the the recently proposed Robo-Sumo task, where agents in a 3D environment with simulated physics compete in tipping over their opponent or pushing them out of the arena. Our results show that the proposed methods are indeed capable of generating strong agents, significantly outperforming baselines that focus on optimizing the agent policy alone. 
\n\nA video is available at: www.youtube.com/watch?v=eei6Rgom3YY", "keywords": "Reinforcement Learning;Continuous Control;Evolutionary Computation;Genetic Algorithms;Evolving Morphology;Baldwin Effect;Population Based Training", "primary_area": "", "supplementary_material": "", "author": "Dylan Banarse;Yoram Bachrach;Siqi Liu;Chrisantha Fernando;Nicolas Heess;Pushmeet Kohli;Guy Lever;Thore Graepel", "authorids": "dylski@google.com;yorambac@google.com;guylever@google.com;heess@google.com;pushmeet@google.com;liusiqi@google.com;chrisantha@google.com;thore@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4;AnonReviewer5", "site": "https://openreview.net/forum?id=BJgWl3A5YX", "pdf_size": 0, "rating": "3;4;4;4", "confidence": "4;3;4;4", "wc_review": "531;441;601;93", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;803;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;1;0;0", "rating_avg": [ 3.75, 0.4330127018922193 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "wc_review_avg": [ 416.5, 195.19413413317523 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 200.75, 347.70919961945214 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.25, 0.4330127018922193 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.3333333333333333, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18118944999606562643&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "BJgYl205tQ", "title": "Quality Evaluation of GANs Using Cross Local Intrinsic Dimensionality", "track": "main", "status": "Reject", "tldr": "We propose a new metric for evaluating GAN models.", "abstract": "Generative Adversarial Networks (GANs) are an elegant mechanism for data generation. However, a key challenge when using GANs is how to best measure their ability to generate realistic data. In this paper, we demonstrate that an intrinsic dimensional characterization of the data space learned by a GAN model leads to an effective evaluation metric for GAN quality. In particular, we propose a new evaluation measure, CrossLID, that assesses the local intrinsic dimensionality (LID) of input data with respect to neighborhoods within GAN-generated samples. In experiments on 3 benchmark image datasets, we compare our proposed measure to several state-of-the-art evaluation metrics. Our experiments show that CrossLID is strongly correlated with sample quality, is sensitive to mode collapse, is robust to small-scale noise and image transformations, and can be applied in a model-free manner. 
Furthermore, we show how CrossLID can be used within the GAN training process to improve generation quality.", "keywords": "Generative Adversarial Networks;Evaluation Metric;Local Intrinsic Dimensionality", "primary_area": "", "supplementary_material": "", "author": "Sukarna Barua;Xingjun Ma;Sarah Monazam Erfani;Michael Houle;James Bailey", "authorids": "sukarnab@student.unimelb.edu.au;xingjun.ma@unimelb.edu.au;sarah.erfani@unimelb.edu.au;meh@nii.ac.jp;baileyj@unimelb.edu.au", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nbarua2019quality,\ntitle={Quality Evaluation of {GAN}s Using Cross Local Intrinsic Dimensionality},\nauthor={Sukarna Barua and Xingjun Ma and Sarah Monazam Erfani and Michael Houle and James Bailey},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgYl205tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJgYl205tQ", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;5;4", "wc_review": "991;1183;123", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1768;1986;102", "reply_reviewers": "0;0;0", "reply_authors": "3;3;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 765.6666666666666, 461.1444700115379 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1285.3333333333333, 841.4627475744577 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17942765581786297921&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BJg_fnRqF7", "title": "Deep clustering based on a mixture of autoencoders", "track": "main", "status": "Withdraw", "tldr": "We propose a deep clustering method where instead of a centroid each cluster is represented by an autoencoder", "abstract": "In this paper we propose a Deep Autoencoder Mixture Clustering (DAMIC) algorithm. It is based on a mixture of deep autoencoders where each cluster is represented by an autoencoder. A clustering network transforms the data into another space and then selects one of the clusters. Next, the autoencoder associated with this cluster is used to reconstruct the data-point. The clustering algorithm jointly learns the nonlinear data representation and the set of autoencoders. The optimal clustering is found by minimizing the reconstruction loss of the mixture of autoencoder network. Unlike other deep clustering algorithms, no regularization term is needed to avoid data collapsing to a single point. Our experimental evaluations on image and text corpora show significant improvement over state-of-the-art methods.", "keywords": "deep clustering;mixture of experts;mixture of autoencoders", "primary_area": "", "supplementary_material": "", "author": "Shlomo E. Chazan;Sharon Gannot;Jacob Goldberger",
"authorids": "shlomi.chazan@biu.ac.il;sharon.gannot@biu.ac.il;jacob.goldberger@biu.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJg_fnRqF7", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;5;3", "wc_review": "603;233;243", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 359.6666666666667, 172.11107524567447 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17621901775666816509&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "title": "INVASE: Instance-wise Variable Selection using Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1022", "id": "BJg_roAcK7", "author_site": "Jinsung Yoon, James Jordon, Mihaela Schaar", "tldr": "", "abstract": "The advent of big data brings with it data with more and more dimensions and thus a growing need to be able to efficiently select which features to use for a variety of problems. While global feature selection has been a well-studied problem for quite some time, only recently has the paradigm of instance-wise feature selection been developed. In this paper, we propose a new instance-wise feature selection method, which we term INVASE. INVASE consists of 3 neural networks, a selector network, a predictor network and a baseline network which are used to train the selector network using the actor-critic methodology. Using this methodology, INVASE is capable of flexibly discovering feature subsets of a different size for each instance, which is a key limitation of existing state-of-the-art methods.
We demonstrate through a mixture of synthetic and real data experiments that INVASE significantly outperforms state-of-the-art benchmarks.", "keywords": "Instance-wise feature selection;interpretability;actor-critic methodology", "primary_area": "", "supplementary_material": "", "author": "Jinsung Yoon;James Jordon;Mihaela van der Schaar", "authorids": "jsyoon0823@gmail.com;james.jordon@wolfson.ox.ac.uk;mihaela.vanderschaar@eng.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyoon2018invase,\ntitle={{INVASE}: Instance-wise Variable Selection using Neural Networks},\nauthor={Jinsung Yoon and James Jordon and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJg_roAcK7},\n}", "github": "[![github](/images/github_icon.svg) vanderschaarlab/mlforhealthlabpub](https://github.com/vanderschaarlab/mlforhealthlabpub/tree/main/alg/invase)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;3", "wc_review": "275;531;237", "wc_reply_reviewers": "0;412;45", "wc_reply_authors": "199;992;469", "reply_reviewers": "0;1;1", "reply_authors": "1;2;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 347.6666666666667, 130.5611819118616 ], "wc_reply_reviewers_avg": [ 152.33333333333334, 184.52882942480528 ], "wc_reply_authors_avg": [ 553.3333333333334, 329.1872145485335 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 218, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3318146487055213130&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=BJg_roAcK7", "pdf": "https://openreview.net/pdf?id=BJg_roAcK7", "email": ";;", "author_num": 3 }, { "id": "BJgbzhC5Ym", "title": "NECST: Neural Joint Source-Channel Coding", "track": "main", "status": "Reject", "tldr": "jointly learn compression + error correcting codes with deep learning", "abstract": "For reliable transmission across a noisy communication channel, classical results from information theory show that it is asymptotically optimal to separate out the source and channel coding processes. However, this decomposition can fall short in the finite bit-length regime, as it requires non-trivial tuning of hand-crafted codes and assumes infinite computational power for decoding. In this work, we propose Neural Error Correcting and Source Trimming (NECST) codes to jointly learn the encoding and decoding processes in an end-to-end fashion. By adding noise into the latent codes to simulate the channel during training, we learn to both compress and error-correct given a fixed bit-length and computational budget. We obtain codes that are not only competitive against several capacity-approaching channel codes, but also learn useful robust representations of the data for downstream tasks such as classification. Finally, we learn an extremely fast neural decoder, yielding almost an order of magnitude in speedup compared to standard decoding methods based on iterative belief propagation. 
", "keywords": "joint source-channel coding;deep generative models;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Kristy Choi;Kedar Tatwawadi;Tsachy Weissman;Stefano Ermon", "authorids": "kechoi@cs.stanford.edu;kedart@stanford.edu;tsachy@stanford.edu;ermon@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchoi2019necst,\ntitle={{NECST}: Neural Joint Source-Channel Coding},\nauthor={Kristy Choi and Kedar Tatwawadi and Tsachy Weissman and Stefano Ermon},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgbzhC5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJgbzhC5Ym", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;5;4", "wc_review": "654;451;482", "wc_reply_reviewers": "0;162;36", "wc_reply_authors": "763;936;389", "reply_reviewers": "0;2;1", "reply_authors": "1;2;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 529.0, 89.28979038314888 ], "wc_reply_reviewers_avg": [ 66.0, 69.45502141674135 ], "wc_reply_authors_avg": [ 696.0, 228.28198936111158 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.6546536707079772, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11267677220783131500&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Meta-Learning with Latent Embedding Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/839", "id": "BJgklhAcK7", "author_site": "Andrei A. Rusu, Dushyant Rao, Jakub Sygnowski, Oriol Vinyals, Razvan Pascanu, Simon Osindero, Raia Hadsell", "tldr": "Latent Embedding Optimization (LEO) is a novel gradient-based meta-learner with state-of-the-art performance on the challenging 5-way 1-shot and 5-shot miniImageNet and tieredImageNet classification tasks.", "abstract": "Gradient-based meta-learning techniques are both widely applicable and proficient at solving challenging few-shot learning and fast adaptation problems. However, they have practical difficulties when operating on high-dimensional parameter spaces in extreme low-data regimes. We show that it is possible to bypass these limitations by learning a data-dependent latent generative representation of model parameters, and performing gradient-based meta-learning in this low-dimensional latent space. The resulting approach, latent embedding optimization (LEO), decouples the gradient-based adaptation procedure from the underlying high-dimensional space of model parameters. Our evaluation shows that LEO can achieve state-of-the-art performance on the competitive miniImageNet and tieredImageNet few-shot classification tasks. Further analysis indicates LEO is able to capture uncertainty in the data, and can perform adaptation more effectively by optimizing in latent space.", "keywords": "meta-learning;few-shot;miniImageNet;tieredImageNet;hypernetworks;generative;latent embedding;optimization", "primary_area": "", "supplementary_material": "", "author": "Andrei A. 
Rusu;Dushyant Rao;Jakub Sygnowski;Oriol Vinyals;Razvan Pascanu;Simon Osindero;Raia Hadsell", "authorids": "andreirusu@google.com;dushyantr@google.com;sygi@google.com;vinyals@google.com;razp@google.com;osindero@google.com;raia@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nrusu2018metalearning,\ntitle={Meta-Learning with Latent Embedding Optimization},\nauthor={Andrei A. Rusu and Dushyant Rao and Jakub Sygnowski and Oriol Vinyals and Razvan Pascanu and Simon Osindero and Raia Hadsell},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgklhAcK7},\n}", "github": "[![github](/images/github_icon.svg) deepmind/leo](https://github.com/deepmind/leo) + [![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=BJgklhAcK7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;8", "confidence": "3;5;5", "wc_review": "172;232;401", "wc_reply_reviewers": "0;17;0", "wc_reply_authors": "760;877;447", "reply_reviewers": "0;1;0", "reply_authors": "3;3;1", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 268.3333333333333, 96.95474316516042 ], "wc_reply_reviewers_avg": [ 5.666666666666667, 8.013876853447538 ], "wc_reply_authors_avg": [ 694.6666666666666, 181.5237970319288 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 1771, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11552536411545683614&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BJgklhAcK7", "pdf": "https://openreview.net/pdf?id=BJgklhAcK7", "email": ";;;;;;", "author_num": 7 }, { "id": "BJgnmhA5KQ", "title": "Diverse Machine Translation with a Single Multinomial Latent Variable", "track": "main", "status": "Reject", "tldr": "", "abstract": "There are many ways to translate a sentence into another language. Explicit modeling of such uncertainty may enable better model fitting to the data and it may enable users to express a preference for how to translate a piece of content. Latent variable models are a natural way to represent uncertainty. Prior work investigated the use of multivariate continuous and discrete latent variables, but their interpretation and use for generating a diverse set of hypotheses have been elusive. In this work, we drastically simplify the model, using just a single multinomial latent variable. The resulting mixture of experts model can be trained efficiently via hard-EM and can generate a diverse set of hypothesis by parallel greedy decoding. 
We perform extensive experiments on three WMT benchmark datasets that have multiple human references, and we show that our model provides a better trade-off between quality and diversity of generations compared to all baseline methods.\\footnote{Code to reproduce this work is available at: anonymized URL.}", "keywords": "machine translation;latent variable models;diverse decoding", "primary_area": "", "supplementary_material": "", "author": "Tianxiao Shen;Myle Ott;Michael Auli;Marc\u2019Aurelio Ranzato", "authorids": "tianxiao@mit.edu;myleott@fb.com;michaelauli@fb.com;ranzato@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshen2019diverse,\ntitle={Diverse Machine Translation with a Single Multinomial Latent Variable},\nauthor={Tianxiao Shen and Myle Ott and Michael Auli and Marc\u2019Aurelio Ranzato},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgnmhA5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJgnmhA5KQ", "pdf_size": 0, "rating": "3;5;6;7", "confidence": "4;4;4;4", "wc_review": "800;283;700;357", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1034;692;957;381", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;2;1", "rating_avg": [ 5.25, 1.479019945774904 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 535.0, 219.45272839497804 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 766.0, 255.9326083171115 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XjLo5HCkGw4J:scholar.google.com/&scioq=Diverse+Machine+Translation+with+a+Single+Multinomial+Latent+Variable&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BJgolhR9Km", "title": "Neural Networks with Structural Resistance to Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "We introduce a type of neural network that is structurally resistant to adversarial attacks, even when trained on unaugmented training sets. The resistance is due to the stability of network units wrt input perturbations.", "abstract": "In adversarial attacks to machine-learning classifiers, small perturbations are added to input that is correctly classified. The perturbations yield adversarial examples, which are virtually indistinguishable from the unperturbed input, and yet are misclassified. In standard neural networks used for deep learning, attackers can craft adversarial examples from most input to cause a misclassification of their choice. \n\nWe introduce a new type of network units, called RBFI units, whose non-linear structure makes them inherently resistant to adversarial attacks. On permutation-invariant MNIST, in absence of adversarial attacks, networks using RBFI units match the performance of networks using sigmoid units, and are slightly below the accuracy of networks with ReLU units. 
When subjected to adversarial attacks based on projected gradient descent or fast gradient-sign methods, networks with RBFI units retain accuracies above 75%, while ReLU or Sigmoid see their accuracies reduced to below 1%.\nFurther, RBFI networks trained on regular input either exceed or closely match the accuracy of sigmoid and ReLU network trained with the help of adversarial examples.\n\nThe non-linear structure of RBFI units makes them difficult to train using standard gradient descent. We show that RBFI networks of RBFI units can be efficiently trained to high accuracies using pseudogradients, computed using functions especially crafted to facilitate learning instead of their true derivatives.\n", "keywords": "machine learning;adversarial attacks", "primary_area": "", "supplementary_material": "", "author": "Luca de Alfaro", "authorids": "luca@ucsc.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nalfaro2019neural,\ntitle={Neural Networks with Structural Resistance to Adversarial Attacks},\nauthor={Luca de Alfaro},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgolhR9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJgolhR9Km", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;3;4", "wc_review": "178;613;222", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 337.6666666666667, 195.51697851820668 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12894504478700583317&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Non-vacuous Generalization Bounds at the ImageNet Scale: a PAC-Bayesian Compression Approach", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/806", "id": "BJgqqsAct7", "author_site": "Wenda Zhou, Victor Veitch, Morgane Austern, Ryan P Adams, Peter Orbanz", "tldr": "We obtain non-vacuous generalization bounds on ImageNet-scale deep neural networks by combining an original PAC-Bayes bound and an off-the-shelf neural network compression method.", "abstract": "Modern neural networks are highly overparameterized, with capacity to substantially overfit to training data. Nevertheless, these networks often generalize well in practice. It has also been observed that trained networks can often be ``compressed to much smaller representations. The purpose of this paper is to connect these two empirical observations. Our main technical result is a generalization bound for compressed networks based on the compressed size that, combined with off-the-shelf compression algorithms, leads to state-of-the-art generalization guarantees. In particular, we provide the first non-vacuous generalization guarantees for realistic architectures applied to the ImageNet classification problem. Additionally, we show that compressibility of models that tend to overfit is limited. 
Empirical results show that an increase in overfitting increases the number of bits required to describe a trained network.", "keywords": "generalization;deep-learning;pac-bayes", "primary_area": "", "supplementary_material": "", "author": "Wenda Zhou;Victor Veitch;Morgane Austern;Ryan P. Adams;Peter Orbanz", "authorids": "wz2335@columbia.edu;victorveitch@gmail.com;ma3293@columbia.edu;rpa@princeton.edu;porbanz@stat.columbia.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzhou2018nonvacuous,\ntitle={Non-vacuous Generalization Bounds at the ImageNet Scale: a {PAC}-Bayesian Compression Approach},\nauthor={Wenda Zhou and Victor Veitch and Morgane Austern and Ryan P. Adams and Peter Orbanz},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgqqsAct7},\n}", "github": "[![github](/images/github_icon.svg) wendazhou/nnet-compression-generalization](https://github.com/wendazhou/nnet-compression-generalization)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;8", "confidence": "5;4;4", "wc_review": "231;790;475", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "259;448;211", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 498.6666666666667, 228.82356133541455 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 306.0, 102.30347012687302 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 232, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12180551458196751211&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=BJgqqsAct7", "pdf": "https://openreview.net/pdf?id=BJgqqsAct7", "email": ";;;;", "author_num": 5 }, { "id": "BJgsN3R9Km", "title": "AntMan: Sparse Low-Rank Compression To Accelerate RNN Inference", "track": "main", "status": "Reject", "tldr": "Reducing computation and memory complexity of RNN models by up to 100x using sparse low-rank compression modules, trained via knowledge distillation.", "abstract": "Wide adoption of complex RNN based models is hindered by their inference performance, cost and memory requirements. To address this issue, we develop AntMan, combining structured sparsity with low-rank decomposition synergistically, to reduce model computation, size and execution time of RNNs while attaining desired accuracy. AntMan extends knowledge distillation based training to learn the compressed models efficiently. Our evaluation shows that AntMan offers up to 100x computation reduction with less than 1pt accuracy drop for language and machine reading comprehension models. Our evaluation also shows that for a given accuracy target, AntMan produces 5x smaller models than the state-of-art. 
Lastly, we show that AntMan offers super-linear speed gains compared to theoretical speedup, demonstrating its practical value on commodity hardware.", "keywords": "model compression;RNN;performance optimization;language model;machine reading comprehension;knowledge distillation;teacher-student", "primary_area": "", "supplementary_material": "", "author": "Samyam Rajbhandari;Harsh Shrivastava;Yuxiong He", "authorids": "samyamr@microsoft.com;hshrivastava3@gatech.edu;yuxhe@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nrajbhandari2019antman,\ntitle={AntMan: Sparse Low-Rank Compression To Accelerate {RNN} Inference},\nauthor={Samyam Rajbhandari and Harsh Shrivastava and Yuxiong He},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgsN3R9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJgsN3R9Km", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;2;5", "wc_review": "217;188;392", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1210;454;694", "reply_reviewers": "0;0;0", "reply_authors": "5;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 265.6666666666667, 90.11227564667436 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 786.0, 315.41718405946114 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7665140217294507530&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "BJgvg30ctX", "title": "Information Regularized Neural Networks", "track": "main", "status": "Reject", "tldr": "we propose a regularizer that improves the classification performance of neural networks", "abstract": "We formulate an information-based optimization problem for supervised classification. For invertible neural networks, the control of these information terms is passed down to the latent features and parameter matrix in the last fully connected layer, given that mutual information is invariant under invertible maps. We propose an objective function and prove that it solves the optimization problem. Our framework allows us to learn latent features in a more interpretable form while improving the classification performance. 
We perform extensive quantitative and qualitative experiments in comparison with the existing state-of-the-art classification models.", "keywords": "supervised classification;information theory;deep learning;regularization", "primary_area": "", "supplementary_material": "", "author": "Tianchen Zhao;Dejiao Zhang;Zeyu Sun;Honglak Lee", "authorids": "ericolon@umich.edu;dejiao@umich.edu;zeyusun@umich.edu;honglak@eecs.umich.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhao2019information,\ntitle={Information Regularized Neural Networks},\nauthor={Tianchen Zhao and Dejiao Zhang and Zeyu Sun and Honglak Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgvg30ctX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJgvg30ctX", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;3;4", "wc_review": "535;314;293", "wc_reply_reviewers": "0;0;68", "wc_reply_authors": "806;1377;657", "reply_reviewers": "0;0;1", "reply_authors": "1;3;2", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 380.6666666666667, 109.46638246004518 ], "wc_reply_reviewers_avg": [ 22.666666666666668, 32.05550741379015 ], "wc_reply_authors_avg": [ 946.6666666666666, 310.312029343942 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "BJgy-n0cK7", "title": "Inter-BMV: Interpolation with Block Motion Vectors for Fast Semantic Segmentation on Video", "track": "main", "status": "Reject", "tldr": "We exploit video compression techniques (in particular, the block motion vectors in H.264 video) and feature similarity across frames to accelerate a classical image recognition task, semantic segmentation, on video.", "abstract": "Models optimized for accuracy on single images are often prohibitively slow to\nrun on each frame in a video, especially on challenging dense prediction tasks,\nsuch as semantic segmentation. Recent work exploits the use of optical flow to\nwarp image features forward from select keyframes, as a means to conserve computation\non video. This approach, however, achieves only limited speedup, even\nwhen optimized, due to the accuracy degradation introduced by repeated forward\nwarping, and the inference cost of optical flow estimation. To address these problems,\nwe propose a new scheme that propagates features using the block motion\nvectors (BMV) present in compressed video (e.g. H.264 codecs), instead of optical\nflow, and bi-directionally warps and fuses features from enclosing keyframes\nto capture scene context on each video frame. Our technique, interpolation-BMV,\nenables us to accurately estimate the features of intermediate frames, while keeping\ninference costs low. We evaluate our system on the CamVid and Cityscapes\ndatasets, comparing to both a strong single-frame baseline and related work. We\nfind that we are able to substantially accelerate segmentation on video, achieving\nnear real-time frame rates (20+ frames per second) on large images (e.g. 960 x \u0002720\npixels), while maintaining competitive accuracy. 
This represents an improvement\nof almost 6x over the single-frame baseline and 2.5x over the fastest prior work.", "keywords": "semantic segmentation;video;efficient inference;video segmentation;video compression", "primary_area": "", "supplementary_material": "", "author": "Samvit Jain;Joseph Gonzalez", "authorids": "samvit@eecs.berkeley.edu;jegonzal@cs.berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\njain2019interbmv,\ntitle={Inter-{BMV}: Interpolation with Block Motion Vectors for Fast Semantic Segmentation on Video},\nauthor={Samvit Jain and Joseph Gonzalez},\nyear={2019},\nurl={https://openreview.net/forum?id=BJgy-n0cK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJgy-n0cK7", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;4;4", "wc_review": "265;732;138", "wc_reply_reviewers": "0;200;0", "wc_reply_authors": "696;519;613", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 378.3333333333333, 255.3981640932883 ], "wc_reply_reviewers_avg": [ 66.66666666666667, 94.28090415820634 ], "wc_reply_authors_avg": [ 609.3333333333334, 72.30644661592922 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15894515453789235290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "BJl4f2A5tQ", "title": "Surprising Negative Results for Generative Adversarial Tree Search", "track": "main", "status": "Reject", "tldr": "Surprising negative results on Model Based + Model deep RL", "abstract": "While many recent advances in deep reinforcement learning rely on model-free methods, model-based approaches remain an alluring prospect for their potential to exploit unsupervised data to learn environment dynamics. One prospect is to pursue hybrid approaches, as in AlphaGo, which combines Monte-Carlo Tree Search (MCTS)\u2014a model-based method\u2014with deep-Q networks (DQNs)\u2014a model-free method. MCTS requires generating rollouts, which is computationally expensive. In this paper, we propose to simulate roll-outs, exploiting the latest breakthroughs in image-to-image transduction, namely Pix2Pix GANs, to predict the dynamics of the environment. Our proposed algorithm, generative adversarial tree search (GATS), simulates rollouts up to a specified depth using both a GAN-based dynamics model and a reward predictor. GATS employs MCTS for planning over the simulated samples and uses DQN to estimate the Q-function at the leaf states. Our theoretical analysis establishes some favorable properties of GATS vis-a-vis the bias-variance trade-off and empirical results show that on 5 popular Atari games, the dynamics and reward predictors converge quickly to accurate solutions. However, GATS fails to outperform DQNs in 4 out of 5 games. Notably, in these experiments, MCTS has only short rollouts (up to tree depth 4), while previous successes of MCTS have involved tree depth in the hundreds. 
We present a hypothesis for why tree search with short rollouts can fail even given perfect modeling.", "keywords": "Deep Reinforcement Learning;Generative Adversarial Nets", "primary_area": "", "supplementary_material": "", "author": "Kamyar Azizzadenesheli;Brandon Yang;Weitang Liu;Emma Brunskill;Zachary Lipton;Animashree Anandkumar", "authorids": "kazizzad@uci.edu;bcyang@stanford.edu;wetliu@ucdavis.edu;ebrun@cs.stanford.edu;zlipton@cmu.edu;anima@caltech.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nazizzadenesheli2019surprising,\ntitle={Surprising Negative Results for Generative Adversarial Tree Search },\nauthor={Kamyar Azizzadenesheli and Brandon Yang and Weitang Liu and Emma Brunskill and Zachary Lipton and Animashree Anandkumar},\nyear={2019},\nurl={https://openreview.net/forum?id=BJl4f2A5tQ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=BJl4f2A5tQ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJl4f2A5tQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;2", "wc_review": "480;480;468", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "381;514;261", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 476.0, 5.656854249492381 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 385.3333333333333, 103.33225805892154 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3430296472789086274&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "BJl65sA9tm", "title": "Improving Generative Adversarial Imitation Learning with Non-expert Demonstrations", "track": "main", "status": "Reject", "tldr": "We improve GAIL by learning discriminators using multiclass classification with non-expert regarded as an extra class.", "abstract": "Imitation learning aims to learn an optimal policy from expert demonstrations and its recent combination with deep learning has shown impressive performance. However, collecting a large number of expert demonstrations for deep learning is time-consuming and requires much expert effort. In this paper, we propose a method to improve generative adversarial imitation learning by using additional information from non-expert demonstrations which are easier to obtain. The key idea of our method is to perform multiclass classification to learn discriminator functions where non-expert demonstrations are regarded as being drawn from an extra class. 
Experiments in continuous control tasks demonstrate that our method learns better policies than the generative adversarial imitation learning baseline when the number of expert demonstrations is small.", "keywords": "Imitation learning;Generative adversarial imitation learning", "primary_area": "", "supplementary_material": "", "author": "Voot Tangkaratt;Masashi Sugiyama", "authorids": "voot.tangkaratt@riken.jp;sugi@k.u-tokyo.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntangkaratt2019improving,\ntitle={Improving Generative Adversarial Imitation Learning with Non-expert Demonstrations},\nauthor={Voot Tangkaratt and Masashi Sugiyama},\nyear={2019},\nurl={https://openreview.net/forum?id=BJl65sA9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJl65sA9tm", "pdf_size": 0, "rating": "4;5;5;7", "confidence": "5;4;3;3", "wc_review": "730;350;427;287", "wc_reply_reviewers": "386;71;45;0", "wc_reply_authors": "789;436;490;395", "reply_reviewers": "1;1;1;0", "reply_authors": "1;1;1;1", "rating_avg": [ 5.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "wc_review_avg": [ 448.5, 169.9183627510576 ], "wc_reply_reviewers_avg": [ 125.5, 152.52950534240907 ], "wc_reply_authors_avg": [ 527.5, 154.69082067142833 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.7608859102526822, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3967422908057793112&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 0 }, { "title": "Learning to Represent Edits", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1070", "id": "BJl6AjC5F7", "author_site": "Pengcheng Yin, Graham Neubig, Miltiadis Allamanis, Marc Brockschmidt, Alexander Gaunt", "tldr": "", "abstract": "We introduce the problem of learning distributed representations of edits. By combining a\n\"neural editor\" with an \"edit encoder\", our models learn to represent the salient\ninformation of an edit and can be used to apply edits to new inputs.\nWe experiment on natural language and source code edit data. Our evaluation yields\npromising results that suggest that our neural network models learn to capture\nthe structure and semantics of edits. We hope that this interesting task and\ndata source will inspire other researchers to work further on this problem.", "keywords": "Representation Learning;Source Code;Natural Language;edit", "primary_area": "", "supplementary_material": "", "author": "Pengcheng Yin;Graham Neubig;Miltiadis Allamanis;Marc Brockschmidt;Alexander L. Gaunt", "authorids": "pcyin@cs.cmu.edu;gneubig@cs.cmu.edu;miallama@microsoft.com;mabrocks@microsoft.com;algaunt@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyin2018learning,\ntitle={Learning to Represent Edits},\nauthor={Pengcheng Yin and Graham Neubig and Miltiadis Allamanis and Marc Brockschmidt and Alexander L. 
Gaunt},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJl6AjC5F7},\n}", "github": "[![github](/images/github_icon.svg) Microsoft/msrc-dpu-learning-to-represent-edits](https://github.com/Microsoft/msrc-dpu-learning-to-represent-edits) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BJl6AjC5F7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;3", "wc_review": "380;1033;421", "wc_reply_reviewers": "73;0;0", "wc_reply_authors": "1080;934;558", "reply_reviewers": "1;0;0", "reply_authors": "3;3;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 611.3333333333334, 298.6328105810806 ], "wc_reply_reviewers_avg": [ 24.333333333333332, 34.41253001774532 ], "wc_reply_authors_avg": [ 857.3333333333334, 219.89290322538582 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15643648406405720624&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BJl6AjC5F7", "pdf": "https://openreview.net/pdf?id=BJl6AjC5F7", "email": ";;;;", "author_num": 5 }, { "title": "Neural Probabilistic Motor Primitives for Humanoid Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/684", "id": "BJl6TjRcY7", "author_site": "Josh Merel, Leonard Hasenclever, Alexandre Galashov, Arun Ahuja, Vu Pham, Greg Wayne, Yee Whye Teh, Nicolas Heess", "tldr": "Neural Probabilistic Motor Primitives compress motion capture tracking policies into one flexible model capable of one-shot imitation and reuse as a low-level controller.", "abstract": "We focus on the problem of learning a single motor module that can flexibly express a range of behaviors for the control of high-dimensional physically simulated humanoids. To do this, we propose a motor architecture that has the general structure of an inverse model with a latent-variable bottleneck. We show that it is possible to train this model entirely offline to compress thousands of expert policies and learn a motor primitive embedding space. The trained neural probabilistic motor primitive system can perform one-shot imitation of whole-body humanoid behaviors, robustly mimicking unseen trajectories. Additionally, we demonstrate that it is also straightforward to train controllers to reuse the learned motor primitive space to solve tasks, and the resulting movements are relatively naturalistic. To support the training of our model, we compare two approaches for offline policy cloning, including an experience efficient method which we call linear feedback policy cloning. 
We encourage readers to view a supplementary video (https://youtu.be/CaDEf-QcKwA ) summarizing our results.", "keywords": "Motor Primitives;Distillation;Reinforcement Learning;Continuous Control;Humanoid Control;Motion Capture;One-Shot Imitation", "primary_area": "", "supplementary_material": "", "author": "Josh Merel;Leonard Hasenclever;Alexandre Galashov;Arun Ahuja;Vu Pham;Greg Wayne;Yee Whye Teh;Nicolas Heess", "authorids": "jsmerel@google.com;leonardh@google.com;agalashov@google.com;arahuja@google.com;vuph@google.com;gregwayne@google.com;ywteh@google.com;heess@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nmerel2018neural,\ntitle={Neural Probabilistic Motor Primitives for Humanoid Control},\nauthor={Josh Merel and Leonard Hasenclever and Alexandre Galashov and Arun Ahuja and Vu Pham and Greg Wayne and Yee Whye Teh and Nicolas Heess},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJl6TjRcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;4", "wc_review": "213;616;287", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "335;706;505", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 372.0, 175.15897540995914 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 515.3333333333334, 151.63626068838403 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 177, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11172180957185308522&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJl6TjRcY7", "pdf": "https://openreview.net/pdf?id=BJl6TjRcY7", "email": ";;;;;;;", "author_num": 8 }, { "id": "BJlMcjC5K7", "title": "Neural Random Projections for Language Modelling", "track": "main", "status": "Reject", "tldr": "Neural language models can be trained with a compressed embedding space, by using sparse random projections, created incrementally for each unique discrete input.", "abstract": "Neural network-based language models deal with data sparsity problems by mapping the large discrete space of words into a smaller continuous space of real-valued vectors. By learning distributed vector representations for words, each training sample informs the neural network model about a combinatorial number of other patterns. In this paper, we exploit the sparsity in natural language even further by encoding each unique input word using a fixed sparse random representation. \nThese sparse codes are then projected onto a smaller embedding space which allows for the encoding of word occurrences from a possibly unknown vocabulary, along with the creation of more compact language models using a reduced number of parameters. We investigate the properties of our encoding mechanism empirically, by evaluating its performance on the widely used Penn Treebank corpus. 
We show that guaranteeing approximately equidistant vector representations for unique discrete inputs is enough to provide the neural network model with enough information to learn --and make use-- of \ndistributed representations for these inputs.", "keywords": "neural networks;language modelling;natural language processing;uncertainty;random projections", "primary_area": "", "supplementary_material": "", "author": "Davide Nunes;Luis Antunes", "authorids": "nunesd@campus.ul.pt;xarax@ciencias.ulisboa.pt", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnunes2019neural,\ntitle={Neural Random Projections for Language Modelling},\nauthor={Davide Nunes and Luis Antunes},\nyear={2019},\nurl={https://openreview.net/forum?id=BJlMcjC5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJlMcjC5K7", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;3", "wc_review": "419;362;411", "wc_reply_reviewers": "0;118;0", "wc_reply_authors": "258;581;388", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 397.3333333333333, 25.197001585285676 ], "wc_reply_reviewers_avg": [ 39.333333333333336, 55.62573345334173 ], "wc_reply_authors_avg": [ 409.0, 132.69765132309865 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9923592097516472934&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "BJlSHsAcK7", "title": "Overcoming catastrophic forgetting through weight consolidation and long-term memory", "track": "main", "status": "Reject", "tldr": "We enable sequential learning of multiple tasks by adding task-dependent memory units to avoid interference between tasks", "abstract": "Sequential learning of multiple tasks in artificial neural networks using gradient descent leads to catastrophic forgetting, whereby previously learned knowledge is erased during learning of new, disjoint knowledge. Here, we propose a new approach to sequential learning which leverages the recent discovery of adversarial examples. We use adversarial subspaces from previous tasks to enable learning of new tasks with less interference. We apply our method to sequentially learning to classify digits 0, 1, 2 (task 1), 4, 5, 6, (task 2), and 7, 8, 9 (task 3) in MNIST (disjoint MNIST task). We compare and combine our Adversarial Direction (AD) method with the recently proposed Elastic Weight Consolidation (EWC) method for sequential learning. We train each task for 20 epochs, which yields good initial performance (99.24% correct task 1 performance). After training task 2, and then task 3, both plain gradient descent (PGD) and EWC largely forget task 1 (task 1 accuracy 32.95% for PGD and 41.02% for EWC), while our combined approach (AD+EWC) still achieves 94.53% correct on task 1. We obtain similar results with a much more difficult disjoint CIFAR10 task (70.10% initial task 1 performance, 67.73% after learning tasks 2 and 3 for AD+EWC, while PGD and EWC both fall to chance level). 
We confirm qualitatively similar results for EMNIST with 5 tasks and under 3 variants of our approach. Our results suggest that AD+EWC can provide better sequential learning performance than either PGD or EWC.", "keywords": "Catastrophic Forgetting;Life-Long Learning;adversarial examples", "primary_area": "", "supplementary_material": "", "author": "Shixian Wen;Laurent Itti", "authorids": "shixianwen1993@gmail.com;itti@usc.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwen2019overcoming,\ntitle={Overcoming catastrophic forgetting through weight consolidation and long-term memory},\nauthor={Shixian Wen and Laurent Itti},\nyear={2019},\nurl={https://openreview.net/forum?id=BJlSHsAcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJlSHsAcK7", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;4;4", "wc_review": "407;436;669", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 504.0, 117.27176414920459 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6UAGAbEiiIYJ:scholar.google.com/&scioq=Overcoming+catastrophic+forgetting+through+weight+consolidation+and+long-term+memory&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "BJlSQnR5t7", "title": "Deepstr\u00f6m Networks", "track": "main", "status": "Withdraw", "tldr": "A new neural architecture where top dense layers of standard convolutional architectures are replaced with an approximation of a kernel function by relying on the Nystr\u00f6m approximation.", "abstract": "Recent work has focused on combining kernel methods and deep learning. With this in mind, we introduce Deepstr\u00f6m networks -- a new architecture of neural networks which we use to replace top dense layers of standard convolutional architectures with an approximation of a kernel function by relying on the Nystr\u00f6m approximation. \nOur approach is easy and highly flexible. It is compatible with any kernel function and it allows exploiting multiple kernels. \nWe show that Deepstr\u00f6m networks reach state-of-the-art performance on standard datasets like SVHN and CIFAR100. One benefit of the method lies in its limited number of learnable parameters, which makes it particularly suited for small training set sizes, e.g. from 5 to 20 samples per class. Finally, we illustrate two ways of using multiple kernels, including a multiple Deepstr\u00f6m setting, that exploits a kernel on each feature map output by the convolutional part of the model. 
", "keywords": "kernels;Nystr\u00f6m approximation;deep convnets", "primary_area": "", "supplementary_material": "", "author": "Luc Giffon;Hachem Kadri;St\u00e9phane Ayache;Thierry Arti\u00e8res", "authorids": "luc.giffon@lis-lab.fr;hachem.kadri@lis-lab.fr;stephane.ayache@lis-lab.fr;thierry.artieres@lis-lab.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJlSQnR5t7", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "wc_review": "460;124;774", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 452.6666666666667, 265.412048625445 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13966837704375602457&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BJlVhsA5KX", "title": "Sequenced-Replacement Sampling for Deep Learning", "track": "main", "status": "Reject", "tldr": "Proposed a novel way (without adding new parameters) of training deep neural network in order to improve generalization, especially for the case where we have relatively small images-per-class.", "abstract": "We propose sequenced-replacement sampling (SRS) for training deep neural networks. The basic idea is to assign a fixed sequence index to each sample in the dataset. Once a mini-batch is randomly drawn in each training iteration, we refill the original dataset by successively adding samples according to their sequence index. Thus we carry out replacement sampling but in a batched and sequenced way. In a sense, SRS could be viewed as a way of performing \"mini-batch augmentation\". It is particularly useful for a task where we have a relatively small images-per-class such as CIFAR-100. Together with a longer period of initial large learning rate, it significantly improves the classification accuracy in CIFAR-100 over the current state-of-the-art results. Our experiments indicate that training deeper networks with SRS is less prone to over-fitting. 
In the best case, we achieve an error rate as low as 10.10%.", "keywords": "deep neural networks;stochastic gradient descent;sequenced-replacement sampling", "primary_area": "", "supplementary_material": "", "author": "Chiu Man Ho;Dae Hoon Park;Wei Yang;Yi Chang", "authorids": "chiuman100@gmail.com;pdhvip@gmail.com;wei.yang2@huawei.com;yichang@acm.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nho2019sequencedreplacement,\ntitle={Sequenced-Replacement Sampling for Deep Learning},\nauthor={Chiu Man Ho and Dae Hoon Park and Wei Yang and Yi Chang},\nyear={2019},\nurl={https://openreview.net/forum?id=BJlVhsA5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJlVhsA5KX", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;5", "wc_review": "101;530;252", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 294.3333333333333, 177.67823602106014 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10540959144595613605&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJlXUsR5KQ", "title": "Learning Neuron Non-Linearities with Kernel-Based Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The effectiveness of deep neural architectures has been widely supported in terms of both experimental and foundational principles. There is also clear evidence that the activation function (e.g. the recti\ufb01er and the LSTM units) plays a crucial role in the complexity of learning. Based on this remark, this paper discusses an optimal selection of the neuron non-linearity in a functional framework that is inspired from classic regularization arguments. A representation theorem is given which indicates that the best activation function is a kernel expansion in the training set, that can be effectively approximated over an opportune set of points modeling 1-D clusters. The idea can be naturally extended to recurrent networks, where the expressiveness of kernel-based activation functions turns out to be a crucial ingredient to capture long-term dependencies. 
We give experimental evidence of this property by a set of challenging experiments, where we compare the results with neural architectures based on state of the art LSTM cells.", "keywords": "Activation functions;Kernel methods;Recurrent networks", "primary_area": "", "supplementary_material": "", "author": "Giuseppe Marra;Dario Zanca;Alessandro Betti;Marco Gori", "authorids": "g.marra@unifi.it;dario.zanca@unifi.it;alessandro.betti@unifi.it;marco.gori@unisi.it", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmarra2019learning,\ntitle={Learning Neuron Non-Linearities with Kernel-Based Deep Neural Networks},\nauthor={Giuseppe Marra and Dario Zanca and Alessandro Betti and Marco Gori},\nyear={2019},\nurl={https://openreview.net/forum?id=BJlXUsR5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJlXUsR5KQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;3;4", "wc_review": "336;336;435", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 369.0, 46.66904755831214 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4055614753546467269&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BJl_VnR9Km", "title": "A Model Cortical Network for Spatiotemporal Sequence Learning and Prediction", "track": "main", "status": "Reject", "tldr": "A new hierarchical cortical model for encoding spatiotemporal memory and video prediction", "abstract": "In this paper we developed a hierarchical network model, called Hierarchical Prediction Network (HPNet) to understand how spatiotemporal memories might be learned and encoded in a representational hierarchy for predicting future video frames. The model is inspired by the feedforward, feedback and lateral recurrent circuits in the mammalian hierarchical visual system. It assumes that spatiotemporal memories are encoded in the recurrent connections within each level and between different levels of the hierarchy. The model contains a feed-forward path that computes and encodes spatiotemporal features of successive complexity and a feedback path that projects interpretation from a higher level to the level below. Within each level, the feed-forward path and the feedback path intersect in a recurrent gated circuit that integrates their signals as well as the circuit's internal memory states to generate a prediction of the incoming signals. The network learns by comparing the incoming signals with its prediction, updating its internal model of the world by minimizing the prediction errors at each level of the hierarchy in the style of {\\em predictive self-supervised learning}. The network processes data in blocks of video frames rather than a frame-to-frame basis. This allows it to learn relationships among movement patterns, yielding state-of-the-art performance in long range video sequence predictions in benchmark datasets. 
We observed that hierarchical interaction in the network introduces sensitivity to memories of global movement patterns even in the population representation of the units in the earliest level. Finally, we provided neurophysiological evidence, showing that neurons in the early visual cortex of awake monkeys exhibit very similar sensitivity and behaviors. These findings suggest that predictive self-supervised learning might be an important principle for representational learning in the visual cortex. ", "keywords": "cortical models;spatiotemporal memory;video prediction;predictive coding", "primary_area": "", "supplementary_material": "", "author": "Jielin Qiu;Ge Huang;Tai Sing Lee", "authorids": "ternence1996@gmail.com;hgesummer@gmail.com;taislee@andrew.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nqiu2019a,\ntitle={A Model Cortical Network for Spatiotemporal Sequence Learning and Prediction},\nauthor={Jielin Qiu and Ge Huang and Tai Sing Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=BJl_VnR9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJl_VnR9Km", "pdf_size": 0, "rating": "3;7;7", "confidence": "3;3;3", "wc_review": "573;293;605", "wc_reply_reviewers": "0;0;20", "wc_reply_authors": "2547;861;1056", "reply_reviewers": "0;0;1", "reply_authors": "5;2;3", "rating_avg": [ 5.666666666666667, 1.8856180831641267 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 490.3333333333333, 140.14595566368973 ], "wc_reply_reviewers_avg": [ 6.666666666666667, 9.428090415820632 ], "wc_reply_authors_avg": [ 1488.0, 753.0458153392793 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MoWdPXj8RPgJ:scholar.google.com/&scioq=A+Model+Cortical+Network+for+Spatiotemporal+Sequence+Learning+and+Prediction&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "BJlaYi05tm", "title": "Geometry of Deep Convolutional Networks", "track": "main", "status": "Withdraw", "tldr": "Analysis of deep convolutional networks in terms of associated arrangement of hyperplanes", "abstract": " We give a formal procedure for computing preimages of convolutional\n network outputs using the dual basis defined from the set of\n hyperplanes associated with the layers of the network. We point out\n the special symmetry associated with arrangements of hyperplanes of\n convolutional networks that take the form of regular\n multidimensional polyhedral cones. We discuss the efficiency of the\n large number of layers of nested cones that result from incremental\n small size convolutions in order to give a good compromise between\n efficient contraction of data to low dimensions and shaping of\n preimage manifolds. 
We demonstrate how a specific network flattens a\n nonlinear input manifold to an affine output manifold and discuss\n its relevance to understanding classification properties of deep\n networks.", "keywords": "convolutional networks;geometry", "primary_area": "", "supplementary_material": "", "author": "Stefan Carlsson", "authorids": "stefanc@kth.se", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJlaYi05tm", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;2;4", "wc_review": "231;734;295", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 420.0, 223.56356292264323 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.32732683535398854, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1020033893097730028&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BJlc6iA5YX", "title": "ACE: Artificial Checkerboard Enhancer to Induce and Evade Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "We propose a novel artificial checkerboard enhancer (ACE) module which guides attacks to a pre-specified pixel space and successfully defends it with a simple padding operation.", "abstract": "The checkerboard phenomenon is one of the well-known visual artifacts in the computer vision field. The origins and solutions of checkerboard artifacts in the pixel space have been studied for a long time, but their effects on the gradient space have rarely been investigated. In this paper, we revisit the checkerboard artifacts in the gradient space which turn out to be the weak point of a network architecture. We explore the image-agnostic property of gradient checkerboard artifacts and propose a simple yet effective defense method by utilizing the artifacts. We introduce our defense module, dubbed Artificial Checkerboard Enhancer (ACE), which induces adversarial attacks on designated pixels. This enables the model to deflect attacks by shifting only a single pixel in the image with a remarkable defense rate. We provide extensive experiments to support the effectiveness of our work for various attack scenarios using state-of-the-art attack methods. 
Furthermore, we show that ACE is even applicable to large-scale datasets including ImageNet dataset and can be easily transferred to various pretrained networks.", "keywords": "Adversarial Examples;Neural Network Security;Deep Neural Network;Checkerboard Artifact", "primary_area": "", "supplementary_material": "", "author": "Jisung Hwang;Younghoon Kim;Sanghyuk Chun;Jaejun Yoo;Ji-Hoon Kim;Dongyoon Han;Jung-Woo Ha", "authorids": "jeshwang92@uchicago.edu;snu13dlx@snu.ac.kr;sanghyuk.c@navercorp.com;jaejun.yoo@navercorp.com;genesis.kim@navercorp.com;dongyoon.han@navercorp.com;jungwoo.ha@navercorp.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nhwang2019ace,\ntitle={{ACE}: Artificial Checkerboard Enhancer to Induce and Evade Adversarial Attacks},\nauthor={Jisung Hwang and Younghoon Kim and Sanghyuk Chun and Jaejun Yoo and Ji-Hoon Kim and Dongyoon Han and Jung-Woo Ha},\nyear={2019},\nurl={https://openreview.net/forum?id=BJlc6iA5YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=BJlc6iA5YX", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;2;1", "wc_review": "272;406;25", "wc_reply_reviewers": "158;0;0", "wc_reply_authors": "848;483;0", "reply_reviewers": "1;0;0", "reply_authors": "2;1;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 2.0, 0.816496580927726 ], "wc_review_avg": [ 234.33333333333334, 157.80649190983522 ], "wc_reply_reviewers_avg": [ 52.666666666666664, 74.481914284983 ], "wc_reply_authors_avg": [ 443.6666666666667, 347.30998002105014 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PFpr7flYm9IJ:scholar.google.com/&scioq=ACE:+Artificial+Checkerboard+Enhancer+to+Induce+and+Evade+Adversarial+Attacks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BJleciCcKQ", "title": "EXPLORATION OF EFFICIENT ON-DEVICE ACOUSTIC MODELING WITH NEURAL NETWORKS", "track": "main", "status": "Reject", "tldr": "Multi-timestep parallelizable acoustic modeling with diagonal LSTM, QRNN and Gated ConvNet", "abstract": "Real-time speech recognition on mobile and embedded devices is an important application of neural networks. Acoustic modeling is the fundamental part of speech recognition and is usually implemented with long short-term memory (LSTM)-based recurrent neural networks (RNNs). However, the single thread execution of an LSTM RNN is extremely slow in most embedded devices because the algorithm needs to fetch a large number of parameters from the DRAM for computing each output sample. We explore a few acoustic modeling algorithms that can be executed very efficiently on embedded devices. These algorithms reduce the overhead of memory accesses using multi-timestep parallelization that computes multiple output samples at a time by reading the parameters only once from the DRAM. The algorithms considered are the quasi RNNs (QRNNs), Gated ConvNets, and diagonalized LSTMs. 
In addition, we explore neural networks that add one-dimensional (1-D) convolution at each layer of these algorithms, which yields a very large performance increase in the QRNNs and Gated ConvNets. The experiments were conducted on two tasks: connectionist temporal classification (CTC)-based end-to-end speech recognition on the WSJ corpus and phoneme classification on the TIMIT dataset. We not only significantly increase the execution speed but also obtain a much higher accuracy, compared to LSTM RNN-based modeling. Thus, this work is applicable not only to embedded system-based implementations but also to server-based ones.", "keywords": "Parallelization;Speech Recognition;Sequence Modeling;Recurrent Neural Network;Embedded Systems", "primary_area": "", "supplementary_material": "", "author": "Wonyong Sung;Lukas Lee;Jinwhan Park", "authorids": "wysung@snu.ac.kr;proboscis@snu.ac.kr;bnoo@snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsung2019exploration,\ntitle={{EXPLORATION} {OF} {EFFICIENT} {ON}-{DEVICE} {ACOUSTIC} {MODELING} {WITH} {NEURAL} {NETWORKS}},\nauthor={Wonyong Sung and Lukas Lee and Jinwhan Park},\nyear={2019},\nurl={https://openreview.net/forum?id=BJleciCcKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJleciCcKQ", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;5", "wc_review": "309;281;396", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 328.6666666666667, 48.9648399931579 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8TDNLHYe_2kJ:scholar.google.com/&scioq=EXPLORATION+OF+EFFICIENT+ON-DEVICE+ACOUSTIC+MODELING+WITH+NEURAL+NETWORKS&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Differentiable Perturb-and-Parse: Semi-Supervised Parsing with a Structured Variational Autoencoder", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/984", "id": "BJlgNh0qKQ", "author_site": "Caio Corro, Ivan Titov", "tldr": "Differentiable dynamic programming over perturbed input weights with application to semi-supervised VAE", "abstract": "Human annotation for syntactic parsing is expensive, and large resources are available only for a fraction of languages. A question we ask is whether one can leverage abundant unlabeled texts to improve syntactic parsers, beyond just using the texts to obtain more generalisable lexical features (i.e. beyond word embeddings). To this end, we propose a novel latent-variable generative model for semi-supervised syntactic dependency parsing. As exact inference is intractable, we introduce a differentiable relaxation to obtain approximate samples and compute gradients with respect to the parser parameters. Our method (Differentiable Perturb-and-Parse) relies on differentiable dynamic programming over stochastically perturbed edge scores. 
We demonstrate effectiveness of our approach with experiments on English, French and Swedish.", "keywords": "differentiable dynamic programming;variational auto-encoder;dependency parsing;semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Caio Corro;Ivan Titov", "authorids": "c.f.corro@uva.nl;i.a.titov@uva.nl", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ncorro2018differentiable,\ntitle={Differentiable Perturb-and-Parse: Semi-Supervised Parsing with a Structured Variational Autoencoder},\nauthor={Caio Corro and Ivan Titov},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJlgNh0qKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;8", "confidence": "3;3;4", "wc_review": "494;350;263", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "590;73;294", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 369.0, 95.25754563287887 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 319.0, 211.8033679304148 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14755468217840863060&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=BJlgNh0qKQ", "pdf": "https://openreview.net/pdf?id=BJlgNh0qKQ", "email": ";", "author_num": 2 }, { "id": "BJlhEs09YQ", "title": "End-to-end Learning of a Convolutional Neural Network via Deep Tensor Decomposition", "track": "main", "status": "Withdraw", "tldr": "We consider a simplified deep convolutional neural network model. We show that all layers of this network can be approximately learned with a proper application of tensor decomposition.", "abstract": "In this paper we study the problem of learning the weights of a deep convolutional neural network. We consider a network where convolutions are carried out over non-overlapping patches with a single kernel in each layer. We develop an algorithm for simultaneously learning all the kernels from the training data. Our approach dubbed Deep Tensor Decomposition (DeepTD) is based on a rank-1 tensor decomposition. We theoretically investigate DeepTD under a realizable model for the training data where the inputs are chosen i.i.d. from a Gaussian distribution and the labels are generated according to planted convolutional kernels. We show that DeepTD is data-efficient and provably works as soon as the sample size exceeds the total number of convolutional weights in the network. 
Our numerical experiments demonstrate the effectiveness of DeepTD and verify our theoretical findings.", "keywords": "convolutional neural network;tensor decomposition;sample complexity;approximation", "primary_area": "", "supplementary_material": "", "author": "Samet Oymak;Mahdi Soltanolkotabi", "authorids": "sametoymak@gmail.com;soltanol@usc.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=BJlhEs09YQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "3;3;3", "wc_review": "437;843;293", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 524.3333333333334, 232.87383327076967 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9035534926340982193&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "BJlif3C5FQ", "title": "Learning to Attend On Essential Terms: An Enhanced Retriever-Reader Model for Open-domain Question Answering", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Open-domain question answering remains a challenging task as it requires models that are capable of understanding questions and answers, collecting useful information, and reasoning over evidence. Previous work typically formulates this task as a reading comprehension or entailment problem given evidence retrieved from search engines. However, existing techniques struggle to retrieve indirectly related evidence when no directly related evidence is provided, especially for complex questions where it is hard to parse precisely what the question asks. In this paper we propose a retriever-reader model that learns to attend on essential terms during the question answering process. We build (1) an essential term selector which first identifies the most important words in a question, then reformulates the query and searches for related evidence; and (2) an enhanced reader that distinguishes between essential terms and distracting words to predict the answer. 
We evaluate our model on multiple open-domain QA datasets where it outperforms the existing state-of-the-art, notably leading to an improvement of 8.1% on the AI2 Reasoning Challenge (ARC) dataset.", "keywords": "Open-domain question answering", "primary_area": "", "supplementary_material": "", "author": "Jianmo Ni;Chenguang Zhu;Weizhu Chen;Julian McAuley", "authorids": "jin018@ucsd.edu;chezhu@microsoft.com;wzchen@microsoft.com;jmcauley@cs.ucsd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJlif3C5FQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "wc_review": "313;301;360", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 324.6666666666667, 25.46020860523775 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5026838583872520931&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "BJll6o09tm", "title": "Padam: Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adaptive gradient methods, which adopt historical gradient information to automatically adjust the learning rate, despite the nice property of fast convergence, have been observed to generalize worse than stochastic gradient descent (SGD) with momentum in training deep neural networks. This leaves how to close the generalization gap of adaptive gradient methods an open problem. In this work, we show that adaptive gradient methods such as Adam, Amsgrad, are sometimes \"over adapted\". We design a new algorithm, called Partially adaptive momentum estimation method (Padam), which unifies the Adam/Amsgrad with SGD by introducing a partial adaptive parameter p, to achieve the best from both worlds. Experiments on standard benchmarks show that Padam can maintain fast convergence rate as Adam/Amsgrad while generalizing as well as SGD in training deep neural networks. 
These results would suggest practitioners pick up adaptive gradient methods once again for faster training of deep neural networks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinghui Chen;Quanquan Gu", "authorids": "jc4zg@virginia.edu;qgu@cs.ucla.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nchen2019padam,\ntitle={Padam: Closing the Generalization Gap of Adaptive Gradient Methods in Training Deep Neural Networks},\nauthor={Jinghui Chen and Quanquan Gu},\nyear={2019},\nurl={https://openreview.net/forum?id=BJll6o09tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJll6o09tm", "pdf_size": 0, "rating": "6;6;9", "confidence": "4;4;3", "wc_review": "262;341;327", "wc_reply_reviewers": "125;0;0", "wc_reply_authors": "440;411;157", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 310.0, 34.418987008142274 ], "wc_reply_reviewers_avg": [ 41.666666666666664, 58.92556509887896 ], "wc_reply_authors_avg": [ 336.0, 127.12461078275389 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 239, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5143340560197408650&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "BJlpCsC5Km", "title": "Learning Gibbs-regularized GANs with variational discriminator reparameterization", "track": "main", "status": "Reject", "tldr": "We reparameterize a GAN's discriminator into a form that admits regularization using a structured Gibbs distribution", "abstract": " We propose a novel approach to regularizing generative adversarial networks (GANs) leveraging learned {\\em structured Gibbs distributions}. Our method consists of reparameterizing the discriminator to be an explicit function of two densities: the generator PDF $q$ and a structured Gibbs distribution $\\nu$. Leveraging recent work on invertible pushforward density estimators, this reparameterization is made possible by assuming the generator is invertible, which enables the analytic evaluation of the generator PDF $q$. We further propose optimizing the Jeffrey divergence, which balances mode coverage with sample quality. The combination of this loss and reparameterization allows us to effectively regularize the generator by imposing structure from domain knowledge on $\\nu$, as in classical graphical models. 
Applying our method to a vehicle trajectory forecasting task, we observe that we are able to obtain quantitatively superior mode coverage as well as better-quality samples compared to traditional methods.", "keywords": "deep generative models;graphical models;trajectory forecasting;GANs;density estimation;structured prediction", "primary_area": "", "supplementary_material": "", "author": "Nicholas Rhinehart;Anqi Liu;Kihyuk Sohn;Paul Vernaza", "authorids": "nrhineha@cs.cmu.edu;anqiliu@caltech.edu;ksohn@nec-labs.com;pvernaza@nec-labs.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nrhinehart2019learning,\ntitle={Learning Gibbs-regularized {GAN}s with variational discriminator reparameterization},\nauthor={Nicholas Rhinehart and Anqi Liu and Kihyuk Sohn and Paul Vernaza},\nyear={2019},\nurl={https://openreview.net/forum?id=BJlpCsC5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJlpCsC5Km", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;5;3", "wc_review": "374;307;331", "wc_reply_reviewers": "0;14;0", "wc_reply_authors": "510;628;683", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 337.3333333333333, 27.716822007983204 ], "wc_reply_reviewers_avg": [ 4.666666666666667, 6.599663291074443 ], "wc_reply_authors_avg": [ 607.0, 72.17109301283075 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pGMbn_f7kDwJ:scholar.google.com/&scioq=Learning+Gibbs-regularized+GANs+with+variational+discriminator+reparameterization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "BJlr0j0ctX", "title": "Label Smoothing and Logit Squeezing: A Replacement for Adversarial Training?", "track": "main", "status": "Withdraw", "tldr": "Achieving strong adversarial robustness comparable to adversarial training without training on adversarial examples", "abstract": "Adversarial training is one of the strongest defenses against adversarial attacks, but it requires adversarial examples to be generated for every mini-batch during optimization. The expense of producing these examples during training often precludes adversarial training from use on complex image datasets. \nIn this study, we explore the mechanisms by which adversarial training improves classifier robustness, and show that these mechanisms can be effectively mimicked using simple regularization methods, including label smoothing and logit squeezing. 
\nRemarkably, using these simple regularization methods in combination with Gaussian noise injection, we are able to achieve strong adversarial robustness -- often exceeding that of adversarial training -- using no adversarial examples.", "keywords": "adversarial machine learning;machine learning security", "primary_area": "", "supplementary_material": "", "author": "Ali Shafahi;Amin Ghiasi;Furong Huang;Tom Goldstein", "authorids": "ashafahi@cs.umd.edu;amin@cs.umd.edu;furongh@cs.umd.edu;tomg@cs.umd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJlr0j0ctX", "pdf_size": 0, "rating": "2;4;7", "confidence": "5;3;5", "wc_review": "193;551;478", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "688;760;1115", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 4.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 407.3333333333333, 154.4589121920634 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 854.3333333333334, 186.64821337359635 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.1147078669352809, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17213124276410427473&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Janossy Pooling: Learning Deep Permutation-Invariant Functions for Variable-Size Inputs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/923", "id": "BJluy2RcFm", "author_site": "Ryan L Murphy, Balasubramaniam Srinivasan, Vinayak Rao, Bruno Ribeiro", "tldr": "We propose Janossy pooling, a method for learning deep permutation invariant functions designed to exploit relationships within the input sequence and tractable inference strategies such as a stochastic optimization procedure we call piSGD", "abstract": "We consider a simple and overarching representation for permutation-invariant functions of sequences (or set functions). Our approach, which we call Janossy pooling, expresses a permutation-invariant function as the average of a permutation-sensitive function applied to all reorderings of the input sequence. This allows us to leverage the rich and mature literature on permutation-sensitive functions to construct novel and flexible permutation-invariant functions. If carried out naively, Janossy pooling can be computationally prohibitive. To allow computational tractability, we consider three kinds of approximations: canonical orderings of sequences, functions with k-order interactions, and stochastic optimization algorithms with random permutations. Our framework unifies a variety of existing work in the literature, and suggests possible modeling and algorithmic extensions. We explore a few in our experiments, which demonstrate improved performance over current state-of-the-art methods.", "keywords": "representation learning;permutation invariance;set functions;feature pooling", "primary_area": "", "supplementary_material": "", "author": "Ryan L. 
Murphy;Balasubramaniam Srinivasan;Vinayak Rao;Bruno Ribeiro", "authorids": "murph213@purdue.edu;bsriniv@purdue.edu;varao@purdue.edu;ribeiro@cs.purdue.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmurphy2018janossy,\ntitle={Janossy Pooling: Learning Deep Permutation-Invariant Functions for Variable-Size Inputs},\nauthor={Ryan L. Murphy and Balasubramaniam Srinivasan and Vinayak Rao and Bruno Ribeiro},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJluy2RcFm},\n}", "github": "[![github](/images/github_icon.svg) PurdueMINDS/JanossyPooling](https://github.com/PurdueMINDS/JanossyPooling) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BJluy2RcFm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;4;4", "wc_review": "348;222;693", "wc_reply_reviewers": "0;0;79", "wc_reply_authors": "667;610;747", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 421.0, 199.09294311953903 ], "wc_reply_reviewers_avg": [ 26.333333333333332, 37.2409571424915 ], "wc_reply_authors_avg": [ 674.6666666666666, 56.192130726246326 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 238, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8977286222434486292&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BJluy2RcFm", "pdf": "https://openreview.net/pdf?id=BJluy2RcFm", "email": ";;;", "author_num": 4 }, { "title": "An Empirical Study of Example Forgetting during Deep Neural Network Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/753", "id": "BJlxm30cKm", "author_site": "Mariya Toneva, Alessandro Sordoni, Remi Combes, Adam Trischler, Yoshua Bengio, Geoffrey Gordon", "tldr": "We show that catastrophic forgetting occurs within what is considered to be a single task and find that examples that are not prone to forgetting can be removed from the training set without loss of generalization.", "abstract": "Inspired by the phenomenon of catastrophic forgetting, we investigate the learning dynamics of neural networks as they train on single classification tasks. Our goal is to understand whether a related phenomenon occurs when data does not undergo a clear distributional shift. We define a ``forgetting event'' to have occurred when an individual training example transitions from being classified correctly to incorrectly over the course of learning. 
Across several benchmark data sets, we find that: (i) certain examples are forgotten with high frequency, and some not at all; (ii) a data set's (un)forgettable examples generalize across neural architectures; and (iii) based on forgetting dynamics, a significant fraction of examples can be omitted from the training data set while still maintaining state-of-the-art generalization performance.", "keywords": "catastrophic forgetting;sample weighting;deep generalization", "primary_area": "", "supplementary_material": "", "author": "Mariya Toneva*;Alessandro Sordoni*;Remi Tachet des Combes*;Adam Trischler;Yoshua Bengio;Geoffrey J. Gordon", "authorids": "mariya.k.toneva@gmail.com;alsordon@microsoft.com;retachet@microsoft.com;adtrisch@microsoft.com;yoshua.bengio@mila.quebec;geoff.gordon@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ntoneva2018an,\ntitle={An Empirical Study of Example Forgetting during Deep Neural Network Learning},\nauthor={Mariya Toneva and Alessandro Sordoni and Remi Tachet des Combes and Adam Trischler and Yoshua Bengio and Geoffrey J. Gordon},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJlxm30cKm},\n}", "github": "[![github](/images/github_icon.svg) mtoneva/example_forgetting](https://github.com/mtoneva/example_forgetting) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=BJlxm30cKm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;4;5", "wc_review": "251;613;310", "wc_reply_reviewers": "56;364;0", "wc_reply_authors": "347;779;252", "reply_reviewers": "1;2;0", "reply_authors": "2;3;1", "rating_avg": [ 8.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 391.3333333333333, 158.58191013549504 ], "wc_reply_reviewers_avg": [ 140.0, 160.03332986183432 ], "wc_reply_authors_avg": [ 459.3333333333333, 229.3415696195427 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 848, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14912040563601232331&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BJlxm30cKm", "pdf": "https://openreview.net/pdf?id=BJlxm30cKm", "email": ";;;;;", "author_num": 6 }, { "id": "BJlyznAcFm", "title": "Advocacy Learning", "track": "main", "status": "Reject", "tldr": "We introduce a method that encourages different components in a networks to compete, and show that this can improve attention quality.", "abstract": "We introduce advocacy learning, a novel supervised training scheme for classification problems. This training scheme applies to a framework consisting of two connected networks: 1) the Advocates, composed of one subnetwork per class, which take the input and provide a convincing class-conditional argument in the form of an attention map, and 2) a Judge, which predicts the inputs class label based on these arguments. Each Advocate aims to convince the Judge that the input example belongs to their corresponding class. 
In contrast to a standard network, in which all subnetworks are trained to jointly cooperate, we train the Advocates to competitively argue for their class, even when the input belongs to a different class. We also explore a variant, honest advocacy learning, where the Advocates are only trained on data corresponding to their class. Applied to several different classification tasks, we show that advocacy learning can lead to small improvements in classification accuracy over an identical supervised baseline. Through a series of follow-up experiments, we analyze when and how Advocates improve discriminative performance. Though it may seem counter-intuitive, a framework in which subnetworks are trained to competitively provide evidence in support of their class shows promise, performing as well as or better than standard approaches. This provides a foundation for further exploration into the effect of competition and class-conditional representations.", "keywords": "competition;supervision;deep learning;adversarial;debate", "primary_area": "", "supplementary_material": "", "author": "Ian Fox;Jenna Wiens", "authorids": "ifox@umich.edu;wiensj@umich.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nfox2019advocacy,\ntitle={Advocacy Learning},\nauthor={Ian Fox and Jenna Wiens},\nyear={2019},\nurl={https://openreview.net/forum?id=BJlyznAcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJlyznAcFm", "pdf_size": 0, "rating": "4;4;8", "confidence": "4;4;2", "wc_review": "214;260;210", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "771;1186;479", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 228.0, 22.686266036231405 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 812.0, 290.08389591059114 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -1.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "RNNs implicitly implement tensor-product representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1139", "id": "BJx0sjC5FX", "author_site": "Tom McCoy, Tal Linzen, Ewan Dunbar, Paul Smolensky", "tldr": "RNNs implicitly implement tensor-product representations, a principled and interpretable method for representing symbolic structures in continuous space.", "abstract": "Recurrent neural networks (RNNs) can learn continuous vector representations of symbolic structures such as sequences and sentences; these representations often exhibit linear regularities (analogies). Such regularities motivate our hypothesis that RNNs that show such regularities implicitly compile symbolic structures into tensor product representations (TPRs; Smolensky, 1990), which additively combine tensor products of vectors representing roles (e.g., sequence positions) and vectors representing fillers (e.g., particular words). To test this hypothesis, we introduce Tensor Product Decomposition Networks (TPDNs), which use TPRs to approximate existing vector representations. 
We demonstrate using synthetic data that TPDNs can successfully approximate linear and tree-based RNN autoencoder representations, suggesting that these representations exhibit interpretable compositional structure; we explore the settings that lead RNNs to induce such structure-sensitive representations. By contrast, further TPDN experiments show that the representations of four models trained to encode naturally-occurring sentences can be largely approximated with a bag of words, with only marginal improvements from more sophisticated structures. We conclude that TPDNs provide a powerful method for interpreting vector representations, and that standard RNNs can induce compositional sequence representations that are remarkably well approximated by TPRs; at the same time, existing training tasks for sentence representation learning may not be sufficient for inducing robust structural representations.", "keywords": "tensor-product representations;compositionality;neural network interpretability;recurrent neural networks", "primary_area": "", "supplementary_material": "", "author": "R. Thomas McCoy;Tal Linzen;Ewan Dunbar;Paul Smolensky", "authorids": "tom.mccoy@jhu.edu;tal.linzen@jhu.edu;ewan.dunbar@univ-paris-diderot.fr;smolensky@jhu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmccoy2018rnns,\ntitle={{RNN}s implicitly implement tensor-product representations},\nauthor={R. Thomas McCoy and Tal Linzen and Ewan Dunbar and Paul Smolensky},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJx0sjC5FX},\n}", "github": "[![github](/images/github_icon.svg) tommccoy1/tpdn](https://github.com/tommccoy1/tpdn)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "wc_review": "325;268;17", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "464;520;104", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 203.33333333333334, 133.7966948603573 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 362.6666666666667, 184.3282096937115 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8578120166770522666&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=BJx0sjC5FX", "pdf": "https://openreview.net/pdf?id=BJx0sjC5FX", "email": ";;;", "author_num": 4 }, { "id": "BJx1SsAcYQ", "title": "Discovering Low-Precision Networks Close to Full-Precision Networks for Efficient Embedded Inference", "track": "main", "status": "Reject", "tldr": "Finetuning after quantization matches or exceeds full-precision state-of-the-art networks at both 8- and 4-bit quantization.", "abstract": "To realize the promise of ubiquitous embedded deep network inference, it is essential to seek limits of energy and area efficiency. To this end, low-precision networks offer tremendous promise because both energy and area scale down quadratically with the reduction in precision. 
Here, for the first time, we demonstrate ResNet-18, ResNet-34, ResNet-50, ResNet-152, Inception-v3, densenet-161, and VGG-16bn networks on the ImageNet classification benchmark that, at 8-bit precision exceed the accuracy of the full-precision baseline networks after one epoch of finetuning, thereby leveraging the availability of pretrained models.\nWe also demonstrate ResNet-18, ResNet-34, and ResNet-50 4-bit models that match the accuracy of the full-precision baseline networks -- the highest scores to date. Surprisingly, the weights of the low-precision networks are very close (in cosine similarity) to the weights of the corresponding baseline networks, making training from scratch unnecessary.\n\nWe find that gradient noise due to quantization during training increases with reduced precision, and seek ways to overcome this noise. The number of iterations required by stochastic gradient descent to achieve a given training error is related to the square of (a) the distance of the initial solution from the final plus (b) the maximum variance of the gradient estimates. By drawing inspiration from this observation, we (a) reduce solution distance by starting with pretrained fp32 precision baseline networks and fine-tuning, and (b) combat noise introduced by quantizing weights and activations during training, by using larger batches along with matched learning rate annealing. Sensitivity analysis indicates that these techniques, coupled with proper activation function range calibration, offer a promising heuristic to discover low-precision networks, if they exist, close to fp32 precision baseline networks.\n", "keywords": "Deep Learning;Convolutional Neural Networks;Low-precision inference;Network quantization", "primary_area": "", "supplementary_material": "", "author": "Jeffrey L. McKinstry;Steven K. Esser;Rathinakumar Appuswamy;Deepika Bablani;John V. Arthur;Izzet B. Yildiz;Dharmendra S. Modha", "authorids": "jlmckins@us.ibm.com;sesser@us.ibm.com;rappusw@us.ibm.com;deepika.bablani@ibm.com;arthurjo@us.ibm.com;izzet.burak.yildiz@gmail.com;dmodha@us.ibm.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nmckinstry2019discovering,\ntitle={Discovering Low-Precision Networks Close to Full-Precision Networks for Efficient Embedded Inference},\nauthor={Jeffrey L. McKinstry and Steven K. Esser and Rathinakumar Appuswamy and Deepika Bablani and John V. Arthur and Izzet B. Yildiz and Dharmendra S. 
Modha},\nyear={2019},\nurl={https://openreview.net/forum?id=BJx1SsAcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJx1SsAcYQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;3", "wc_review": "360;312;372", "wc_reply_reviewers": "59;0;0", "wc_reply_authors": "1031;687;780", "reply_reviewers": "1;0;0", "reply_authors": "2;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 348.0, 25.92296279363144 ], "wc_reply_reviewers_avg": [ 19.666666666666668, 27.812866726670865 ], "wc_reply_authors_avg": [ 832.6666666666666, 145.29127831895332 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7440331761562184084&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJx9f305t7", "title": "W2GAN: RECOVERING AN OPTIMAL TRANSPORT MAP WITH A GAN", "track": "main", "status": "Reject", "tldr": "\"A GAN-style model to recover a solution of the Monge Problem\"", "abstract": "Understanding and improving Generative Adversarial Networks (GAN) using notions from Optimal Transport (OT) theory has been a successful area of study, originally established by the introduction of the Wasserstein GAN (WGAN). An increasing number of GANs incorporate OT for improving their discriminators, but that is so far the sole way for the two domains to cross-fertilize. In this work we address the converse question: is it possible to recover an optimal map in a GAN fashion? To achieve this, we build a new model relying on the second Wasserstein distance. This choice enables the use of many results from OT community. In particular, we may completely describe the dynamics of the generator during training. In addition, experiments show that practical uses of our model abide by the rule of evolution we describe. As an application, our generator may be considered as a new way of computing an optimal transport map. It is competitive in low-dimension with standard and deterministic ways to approach the same problem. 
In high dimension, the fact it is a GAN-style method makes it more powerful than other methods.", "keywords": "Optimal Transportation;Deep Learning;Generative Adversarial Networks;Wasserstein Distance", "primary_area": "", "supplementary_material": "", "author": "Leygonie Jacob*;Jennifer She*;Amjad Almahairi;Sai Rajeswar;Aaron Courville", "authorids": "jacob.leygonie@gmail.com;jennifershe123@gmail.com;amjadmahayri@gmail.com;rajsai24@gmail.com;aaron.courville@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\njacob*2019wgan,\ntitle={W2{GAN}: {RECOVERING} {AN} {OPTIMAL} {TRANSPORT} {MAP} {WITH} A {GAN}},\nauthor={Leygonie Jacob* and Jennifer She* and Amjad Almahairi and Sai Rajeswar and Aaron Courville},\nyear={2019},\nurl={https://openreview.net/forum?id=BJx9f305t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJx9f305t7", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;3;3", "wc_review": "470;215;274", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "942;326;853", "reply_reviewers": "0;0;0", "reply_authors": "3;2;2", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 319.6666666666667, 108.9964321536362 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 707.0, 271.8467705650863 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13500298779030989958&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "BJxLH2AcYX", "title": "Unsupervised Multi-Target Domain Adaptation: An Information Theoretic Approach", "track": "main", "status": "Reject", "tldr": "", "abstract": "Unsupervised domain adaptation (uDA) models focus on pairwise adaptation settings where there is a single, labeled, source and a single target domain. However, in many real-world settings one seeks to adapt to multiple, but somewhat similar, target domains. Applying pairwise adaptation approaches to this setting may be suboptimal, as they would fail to leverage shared information among the multiple domains. In this work we propose an information theoretic approach for domain adaptation in the novel context of multiple target domains with unlabeled instances and one source domain with labeled instances. Our model aims to find a shared latent space common to all domains, while simultaneously accounting for the remaining private, domain-specific factors. Disentanglement of shared and private information is accomplished using a unified information-theoretic approach, which also serves to provide a stronger link between the latent representations and the observed data. 
The resulting single model, accompanied by an efficient optimization algorithm, allows simultaneous adaptation from a single source to multiple target domains.\nWe test our approach on three publicly-available datasets, showing that it outperforms several popular domain adaptation methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Behnam Gholami;Pritish Sahu;Ognjen (Oggi) Rudovic;Konstantinos Bousmalis;Vladimir Pavlovic", "authorids": "bb510@cs.rutgers.edu;ps851@cs.rutgers.edu;orudovic@mit.edu;konstantinos@google.com;vladimir@cs.rutgers.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ngholami2019unsupervised,\ntitle={Unsupervised Multi-Target Domain Adaptation: An Information Theoretic Approach},\nauthor={Behnam Gholami and Pritish Sahu and Ognjen (Oggi) Rudovic and Konstantinos Bousmalis and Vladimir Pavlovic},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxLH2AcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BJxLH2AcYX", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;5", "wc_review": "311;126;398", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "582;574;777", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 278.3333333333333, 113.42055467252054 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 644.3333333333334, 93.86633522668757 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 234, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11434873393011375483&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "BJxOHs0cKm", "title": "Identifying Generalization Properties in Neural Networks", "track": "main", "status": "Reject", "tldr": "a theory connecting Hessian of the solution and the generalization power of the model", "abstract": "While it has not yet been proven, empirical evidence suggests that model generalization is related to local properties of the optima which can be described via the Hessian. We connect model generalization with the local property of a solution under the PAC-Bayes paradigm. In particular, we prove that model generalization ability is related to the Hessian, the higher-order \"smoothness\" terms characterized by the Lipschitz constant of the Hessian, and the scales of the parameters. Guided by the proof, we propose a metric to score the generalization capability of the model, as well as an algorithm that optimizes the perturbed model accordingly. 
", "keywords": "generalization;PAC-Bayes;Hessian;perturbation", "primary_area": "", "supplementary_material": "", "author": "Huan Wang;Nitish Shirish Keskar;Caiming Xiong;Richard Socher", "authorids": "huan.wang@salesforce.com;nkeskar@salesforce.com;cxiong@salesforce.com;rsocher@salesforce.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwang2019identifying,\ntitle={Identifying Generalization Properties in Neural Networks},\nauthor={Huan Wang and Nitish Shirish Keskar and Caiming Xiong and Richard Socher},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxOHs0cKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJxOHs0cKm", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "wc_review": "513;676;427", "wc_reply_reviewers": "0;174;0", "wc_reply_authors": "763;837;755", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 538.6666666666666, 103.26126519120754 ], "wc_reply_reviewers_avg": [ 58.0, 82.02438661763951 ], "wc_reply_authors_avg": [ 785.0, 36.914315199752345 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1463356261549431499&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJxPk2A9Km", "title": "Learning What to Remember: Long-term Episodic Memory Networks for Learning from Streaming Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Current generation of memory-augmented neural networks has limited scalability as they cannot efficiently process data that are too large to fit in the external memory storage. One example of this is lifelong learning scenario where the model receives unlimited length of data stream as an input which contains vast majority of uninformative entries. We tackle this problem by proposing a memory network fit for long-term lifelong learning scenario, which we refer to as Long-term Episodic Memory Networks (LEMN), that features a RNN-based retention agent that learns to replace less important memory entries based on the retention probability generated on each entry that is learned to identify data instances of generic importance relative to other memory entries, as well as its historical importance. Such learning of retention agent allows our long-term episodic memory network to retain memory entries of generic importance for a given task. 
We validate our model on a path-finding task as well as synthetic and real question answering tasks, on which our model achieves significant improvements over the memory augmented networks with rule-based memory scheduling as well as an RL-based baseline that does not consider relative or historical importance of the memory.", "keywords": "Memory Network;Lifelong Learning", "primary_area": "", "supplementary_material": "", "author": "Hyunwoo Jung;Moonsu Han;Minki Kang;Sungju Hwang", "authorids": "hyunwooj@kaist.ac.kr;mshan92@kaist.ac.kr;zzxc1133@kaist.ac.kr;sjhwang82@kaist.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\njung2019learning,\ntitle={Learning What to Remember: Long-term Episodic Memory Networks for Learning from Streaming Data},\nauthor={Hyunwoo Jung and Moonsu Han and Minki Kang and Sungju Hwang},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxPk2A9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJxPk2A9Km", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;5", "wc_review": "760;431;374", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "383;307;143", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 521.6666666666666, 170.12609702479185 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 277.6666666666667, 100.15099711047425 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10935305334060724215&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BJxRVnC5Fm", "title": "Mean Replacement Pruning", "track": "main", "status": "Reject", "tldr": "Mean Replacement is an efficient method to improve the loss after pruning and Taylor approximation based scoring functions works better with absolute values. ", "abstract": "Pruning units in a deep network can help speed up inference and training as well as reduce the size of the model. We show that bias propagation is a pruning technique which consistently outperforms the common approach of merely removing units, regardless of the architecture and the dataset. We also show how a simple adaptation to an existing scoring function allows us to select the best units to prune. 
Finally, we show that the units selected by the best performing scoring functions are somewhat consistent over the course of training, implying the dead parts of the network appear during the stages of training.", "keywords": "pruning;saliency;neural networks;optimization;redundancy;model compression", "primary_area": "", "supplementary_material": "", "author": "Utku Evci;Nicolas Le Roux;Pablo Castro;Leon Bottou", "authorids": "evcu@google.com;nicolas@le-roux.name;psc@google.com;leon@bottou.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nevci2019mean,\ntitle={Mean Replacement Pruning },\nauthor={Utku Evci and Nicolas Le Roux and Pablo Castro and Leon Bottou},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxRVnC5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJxRVnC5Fm", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;3", "wc_review": "177;536;391", "wc_reply_reviewers": "0;283;353", "wc_reply_authors": "330;874;460", "reply_reviewers": "0;2;1", "reply_authors": "1;3;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 368.0, 147.46072923550415 ], "wc_reply_reviewers_avg": [ 212.0, 152.60624714167722 ], "wc_reply_authors_avg": [ 554.6666666666666, 231.95593451247493 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yvdDMUO48fAJ:scholar.google.com/&scioq=Mean+Replacement+Pruning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BJxYEsAqY7", "title": "FEED: Feature-level Ensemble Effect for knowledge Distillation", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper proposes a versatile and powerful training algorithm named Feature-level Ensemble Effect for knowledge Distillation(FEED), which is inspired by the work of factor transfer. The factor transfer is one of the knowledge transfer methods that improves the performance of a student network with a strong teacher network. It transfers the knowledge of a teacher in the feature map level using high-capacity teacher network, and our training algorithm FEED is an extension of it. FEED aims to transfer ensemble knowledge, using either multiple teachers in parallel or multiple training sequences. Adapting the peer-teaching framework, we introduce a couple of training algorithms that transfer ensemble knowledge to the student at the feature map level, both of which help the student network find more generalized solutions in the parameter space. 
Experimental results on CIFAR-100 and ImageNet show that our method, FEED, has clear performance enhancements, without introducing any additional parameters or computations at test time.", "keywords": "Knowledge Distillation;Ensemble Effect;Knowledge Transfer", "primary_area": "", "supplementary_material": "", "author": "SeongUk Park;Nojun Kwak", "authorids": "swpark0703@snu.ac.kr;nojunk@snu.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npark2019feed,\ntitle={{FEED}: Feature-level Ensemble Effect for knowledge Distillation},\nauthor={SeongUk Park and Nojun Kwak},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxYEsAqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJxYEsAqY7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;3", "wc_review": "256;381;291", "wc_reply_reviewers": "102;0;0", "wc_reply_authors": "336;263;210", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 309.3333333333333, 52.65189666310439 ], "wc_reply_reviewers_avg": [ 34.0, 48.08326112068523 ], "wc_reply_authors_avg": [ 269.6666666666667, 51.65483735549094 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:r08F825um0MJ:scholar.google.com/&scioq=FEED:+Feature-level+Ensemble+Effect+for+knowledge+Distillation&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BJxbYoC9FQ", "title": "Classifier-agnostic saliency map extraction", "track": "main", "status": "Reject", "tldr": "We propose a new saliency map extraction method which results in extracting higher quality maps.", "abstract": "Extracting saliency maps, which indicate parts of the image important to classification, requires many tricks to achieve satisfactory performance when using classifier-dependent methods. Instead, we propose classifier-agnostic saliency map extraction, which finds all parts of the image that any classifier could use, not just one given in advance. We observe that the proposed approach extracts higher quality saliency maps and outperforms existing weakly-supervised localization techniques, setting the new state of the art result on the ImageNet dataset.", "keywords": "saliency maps;explainable AI;convolutional neural networks;generative adversarial training;classification", "primary_area": "", "supplementary_material": "", "author": "Konrad Zolna;Krzysztof J. Geras;Kyunghyun Cho", "authorids": "konrad.zolna@gmail.com;k.j.geras@nyu.edu;kyunghyun.cho@nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\n\u017co\u0142na2019classifieragnostic,\ntitle={Classifier-agnostic saliency map extraction},\nauthor={Konrad \u017bo\u0142na and Krzysztof J. 
Geras and Kyunghyun Cho},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxbYoC9FQ},\n}", "github": "[![github](/images/github_icon.svg) kondiz/casme](https://github.com/kondiz/casme)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BJxbYoC9FQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;4", "wc_review": "1122;384;251", "wc_reply_reviewers": "102;0;54", "wc_reply_authors": "563;186;319", "reply_reviewers": "1;0;1", "reply_authors": "2;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 585.6666666666666, 383.1121100786151 ], "wc_reply_reviewers_avg": [ 52.0, 41.66533331199932 ], "wc_reply_authors_avg": [ 356.0, 156.11747713394124 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8814844156390208367&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13 }, { "id": "BJxfJnC9YX", "title": "Learning Spatio-Temporal Representations Using Spike-Based Backpropagation", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Spiking neural networks (SNNs) offer a promising alternative to current artificial neural networks to enable low-power event-driven neuromorphic hardware. However, training SNNs remains a challenge primarily because of the complex non-differentiable neuronal behavior arising from their spike-based computation. In this paper, we propose an algorithm to train spiking autoencoders on regenerative learning tasks. A sigmoid approximation is used in place of the Leaky Integrate-and-Fire neuron's threshold based activation during backpropagation to enable differentiability. The loss is computed on the membrane potential of the output layer, which is then backpropagated through the network at each time step. These spiking autoencoders learn meaningful spatio-temporal representations of the data, across two modalities - audio and visual. We demonstrate audio to image synthesis in a spike-based environment by sharing these spatio-temporal representations between the two modalities. These models achieve very low reconstruction loss, comparable to ANNs, on MNIST and Fashion-MNIST datasets, and while converting TI-46 digits audio samples to MNIST images. 
", "keywords": "spiking neural networks;autoencoders;representation learning;backpropagation;multimodal", "primary_area": "", "supplementary_material": "", "author": "Deboleena Roy;Priyadarshini Panda;Kaushik Roy", "authorids": "roy77@purdue.edu;pandap@purdue.edu;kaushik@purdue.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJxfJnC9YX", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;5", "wc_review": "448;619;230", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 432.3333333333333, 159.19450018417373 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4865972111403986230&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Learning To Solve Circuit-SAT: An Unsupervised Differentiable Approach", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/750", "id": "BJxgz2R9t7", "author_site": "Saeed Amizadeh, Sergiy Matusevych, Markus Weimer", "tldr": "We propose a neural framework that can learn to solve the Circuit Satisfiability problem from (unlabeled) circuit instances.", "abstract": "Recent efforts to combine Representation Learning with Formal Methods, commonly known as the Neuro-Symbolic Methods, have given rise to a new trend of applying rich neural architectures to solve classical combinatorial optimization problems. In this paper, we propose a neural framework that can learn to solve the Circuit Satisfiability problem. Our framework is built upon two fundamental contributions: a rich embedding architecture that encodes the problem structure and an end-to-end differentiable training procedure that mimics Reinforcement Learning and trains the model directly toward solving the SAT problem. 
The experimental results show the superior out-of-sample generalization performance of our framework compared to the recently developed NeuroSAT method.", "keywords": "Neuro-Symbolic Methods;Circuit Satisfiability;Neural SAT Solver;Graph Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Saeed Amizadeh;Sergiy Matusevych;Markus Weimer", "authorids": "saeed.amizadeh@gmail.com;sergiym@microsoft.com;markus.weimer@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\namizadeh2018learning,\ntitle={Learning To Solve Circuit-{SAT}: An Unsupervised Differentiable Approach},\nauthor={Saeed Amizadeh and Sergiy Matusevych and Markus Weimer},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxgz2R9t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;3;4", "wc_review": "487;549;275", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "836;798;161", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 437.0, 117.31439241059329 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 598.3333333333334, 309.63024543621196 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8913283008212437757&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BJxgz2R9t7", "pdf": "https://openreview.net/pdf?id=BJxgz2R9t7", "email": ";;", "author_num": 3 }, { "title": "Dynamic Channel Pruning: Feature Boosting and Suppression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/857", "id": "BJxh2j0qYm", "author_site": "Xitong Gao, Yiren Zhao, \u0141ukasz Dudziak, Robert Mullins, Cheng-zhong Xu", "tldr": "We make convolutional layers run faster by dynamically boosting and suppressing channels in feature computation.", "abstract": "Making deep convolutional neural networks more accurate typically comes at the cost of increased computational and memory resources. In this paper, we reduce this cost by exploiting the fact that the importance of features computed by convolutional layers is highly input-dependent, and propose feature boosting and suppression (FBS), a new method to predictively amplify salient convolutional channels and skip unimportant ones at run-time. FBS introduces small auxiliary connections to existing convolutional layers. In contrast to channel pruning methods which permanently remove channels, it preserves the full network structures and accelerates convolution by dynamically skipping unimportant input and output channels. FBS-augmented networks are trained with conventional stochastic gradient descent, making it readily available for many state-of-the-art CNNs. We compare FBS to a range of existing channel pruning and dynamic execution schemes and demonstrate large improvements on ImageNet classification. 
Experiments show that FBS can respectively provide 5\u00d7 and 2\u00d7 savings in compute on VGG-16 and ResNet-18, both with less than 0.6% top-5 accuracy loss.", "keywords": "dynamic network;faster CNNs;channel pruning", "primary_area": "", "supplementary_material": "", "author": "Xitong Gao;Yiren Zhao;\u0141ukasz Dudziak;Robert Mullins;Cheng-zhong Xu", "authorids": "xt.gao@siat.ac.cn;yaz21@cam.ac.uk;lukaszd.mail@gmail.com;robert.mullins@cl.cam.ac.uk;czxu@um.edu.mo", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngao2018dynamic,\ntitle={Dynamic Channel Pruning: Feature Boosting and Suppression},\nauthor={Xitong Gao and Yiren Zhao and \u0141ukasz Dudziak and Robert Mullins and Cheng-zhong Xu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxh2j0qYm},\n}", "github": "[![github](/images/github_icon.svg) deep-fry/mayo](https://github.com/deep-fry/mayo) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BJxh2j0qYm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2;AnonReviewer5", "pdf_size": 0, "rating": "6;6;7;7", "confidence": "4;3;4;5", "wc_review": "383;169;126;126", "wc_reply_reviewers": "27;0;0;0", "wc_reply_authors": "1331;117;121;110", "reply_reviewers": "1;0;0;0", "reply_authors": "3;1;1;1", "rating_avg": [ 6.5, 0.5 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "wc_review_avg": [ 201.0, 106.5340321211959 ], "wc_reply_reviewers_avg": [ 6.75, 11.691342951089922 ], "wc_reply_authors_avg": [ 419.75, 526.1251633404355 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.7071067811865475, "gs_citation": 410, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1895104173020407133&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BJxh2j0qYm", "pdf": "https://openreview.net/pdf?id=BJxh2j0qYm", "email": ";;;;", "author_num": 5 }, { "title": "signSGD with Majority Vote is Communication Efficient and Fault Tolerant", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/876", "id": "BJxhijAcY7", "author_site": "Jeremy Bernstein, Jiawei Zhao, Kamyar Azizzadenesheli, Anima Anandkumar", "tldr": "Workers send gradient signs to the server, and the update is decided by majority vote. We show that this algorithm is convergent, communication efficient and fault tolerant, both in theory and in practice.", "abstract": "Training neural networks on large datasets can be accelerated by distributing the workload over a network of machines. As datasets grow ever larger, networks of hundreds or thousands of machines become economically viable. The time cost of communicating gradients limits the effectiveness of using such large machine counts, as may the increased chance of network faults. We explore a particularly simple algorithm for robust, communication-efficient learning---signSGD. Workers transmit only the sign of their gradient vector to a server, and the overall update is decided by a majority vote. This algorithm uses 32x less communication per iteration than full-precision, distributed SGD. 
Under natural conditions verified by experiment, we prove that signSGD converges in the large and mini-batch settings, establishing convergence for a parameter regime of Adam as a byproduct. Aggregating sign gradients by majority vote means that no individual worker has too much power. We prove that unlike SGD, majority vote is robust when up to 50% of workers behave adversarially. The class of adversaries we consider includes as special cases those that invert or randomise their gradient estimate. On the practical side, we built our distributed training system in Pytorch. Benchmarking against the state-of-the-art collective communications library (NCCL), our framework---with the parameter server housed entirely on one machine---led to a 25% reduction in time for training resnet50 on Imagenet when using 15 AWS p3.2xlarge machines.", "keywords": "large-scale learning;distributed systems;communication efficiency;convergence rate analysis;robust optimisation", "primary_area": "", "supplementary_material": "", "author": "Jeremy Bernstein;Jiawei Zhao;Kamyar Azizzadenesheli;Anima Anandkumar", "authorids": "bernstein@caltech.edu;jiaweizhao.zjw@qq.com;kazizzad@uci.edu;anima@caltech.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbernstein2018signsgd,\ntitle={sign{SGD} with Majority Vote is Communication Efficient and Fault Tolerant},\nauthor={Jeremy Bernstein and Jiawei Zhao and Kamyar Azizzadenesheli and Anima Anandkumar},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxhijAcY7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=BJxhijAcY7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;5;4", "wc_review": "559;815;331", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "475;224;174", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 568.3333333333334, 197.70235765468811 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 291.0, 131.69915211065964 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 227, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15973736581723285506&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BJxhijAcY7", "pdf": "https://openreview.net/pdf?id=BJxhijAcY7", "email": ";;;", "author_num": 4 }, { "id": "BJxmXhRcK7", "title": "TENSOR RING NETS ADAPTED DEEP MULTI-TASK LEARNING", "track": "main", "status": "Reject", "tldr": "a deep multi-task learning model adapting tensor ring representation", "abstract": "Recent deep multi-task learning (MTL) has witnessed success in alleviating the data scarcity of some tasks by utilizing domain-specific knowledge from related tasks. Nonetheless, several major issues of deep MTL, including the effectiveness of sharing mechanisms, the efficiency of model complexity and the flexibility of network architectures, still remain largely unaddressed.
To this end, we propose a novel generalized latent-subspace based knowledge sharing mechanism for linking task-specific models, namely tensor ring multi-task learning (TRMTL). TRMTL has a highly compact representation, and it is very effective in transferring task-invariant knowledge while being super flexible in learning task-specific features, successfully mitigating the dilemma of both negative-transfer in lower layers and under-transfer in higher layers. Under our TRMTL, it is feasible for each task to have heterogenous input data dimensionality or distinct feature sizes at different hidden layers. Experiments on a variety of datasets demonstrate our model is capable of significantly improving each single task\u2019s performance, particularly favourable in scenarios where some of the tasks have insufficient data.", "keywords": "deep learning;deep multi-task learning;tensor factorization;tensor ring nets", "primary_area": "", "supplementary_material": "", "author": "Xinqi Chen;Ming Hou;Guoxu Zhou;Qibin Zhao", "authorids": "xinqicham@gmail.com;ming.hou@riken.jp;gx.zhou@gdut.edu.cn;qibin.zhao@riken.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2019tensor,\ntitle={{TENSOR} {RING} {NETS} {ADAPTED} {DEEP} {MULTI}-{TASK} {LEARNING}},\nauthor={Xinqi Chen and Ming Hou and Guoxu Zhou and Qibin Zhao},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxmXhRcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJxmXhRcK7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "wc_review": "699;290;144", "wc_reply_reviewers": "0;197;0", "wc_reply_authors": "1262;1162;171", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 377.6666666666667, 234.9047088123655 ], "wc_reply_reviewers_avg": [ 65.66666666666667, 92.86669059583322 ], "wc_reply_authors_avg": [ 865.0, 492.4273212024965 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GDPN1v9mUgQJ:scholar.google.com/&scioq=TENSOR+RING+NETS+ADAPTED+DEEP+MULTI-TASK+LEARNING&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Bounce and Learn: Modeling Scene Dynamics with Real-World Bounces", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1100", "id": "BJxssoA5KX", "author_site": "Senthil Purushwalkam, Abhinav Gupta, Danny Kaufman, Bryan Russell", "tldr": "", "abstract": "We introduce an approach to model surface properties governing bounces in everyday scenes. Our model learns end-to-end, starting from sensor inputs, to predict post-bounce trajectories and infer \ntwo underlying physical properties that govern bouncing - restitution and effective collision normals. Our model, Bounce and Learn, comprises two modules -- a Physics Inference Module (PIM) and a Visual Inference Module (VIM). VIM learns to infer physical parameters for locations in a scene given a single still image, while PIM learns to model physical interactions for the prediction task given physical parameters and observed pre-collision 3D trajectories. 
\nTo achieve our results, we introduce the Bounce Dataset comprising 5K RGB-D videos of bouncing trajectories of a foam ball to probe surfaces of varying shapes and materials in everyday scenes including homes and offices. \nOur proposed model learns from our collected dataset of real-world bounces and is bootstrapped with additional information from simple physics simulations. We show on our newly collected dataset that our model out-performs baselines, including trajectory fitting with Newtonian physics, in predicting post-bounce trajectories and inferring physical properties of a scene.", "keywords": "intuitive physics;visual prediction;surface normal;restitution;bounces", "primary_area": "", "supplementary_material": "", "author": "Senthil Purushwalkam;Abhinav Gupta;Danny Kaufman;Bryan Russell", "authorids": "spurushw@andrew.cmu.edu;abhinavg@cs.cmu.edu;dkaufman@adobe.com;brussell@adobe.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\npurushwalkam2018bounce,\ntitle={Bounce and Learn: Modeling Scene Dynamics with Real-World Bounces},\nauthor={Senthil Purushwalkam and Abhinav Gupta and Danny Kaufman and Bryan Russell},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxssoA5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;4", "wc_review": "643;220;324", "wc_reply_reviewers": "437;0;0", "wc_reply_authors": "1862;12;515", "reply_reviewers": "1;0;0", "reply_authors": "4;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 395.6666666666667, 179.97098531584348 ], "wc_reply_reviewers_avg": [ 145.66666666666666, 206.00377558568087 ], "wc_reply_authors_avg": [ 796.3333333333334, 781.0191347776884 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3030062067056637219&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BJxssoA5KX", "pdf": "https://openreview.net/pdf?id=BJxssoA5KX", "email": ";;;", "author_num": 4 }, { "title": "K for the Price of 1: Parameter-efficient Multi-task and Transfer Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/898", "id": "BJxvEh0cFQ", "author_site": "Pramod Kaushik Mudrakarta, Mark Sandler, Andrey Zhmoginov, Andrew Howard", "tldr": "A novel and practically effective method to adapt pretrained neural networks to new tasks by retraining a minimal (e.g., less than 2%) number of parameters", "abstract": "We introduce a novel method that enables parameter-efficient transfer and multi-task learning with deep neural networks. The basic approach is to learn a model patch - a small set of parameters - that will specialize to each task, instead of fine-tuning the last layer or the entire network. For instance, we show that learning a set of scales and biases is sufficient to convert a pretrained network to perform well on qualitatively different problems (e.g. 
converting a Single Shot MultiBox Detection (SSD) model into a 1000-class image classification model while reusing 98% of parameters of the SSD feature extractor). Similarly, we show that re-learning existing low-parameter layers (such as depth-wise convolutions) while keeping the rest of the network frozen also improves transfer-learning accuracy significantly. Our approach allows both simultaneous (multi-task) as well as sequential transfer learning. In several multi-task learning problems, despite using much fewer parameters than traditional logits-only fine-tuning, we match single-task performance. \n", "keywords": "deep learning;mobile;transfer learning;multi-task learning;computer vision;small models;imagenet;inception;batch normalization", "primary_area": "", "supplementary_material": "", "author": "Pramod Kaushik Mudrakarta;Mark Sandler;Andrey Zhmoginov;Andrew Howard", "authorids": "pramodkm@uchicago.edu;mark.sandler@gmail.com;azhmogin@google.com;howarda@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmudrakarta2018k,\ntitle={K For The Price Of 1: Parameter Efficient Multi-task And Transfer Learning},\nauthor={Pramod Kaushik Mudrakarta and Mark Sandler and Andrey Zhmoginov and Andrew Howard},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJxvEh0cFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;5;4", "wc_review": "186;107;315", "wc_reply_reviewers": "14;0;23", "wc_reply_authors": "412;272;279", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 202.66666666666666, 85.7295489055877 ], "wc_reply_reviewers_avg": [ 12.333333333333334, 9.463379711052259 ], "wc_reply_authors_avg": [ 321.0, 64.41014412859722 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6019481069112213609&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BJxvEh0cFQ", "pdf": "https://openreview.net/pdf?id=BJxvEh0cFQ", "email": ";;;", "author_num": 4 }, { "id": "BJxz5jRcFm", "title": "Tangent-Normal Adversarial Regularization for Semi-supervised Learning", "track": "main", "status": "Withdraw", "tldr": "We propose a novel manifold regularization strategy based on adversarial training, which can significantly improve the performance of semi-supervised learning.", "abstract": "The ever-increasing size of modern datasets combined with the difficulty of obtaining label information has made semi-supervised learning of significant practical importance in modern machine learning applications. In comparison to supervised learning, the key difficulty in semi-supervised learning is how to make full use of the unlabeled data. In order to utilize manifold information provided by unlabeled data, we propose a novel regularization called the tangent-normal adversarial regularization, which is composed by two parts. 
The two parts complement each other and jointly enforce the smoothness along two different directions that are crucial for semi-supervised learning. One is applied along the tangent space of the data manifold, aiming to enforce local invariance of the classifier on the manifold, while the other is performed on the normal space orthogonal to the tangent space, intending to impose robustness on the classifier against the noise that causes the observed data to deviate from the underlying data manifold. Both regularizers are achieved by the strategy of virtual adversarial training. Our method has achieved state-of-the-art performance on semi-supervised learning tasks on both artificial and practical datasets.", "keywords": "semi-supervised learning;manifold regularization;adversarial training", "primary_area": "", "supplementary_material": "", "author": "Bing Yu;Jingfeng Wu;Jinwen Ma;Zhanxing Zhu", "authorids": "byu@pku.edu.cn;pkuwjf@pku.edu.cn;jwma@math.pku.edu.cn;zhanxing.zhu@pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BJxz5jRcFm", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;3;4", "wc_review": "383;655;321", "wc_reply_reviewers": "0;37;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;1;0", "reply_authors": "0;0;0", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 453.0, 145.0609067483954 ], "wc_reply_reviewers_avg": [ 12.333333333333334, 17.441967269268172 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.3273268353539886, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9145773952325087137&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 11 }, { "id": "BJzVUj0qtQ", "title": "Evading Defenses to Transferable Adversarial Examples by Mitigating Attention Shift", "track": "main", "status": "Withdraw", "tldr": "We propose an attention-invariant attack method to generate more transferable adversarial examples for black-box attacks, which can fool state-of-the-art defenses with a high success rate.", "abstract": "Deep neural networks are vulnerable to adversarial examples, which can mislead classifiers by adding imperceptible perturbations. An intriguing property of adversarial examples is their good transferability, making black-box attacks feasible in real-world applications. Due to the threat of adversarial attacks, many methods have been proposed to improve the robustness, and several state-of-the-art defenses are shown to be robust against transferable adversarial examples. In this paper, we identify the attention shift phenomenon, which may hinder the transferability of adversarial examples to the defense models. It indicates that the defenses rely on different discriminative regions to make predictions compared with normally trained models. Therefore, we propose an attention-invariant attack method to generate more transferable adversarial examples. Extensive experiments on the ImageNet dataset validate the effectiveness of the proposed method.
Our best attack fools eight state-of-the-art defenses at an 82% success rate on average based only on the transferability, demonstrating the insecurity of the defense techniques. ", "keywords": "adversarial examples;black-box attack;transferability", "primary_area": "", "supplementary_material": "", "author": "Yinpeng Dong;Tianyu Pang;Hang Su;Jun Zhu", "authorids": "dyp17@mails.tsinghua.edu.cn;pty17@mails.tsinghua.edu.cn;suhangss@mail.tsinghua.edu.cn;dcszj@mail.tsinghua.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BJzVUj0qtQ", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;3;4", "wc_review": "328;176;266", "wc_reply_reviewers": "5;0;0", "wc_reply_authors": "154;220;334", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 256.6666666666667, 62.403703593794994 ], "wc_reply_reviewers_avg": [ 1.6666666666666667, 2.357022603955158 ], "wc_reply_authors_avg": [ 236.0, 74.35052118176442 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15407665504829263612&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Towards Metamerism via Foveated Style Transfer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/749", "id": "BJzbG20cFQ", "author_site": "Arturo Deza, Aditya Jonnalagadda, Miguel Eckstein", "tldr": "We introduce a novel feed-forward framework to generate visual metamers", "abstract": "The problem of visual metamerism is defined as finding a family of perceptually\nindistinguishable, yet physically different images. In this paper, we propose our\nNeuroFovea metamer model, a foveated generative model that is based on a mixture\nof peripheral representations and style transfer forward-pass algorithms. Our\ngradient-descent free model is parametrized by a foveated VGG19 encoder-decoder\nwhich allows us to encode images in high dimensional space and interpolate\nbetween the content and texture information with adaptive instance normalization\nanywhere in the visual field. Our contributions include: 1) A framework for\ncomputing metamers that resembles a noisy communication system via a foveated\nfeed-forward encoder-decoder network \u2013 We observe that metamerism arises as a\nbyproduct of noisy perturbations that partially lie in the perceptual null space; 2)\nA perceptual optimization scheme as a solution to the hyperparametric nature of\nour metamer model that requires tuning of the image-texture tradeoff coefficients\neverywhere in the visual field which are a consequence of internal noise; 3) An\nABX psychophysical evaluation of our metamers where we also find that the rate\nof growth of the receptive fields in our model match V1 for reference metamers\nand V2 between synthesized samples. 
Our model also renders metamers at roughly\na second, presenting a \u00d71000 speed-up compared to the previous work, which now\nallows for tractable data-driven metamer experiments.", "keywords": "Metamerism;foveation;perception;style transfer;psychophysics", "primary_area": "", "supplementary_material": "", "author": "Arturo Deza;Aditya Jonnalagadda;Miguel P. Eckstein", "authorids": "deza@dyns.ucsb.edu;aditya_jonnalagadda@ece.ucsb.edu;eckstein@psych.ucsb.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ndeza2018towards,\ntitle={Towards Metamerism via Foveated Style Transfer},\nauthor={Arturo Deza and Aditya Jonnalagadda and Miguel P. Eckstein},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BJzbG20cFQ},\n}", "github": "[![github](/images/github_icon.svg) ArturoDeza/NeuroFovea](https://github.com/ArturoDeza/NeuroFovea)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;4;4", "wc_review": "764;320;166", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1674;563;860", "reply_reviewers": "0;0;0", "reply_authors": "3;1;2", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 416.6666666666667, 253.52098313332743 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1032.3333333333333, 469.6483317925824 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17935865817929282522&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BJzbG20cFQ", "pdf": "https://openreview.net/pdf?id=BJzbG20cFQ", "email": ";;", "author_num": 3 }, { "id": "BJzmzn0ctX", "title": "Scalable Neural Theorem Proving on Knowledge Bases and Natural Language", "track": "main", "status": "Reject", "tldr": "We scale Neural Theorem Provers to large datasets, improve the rule learning process, and extend it to jointly reason over text and Knowledge Bases.", "abstract": "Reasoning over text and Knowledge Bases (KBs) is a major challenge for Artificial Intelligence, with applications in machine reading, dialogue, and question answering. Transducing text to logical forms which can be operated on is a brittle and error-prone process. Operating directly on text by jointly learning representations and transformations thereof by means of neural architectures that lack the ability to learn and exploit general rules can be very data-inefficient and not generalise correctly. These issues are addressed by Neural Theorem Provers (NTPs) (Rockt\u00e4schel & Riedel, 2017), neuro-symbolic systems based on a continuous relaxation of Prolog\u2019s backward chaining algorithm, where symbolic unification between atoms is replaced by a differentiable operator computing the similarity between their embedding representations. 
In this paper, we first propose Neighbourhood-approximated Neural Theorem Provers (NaNTPs) consisting of two extensions to NTPs, namely a) a method for drastically reducing the previously prohibitive time and space complexity during inference and learning, and b) an attention mechanism for improving the rule learning process, making them usable on real-world datasets. Then, we propose a novel approach for jointly reasoning over KB facts and textual mentions, by jointly embedding them in a shared embedding space. The proposed method is able to extract rules and provide explanations\u2014involving both textual patterns and KB relations\u2014from large KBs and text corpora. We show that NaNTPs perform on par with NTPs at a fraction of the cost, and can achieve competitive link prediction results on challenging large-scale datasets, including WN18, WN18RR, and FB15k-237 (with and without textual mentions) while being able to provide explanations for each prediction and extract interpretable rules.", "keywords": "Machine Reading;Natural Language Processing;Neural Theorem Proving;Representation Learning;First Order Logic", "primary_area": "", "supplementary_material": "", "author": "Pasquale Minervini;Matko Bosnjak;Tim Rockt\u00e4schel;Edward Grefenstette;Sebastian Riedel", "authorids": "p.minervini@gmail.com;matko.bosnjak@gmail.com;tim.rocktaeschel@gmail.com;etg@google.com;etg@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nminervini2019scalable,\ntitle={Scalable Neural Theorem Proving on Knowledge Bases and Natural Language},\nauthor={Pasquale Minervini and Matko Bosnjak and Tim Rockt\u00e4schel and Edward Grefenstette and Sebastian Riedel},\nyear={2019},\nurl={https://openreview.net/forum?id=BJzmzn0ctX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BJzmzn0ctX", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;3;3", "wc_review": "274;185;293", "wc_reply_reviewers": "734;0;138", "wc_reply_authors": "1938;209;1065", "reply_reviewers": "2;0;1", "reply_authors": "5;1;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 250.66666666666666, 47.07676945396978 ], "wc_reply_reviewers_avg": [ 290.6666666666667, 318.5062357666208 ], "wc_reply_authors_avg": [ 1070.6666666666667, 705.8726671448769 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 1.699673171197595 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18139025211397386208&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "BJzuKiC9KX", "title": "Revisiting Reweighted Wake-Sleep", "track": "main", "status": "Reject", "tldr": "Empirical analysis and explanation of particle-based gradient estimators for approximate inference with deep generative models.", "abstract": " Discrete latent-variable models, while applicable in a variety of settings, can often be difficult to learn. Sampling discrete latent variables can result in high-variance gradient estimators for two primary reasons: 1) branching on the samples within the model, and 2) the lack of a pathwise derivative for the samples.
While current state-of-the-art methods employ control-variate schemes for the former and continuous-relaxation methods for the latter, their utility is limited by the complexities of implementing and training effective control-variate schemes and the necessity of evaluating (potentially exponentially) many branch paths in the model. Here, we revisit the Reweighted Wake Sleep (RWS; Bornschein and Bengio, 2015) algorithm, and through extensive evaluations, show that it circumvents both these issues, outperforming current state-of-the-art methods in learning discrete latent-variable models. Moreover, we observe that, unlike the Importance-weighted Autoencoder, RWS learns better models and inference networks with increasing numbers of particles, and that its benefits extend to continuous latent-variable models as well. Our results suggest that RWS is a competitive, often preferable, alternative for learning deep generative models.", "keywords": "variational inference;approximate inference;generative models;gradient estimators", "primary_area": "", "supplementary_material": "", "author": "Tuan Anh Le;Adam R. Kosiorek;N. Siddharth;Yee Whye Teh;Frank Wood", "authorids": "tuananh@robots.ox.ac.uk;adamk@robots.ox.ac.uk;nsid@robots.ox.ac.uk;y.w.teh@stats.ox.ac.uk;fwood@cs.ubc.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nle2019revisiting,\ntitle={Revisiting Reweighted Wake-Sleep},\nauthor={Tuan Anh Le and Adam R. Kosiorek and N. Siddharth and Yee Whye Teh and Frank Wood},\nyear={2019},\nurl={https://openreview.net/forum?id=BJzuKiC9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BJzuKiC9KX", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;3", "wc_review": "191;323;131", "wc_reply_reviewers": "0;85;0", "wc_reply_authors": "247;776;79", "reply_reviewers": "0;1;0", "reply_authors": "1;3;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 215.0, 80.19975062305369 ], "wc_reply_reviewers_avg": [ 28.333333333333332, 40.069384267237695 ], "wc_reply_authors_avg": [ 367.3333333333333, 296.9986906069153 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14379225081252784913&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BkE8NjCqYm", "title": "(Unconstrained) Beam Search is Sensitive to Large Search Discrepancies", "track": "main", "status": "Reject", "tldr": "Analysis of the performance degradation in beam search and how constraining the search can help avoid it", "abstract": "Beam search is the most popular inference algorithm for decoding neural sequence models. Unlike greedy search, beam search allows for non-greedy local decisions that can potentially lead to a sequence with a higher overall probability. However, previous work found that the performance of beam search tends to degrade with large beam widths. In this work, we perform an empirical study of the behavior of the beam search algorithm across three sequence synthesis tasks.
We find that increasing the beam width leads to sequences that are disproportionately based on early and highly non-greedy decisions. These sequences typically include a very low probability token that is followed by a sequence of tokens with higher (conditional) probability leading to an overall higher probability sequence. However, as beam width increases, such sequences are more likely to have a lower evaluation score. Based on our empirical analysis we propose to constrain the beam search from taking highly non-greedy decisions early in the search. We evaluate two methods to constrain the search and show that constrained beam search effectively eliminates the problem of beam search degradation and in some cases even leads to higher evaluation scores. Our results generalize and improve upon previous observations on copies and training set predictions.", "keywords": "beam search;sequence models;search;sequence to sequence", "primary_area": "", "supplementary_material": "", "author": "Eldan Cohen;J. Christopher Beck", "authorids": "ecohen@mie.utoronto.ca;jcb@mie.utoronto.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ncohen2019unconstrained,\ntitle={(Unconstrained) Beam Search is Sensitive to Large Search Discrepancies},\nauthor={Eldan Cohen and J. Christopher Beck},\nyear={2019},\nurl={https://openreview.net/forum?id=BkE8NjCqYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkE8NjCqYm", "pdf_size": 0, "rating": "5;5;7", "confidence": "5;4;5", "wc_review": "295;866;295", "wc_reply_reviewers": "0;184;0", "wc_reply_authors": "1328;1011;306", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 485.3333333333333, 269.1719813716791 ], "wc_reply_reviewers_avg": [ 61.333333333333336, 86.73843182554982 ], "wc_reply_authors_avg": [ 881.6666666666666, 427.1348993259884 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hG8sLLVnligJ:scholar.google.com/&scioq=(Unconstrained)+Beam+Search+is+Sensitive+to+Large+Search+Discrepancies&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Post Selection Inference with Incomplete Maximum Mean Discrepancy Estimator", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/717", "id": "BkG5SjR5YQ", "author_site": "Makoto Yamada, Yi Wu, Yao-Hung Hubert Tsai, Hirofumi Ohta, Ruslan Salakhutdinov, Ichiro Takeuchi, Kenji Fukumizu", "tldr": "", "abstract": "Measuring divergence between two distributions is essential in machine learning and statistics and has various applications including binary classification, change point detection, and two-sample test. Furthermore, in the era of big data, designing divergence measure that is interpretable and can handle high-dimensional and complex data becomes extremely important. In this paper, we propose a post selection inference (PSI) framework for divergence measure, which can select a set of statistically significant features that discriminate two distributions. 
Specifically, we employ an additive variant of maximum mean discrepancy (MMD) for features and introduce a general hypothesis test for PSI. A novel MMD estimator using the incomplete U-statistics, which has an asymptotically normal distribution (under mild assumptions) and gives high detection power in PSI, is also proposed and analyzed theoretically. Through synthetic and real-world feature selection experiments, we show that the proposed framework can successfully detect statistically significant features. Last, we propose a sample selection framework for analyzing different members in the Generative Adversarial Networks (GANs) family. ", "keywords": "Maximum Mean Discrepancy;Selective Inference;Feature Selection;GAN", "primary_area": "", "supplementary_material": "", "author": "Makoto Yamada;Denny Wu;Yao-Hung Hubert Tsai;Hirofumi Ohta;Ruslan Salakhutdinov;Ichiro Takeuchi;Kenji Fukumizu", "authorids": "makoto.yamada@riken.jp;yiwu1@andrew.cmu.edu;yaohungt@cs.cmu.edu;hirofumi-ohta@g.ecc.u-tokyo.ac.jp;rsalakhu@cs.cmu.edu;takeuchi.ichiro@nitech.ac.jp;fukumizu@ism.ac.jp", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nyamada2018post,\ntitle={Post Selection Inference with Incomplete Maximum Mean Discrepancy Estimator},\nauthor={Makoto Yamada and Denny Wu and Yao-Hung Hubert Tsai and Hirofumi Ohta and Ruslan Salakhutdinov and Ichiro Takeuchi and Kenji Fukumizu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkG5SjR5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;4;4", "wc_review": "118;198;458", "wc_reply_reviewers": "0;0;44", "wc_reply_authors": "307;514;131", "reply_reviewers": "0;0;1", "reply_authors": "1;1;3", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 258.0, 145.1436070471816 ], "wc_reply_reviewers_avg": [ 14.666666666666666, 20.741798914805393 ], "wc_reply_authors_avg": [ 317.3333333333333, 156.52972738606414 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3227686816764844914&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BkG5SjR5YQ", "pdf": "https://openreview.net/pdf?id=BkG5SjR5YQ", "email": ";;;;;;", "author_num": 7 }, { "title": "Emergent Coordination Through Competition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/928", "id": "BkG8sjR5Km", "author_site": "SIQI LIU, Guy Lever, Josh Merel, Saran Tunyasuvunakool, Nicolas Heess, Thore Graepel", "tldr": "We introduce a new MuJoCo soccer environment for continuous multi-agent reinforcement learning research, and show that population-based training of independent reinforcement learners can learn cooperative behaviors", "abstract": "We study the emergence of cooperative behaviors in reinforcement learning agents by introducing a challenging competitive multi-agent soccer environment with continuous simulated physics. 
We demonstrate that decentralized, population-based training with co-play can lead to a progression in agents' behaviors: from random, to simple ball chasing, and finally showing evidence of cooperation. Our study highlights several of the challenges encountered in large-scale multi-agent training in continuous control. In particular, we demonstrate that the automatic optimization of simple shaping rewards, not themselves conducive to co-operative behavior, can lead to long-horizon team behavior. We further apply an evaluation scheme, grounded in game-theoretic principles, that can assess agent performance in the absence of pre-defined evaluation tasks or human baselines.", "keywords": "Multi-agent learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Siqi Liu;Guy Lever;Josh Merel;Saran Tunyasuvunakool;Nicolas Heess;Thore Graepel", "authorids": "liusiqi@google.com;guylever@google.com;jsmerel@google.com;stunya@google.com;heess@google.com;thore@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nliu2018emergent,\ntitle={Emergent Coordination Through Competition},\nauthor={Siqi Liu and Guy Lever and Nicholas Heess and Josh Merel and Saran Tunyasuvunakool and Thore Graepel},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkG8sjR5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;3", "wc_review": "539;279;134", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "749;362;8", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 317.3333333333333, 167.54767149149586 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 373.0, 302.61196275097916 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 186, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9035094704575368173&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BkG8sjR5Km", "pdf": "https://openreview.net/pdf?id=BkG8sjR5Km", "email": ";;;;;", "author_num": 6 }, { "id": "BkGiPoC5FX", "title": "Efficient Convolutional Neural Network Training with Direct Feedback Alignment", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many algorithms have been proposed to substitute back-propagation (BP) in deep neural network (DNN) training. However, they have not become popular because their training accuracy and computational efficiency were worse than those of BP. One of them is direct feedback alignment (DFA), but it shows low training performance, especially for convolutional neural networks (CNNs). In this paper, we overcome the limitation of the DFA algorithm by combining it with conventional BP during CNN training. To improve training stability, we also suggest a feedback weight initialization method based on analyzing the patterns of the fixed random matrices in the DFA.
Finally, we propose a new training algorithm, binary direct feedback alignment (BDFA), to minimize the computational cost while maintaining training accuracy comparable to DFA. In our experiments, we use the CIFAR-10 and CIFAR-100 datasets to simulate CNN learning from scratch, and apply BDFA to an online-learning-based object tracking application to examine training in a small-dataset environment. Our proposed algorithms show better performance than conventional BP in both training tasks, especially when the dataset is small.", "keywords": "Direct Feedback Alignment;Convolutional Neural Network;DNN Training", "primary_area": "", "supplementary_material": "", "author": "Donghyeon Han;Hoi-jun Yoo", "authorids": "hdh4797@kaist.ac.kr;hjyoo@kaist.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhan2019efficient,\ntitle={Efficient Convolutional Neural Network Training with Direct Feedback Alignment},\nauthor={Donghyeon Han and Hoi-jun Yoo},\nyear={2019},\nurl={https://openreview.net/forum?id=BkGiPoC5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkGiPoC5FX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "wc_review": "717;124;281", "wc_reply_reviewers": "70;0;0", "wc_reply_authors": "416;293;233", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 374.0, 250.86384089116285 ], "wc_reply_reviewers_avg": [ 23.333333333333332, 32.99831645537222 ], "wc_reply_authors_avg": [ 314.0, 76.17086057016817 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10195885819074477427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BkMWx309FX", "title": "Reinforcement Learning with Perturbed Rewards", "track": "main", "status": "Reject", "tldr": "A new approach for learning with noisy rewards in reinforcement learning", "abstract": "Recent studies have shown the vulnerability of reinforcement learning (RL) models in noisy settings. The sources of noise differ across scenarios. For instance, in practice, the observed reward channel is often subject to noise (e.g., when observed rewards are collected through sensors), and thus observed rewards may not be credible as a result. Also, in applications such as robotics, a deep reinforcement learning (DRL) algorithm can be manipulated to produce arbitrary errors. In this paper, we consider noisy RL problems where observed rewards by RL agents are generated with a reward confusion matrix. We call such observed rewards perturbed rewards. We develop a robust RL framework, aided by an unbiased reward estimator, that enables RL agents to learn in noisy environments while observing only perturbed rewards. Our framework draws upon approaches for supervised learning with noisy data. The core ideas of our solution include estimating a reward confusion matrix and defining a set of unbiased surrogate rewards. We prove the convergence and sample complexity of our approach.
Extensive experiments on different DRL platforms show that policies based on our estimated surrogate reward can achieve higher expected rewards, and converge faster than existing baselines. For instance, the state-of-the-art PPO algorithm is able to obtain 67.5% and 46.7% improvements in average on five Atari games, when the error rates are 10% and 30% respectively. ", "keywords": "robust reinforcement learning;noisy reward;sample complexity", "primary_area": "", "supplementary_material": "", "author": "Jingkang Wang;Yang Liu;Bo Li", "authorids": "wangjksjtu_01@sjtu.edu.cn;yangliu@ucsc.edu;lxbosky@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2019reinforcement,\ntitle={Reinforcement Learning with Perturbed Rewards},\nauthor={Jingkang Wang and Yang Liu and Bo Li},\nyear={2019},\nurl={https://openreview.net/forum?id=BkMWx309FX},\n}", "github": "[![github](/images/github_icon.svg) wangjksjtu/rl-perturbed-reward](https://github.com/wangjksjtu/rl-perturbed-reward)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=BkMWx309FX", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;4", "wc_review": "258;890;1498", "wc_reply_reviewers": "75;0;75", "wc_reply_authors": "838;644;692", "reply_reviewers": "1;0;1", "reply_authors": "2;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 882.0, 506.25948550784375 ], "wc_reply_reviewers_avg": [ 50.0, 35.35533905932738 ], "wc_reply_authors_avg": [ 724.6666666666666, 82.49983164965988 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 167, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11846250026305238801&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "BkMXkhA5Fm", "title": "Learning State Representations in Complex Systems with Multimodal Data", "track": "main", "status": "Reject", "tldr": "Multimodal synthetic dataset, collected from X-plane flight simulator, used for learning state representation and unified evaluation framework for representation learning", "abstract": "Representation learning becomes especially important for complex systems with multimodal data sources such as cameras or sensors. Recent advances in reinforcement learning and optimal control make it possible to design control algorithms on these latent representations, but the field still lacks a large-scale standard dataset for unified comparison. In this work, we present a large-scale dataset and evaluation framework for representation learning for the complex task of landing an airplane. We implement and compare several approaches to representation learning on this dataset in terms of the quality of simple supervised learning tasks and disentanglement scores. 
The resulting representations can be used for further tasks such as anomaly detection, optimal control, model-based reinforcement learning, and other applications.", "keywords": "deep learning;representation learning;state representation;disentangled representation;dataset;autonomous system;temporal multimodal data", "primary_area": "", "supplementary_material": "", "author": "Pavel Solovev;Vladimir Aliev;Pavel Ostyakov;Gleb Sterkin;Elizaveta Logacheva;Stepan Troeshestov;Roman Suvorov;Anton Mashikhin;Oleg Khomenko;Sergey I. Nikolenko", "authorids": "pavel.solovev.ilich@gmail.com;vldr.aliev@gmail.com;pavelosta@gmail.com;sterkin.gleb@gmail.com;elimohl@gmail.com;troeshust96@gmail.com;windj007@gmail.com;antonagoo@gmail.com;olegkhomenkoru@gmail.com;snikolenko@gmail.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@misc{\nsolovev2019learning,\ntitle={Learning State Representations in Complex Systems with Multimodal Data},\nauthor={Pavel Solovev and Vladimir Aliev and Pavel Ostyakov and Gleb Sterkin and Elizaveta Logacheva and Stepan Troeshestov and Roman Suvorov and Anton Mashikhin and Oleg Khomenko and Sergey I. Nikolenko},\nyear={2019},\nurl={https://openreview.net/forum?id=BkMXkhA5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkMXkhA5Fm", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;3", "wc_review": "291;227;465", "wc_reply_reviewers": "0;0;65", "wc_reply_authors": "440;467;801", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 327.6666666666667, 100.5628603853773 ], "wc_reply_reviewers_avg": [ 21.666666666666668, 30.641293851417057 ], "wc_reply_authors_avg": [ 569.3333333333334, 164.18350167487054 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 10, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10321560100034335885&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Prior Convictions: Black-box Adversarial Attacks with Bandits and Priors", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/851", "id": "BkMiWhR5K7", "author_site": "Andrew Ilyas, Logan Engstrom, Aleksander Madry", "tldr": "We present a unifying view on black-box adversarial attacks as a gradient estimation problem, and then present a framework (based on bandits optimization) to integrate priors into gradient estimation, leading to significantly increased performance.", "abstract": "We study the problem of generating adversarial examples in a black-box setting in which only loss-oracle access to a model is available. We introduce a framework that conceptually unifies much of the existing work on black-box attacks, and demonstrate that the current state-of-the-art methods are optimal in a natural sense. Despite this optimality, we show how to improve black-box attacks by bringing a new element into the problem: gradient priors. 
We give a bandit optimization-based algorithm that allows us to seamlessly integrate any such priors, and we explicitly identify and incorporate two examples. The resulting methods use two to four times fewer queries and fail two to five times less than the current state-of-the-art. The code for reproducing our work is available at https://git.io/fAjOJ.", "keywords": "adversarial examples;gradient estimation;black-box attacks;model-based optimization;bandit optimization", "primary_area": "", "supplementary_material": "", "author": "Andrew Ilyas;Logan Engstrom;Aleksander Madry", "authorids": "ailyas@mit.edu;engstrom@mit.edu;madry@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nilyas2018prior,\ntitle={Prior Convictions: Black-box Adversarial Attacks with Bandits and Priors},\nauthor={Andrew Ilyas and Logan Engstrom and Aleksander Madry},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkMiWhR5K7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=BkMiWhR5K7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;5;2", "wc_review": "232;621;300", "wc_reply_reviewers": "0;156;0", "wc_reply_authors": "360;692;206", "reply_reviewers": "0;2;0", "reply_authors": "2;5;2", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 384.3333333333333, 169.6355570693309 ], "wc_reply_reviewers_avg": [ 52.0, 73.53910524340094 ], "wc_reply_authors_avg": [ 419.3333333333333, 202.79601135678735 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 471, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4612085557896805496&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BkMiWhR5K7", "pdf": "https://openreview.net/pdf?id=BkMiWhR5K7", "email": ";;", "author_num": 3 }, { "id": "BkMn9jAcYQ", "title": "Countering Language Drift via Grounding", "track": "main", "status": "Reject", "tldr": "Grounding helps avoid language drift during fine-tuning natural language agents with policy gradients.", "abstract": "While reinforcement learning (RL) shows a lot of promise for natural language processing\u2014e.g. when fine-tuning natural language systems for optimizing a certain objective\u2014there has been little investigation into potential language drift: when an external reward is used to train a system, the agents\u2019 communication protocol may easily and radically diverge from natural language. By re-casting translation as a communication game, we show that language drift indeed happens when pre-trained agents are fine-tuned with policy gradient methods. We contend that simply adding a \"naturalness\" constraint to the reward, e.g. by using language model log likelihood, does not fully address the issue, and argue that (perceptual) grounding is required. That is, while language model constraints impose syntactic conformity, they do not lead to semantic correspondence. 
Our experiments show that grounded models give the best communication performance, while retaining English syntax along with the ability to convey the intended semantics.", "keywords": "grounding;policy gradient;language drift;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Jason Lee;Kyunghyun Cho;Douwe Kiela", "authorids": "jason@cs.nyu.edu;kyunghyun.cho@nyu.edu;dkiela@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlee2019countering,\ntitle={Countering Language Drift via Grounding},\nauthor={Jason Lee and Kyunghyun Cho and Douwe Kiela},\nyear={2019},\nurl={https://openreview.net/forum?id=BkMn9jAcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkMn9jAcYQ", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;3", "wc_review": "586;130;144", "wc_reply_reviewers": "226;0;0", "wc_reply_authors": "2490;238;116", "reply_reviewers": "1;0;0", "reply_authors": "7;1;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 286.6666666666667, 211.73778332855215 ], "wc_reply_reviewers_avg": [ 75.33333333333333, 106.53742169877317 ], "wc_reply_authors_avg": [ 948.0, 1091.4956100079683 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 2.6246692913372702 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8367685764692589861&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BkMq0oRqFQ", "title": "Normalization Gradients are Least-squares Residuals", "track": "main", "status": "Reject", "tldr": "Gaussian normalization performs a least-squares fit during back-propagation, which zero-centers and decorrelates partial derivatives from normalized activations.", "abstract": "Batch Normalization (BN) and its variants have seen widespread adoption in the deep learning community because they improve the training of deep neural networks. Discussions of why this normalization works so well remain unsettled. We make explicit the relationship between ordinary least squares and partial derivatives computed when back-propagating through BN. We recast the back-propagation of BN as a least squares fit, which zero-centers and decorrelates partial derivatives from normalized activations. This view, which we term {\\em gradient-least-squares}, is an extensible and arithmetically accurate description of BN. 
To further explore this perspective, we motivate, interpret, and evaluate two adjustments to BN.", "keywords": "Deep Learning;Normalization;Least squares;Gradient regression", "primary_area": "", "supplementary_material": "", "author": "Yi Liu", "authorids": "liu.yi.pei@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nliu2019normalization,\ntitle={Normalization Gradients are Least-squares Residuals},\nauthor={Yi Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=BkMq0oRqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkMq0oRqFQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;5", "wc_review": "330;666;164", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "141;145;68", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 386.6666666666667, 208.82102916665798 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 118.0, 35.393031329156685 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UBlbzx696RsJ:scholar.google.com/&scioq=Normalization+Gradients+are+Least-squares+Residuals&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BkMqUiA5KX", "title": "Improving latent variable descriptiveness by modelling rather than ad-hoc factors", "track": "main", "status": "Withdraw", "tldr": "This paper introduces a novel generative modelling framework that avoids latent-variable collapse and clarifies the use of certain ad-hoc factors in training Variational Autoencoders.", "abstract": "Powerful generative models, particularly in Natural Language Modelling, are commonly trained by maximizing a variational lower bound on the data log likelihood. These models often suffer from poor use of their latent variable, with ad-hoc annealing factors used to encourage retention of information in the latent variable. We discuss an alternative and general approach to latent variable modelling, based on an objective that encourages a perfect reconstruction by tying a stochastic autoencoder with a variational autoencoder (VAE). This ensures by design that the latent variable captures information about the observations, whilst retaining the ability to generate well. 
Interestingly, although our model is fundamentally different to a VAE, the lower bound attained is identical to the standard VAE bound but with the addition of a simple pre-factor; thus, providing a formal interpretation of the commonly used, ad-hoc pre-factors in training VAEs.", "keywords": "generative modelling;latent variable modelling;variational autoencoders;variational inference;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Alex Mansbridge;Roberto Fierimonte;Ilya Feige;David Barber", "authorids": "amansbridge@turing.ac.uk;roberto.fierimonte@gmail.com;ilya@asidatascience.com;david.barber@ucl.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BkMqUiA5KX", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;3", "wc_review": "169;180;625", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "248;187;362", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 324.6666666666667, 212.41521184280145 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 265.6666666666667, 72.52738946234189 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3293458794692064575&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "title": "Sample Efficient Imitation Learning for Continuous Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1050", "id": "BkN5UoAqF7", "author_site": "Fumihiro Sasaki", "tldr": "In this paper, we proposed a model-free, off-policy IL algorithm for continuous control. Experimental results showed that our algorithm achieves competitive results with GAIL while significantly reducing the environment interactions.", "abstract": "The goal of imitation learning (IL) is to enable a learner to imitate expert behavior given expert demonstrations. Recently, generative adversarial imitation learning (GAIL) has shown significant progress on IL for complex continuous tasks. However, GAIL and its extensions require a large number of environment interactions during training. In real-world environments, the more an IL method requires the learner to interact with the environment for better imitation, the more training time it requires, and the more damage it causes to the environments and the learner itself. We believe that IL algorithms could be more applicable to real-world problems if the number of interactions could be reduced. \nIn this paper, we propose a model-free IL algorithm for continuous control. Our algorithm is made up mainly three changes to the existing adversarial imitation learning (AIL) methods \u2013 (a) adopting off-policy actor-critic (Off-PAC) algorithm to optimize the learner policy, (b) estimating the state-action value using off-policy samples without learning reward functions, and (c) representing the stochastic policy function so that its outputs are bounded. 
Experimental results show that our algorithm achieves competitive results with GAIL while significantly reducing the environment interactions.", "keywords": "Imitation Learning;Continuous Control;Reinforcement Learning;Inverse Reinforcement Learning;Conditional Generative Adversarial Network", "primary_area": "", "supplementary_material": "", "author": "Fumihiro Sasaki;Tetsuya Yohira;Atsuo Kawaguchi", "authorids": "fumihiro.fs.sasaki@jp.ricoh.com;fumihiro.fs.sasaki@jp.ricoh.com;fumihiro.fs.sasaki@jp.ricoh.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsasaki2018sample,\ntitle={Sample Efficient Imitation Learning for Continuous Control},\nauthor={Fumihiro Sasaki and Tetsuya Yohira and Atsuo Kawaguchi},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkN5UoAqF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2;AnonReviewer4", "pdf_size": 0, "rating": "5;5;5;7", "confidence": "5;5;4;5", "wc_review": "146;320;288;150", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 5.5, 0.8660254037844386 ], "confidence_avg": [ 4.75, 0.4330127018922193 ], "wc_review_avg": [ 226.0, 78.82892870006543 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.3333333333333333, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18386892554747535428&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "openreview": "https://openreview.net/forum?id=BkN5UoAqF7", "pdf": "https://openreview.net/pdf?id=BkN5UoAqF7", "email": ";;", "author_num": 3 }, { "id": "BkNUFjR5KQ", "title": "Learning Internal Dense But External Sparse Structures of Deep Neural Network", "track": "main", "status": "Reject", "tldr": "In this paper, we explore an internal dense yet external sparse network structure of deep neural networks and analyze its key properties.", "abstract": "Recent years have witnessed two seemingly opposite developments of deep convolutional neural networks (CNNs). On one hand, increasing the density of CNNs by adding cross-layer connections achieves higher accuracy. On the other hand, creating sparsity structures through regularization and pruning methods enjoys lower computational costs. In this paper, we bridge these two by proposing a new network structure with locally dense yet externally sparse connections. This new structure uses dense modules as basic building blocks, and then sparsely connects these modules via a novel algorithm during the training process.
Experimental results demonstrate that the locally dense yet externally sparse structure could acquire competitive performance on benchmark tasks (CIFAR10, CIFAR100, and ImageNet) while keeping the network structure slim.", "keywords": "Convolutional Neural Network;Hierarchical Neural Architecture;Structural Sparsity;Evolving Algorithm", "primary_area": "", "supplementary_material": "", "author": "Yiqun Duan", "authorids": "duanyiquncc@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nduan2019learning,\ntitle={Learning Internal Dense But External Sparse Structures of Deep Neural Network},\nauthor={Yiqun Duan},\nyear={2019},\nurl={https://openreview.net/forum?id=BkNUFjR5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkNUFjR5KQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;2", "wc_review": "204;299;89", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "566;1085;0", "reply_reviewers": "0;0;0", "reply_authors": "1;2;0", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 197.33333333333334, 85.86164581594171 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 550.3333333333334, 443.0879019888592 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qjXJHq_pKO4J:scholar.google.com/&scioq=Learning+Internal+Dense+But+External+Sparse+Structures+of+Deep+Neural+Network&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BkVVOi0cFX", "title": "Denoise while Aggregating: Collaborative Learning in Open-Domain Question Answering", "track": "main", "status": "Reject", "tldr": "We propose denoising strategies to leverage information from supervised RC datasets to handle the noise issue in the open-domain QA task.", "abstract": "The open-domain question answering (OpenQA) task aims to extract answers that match specific questions from a distantly supervised corpus. Unlike supervised reading comprehension (RC) datasets where questions are designed for particular paragraphs, background sentences in OpenQA datasets are more prone to noise. We observe that most existing OpenQA approaches are vulnerable to noise since they simply regard those sentences that contain the answer span as ground truths and ignore the plausible correlation between the sentences and the question. To address this deficiency, we introduce a unified and collaborative model that leverages alignment information from query-sentence pairs in a small-scale supervised RC dataset and aggregates relevant evidence from distantly supervised corpus to answer open-domain questions. 
We evaluate our model on several real-world OpenQA datasets, and experimental results show that our collaborative learning methods outperform the existing baselines significantly.", "keywords": "natural language processing;open-domain question answering;semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Haozhe Ji;Yankai Lin;Zhiyuan Liu;Maosong Sun", "authorids": "jihaozhe@gmail.com;mrlyk423@gmail.com;liuzy@tsinghua.edu.cn;sms@tsinghua.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nji2019denoise,\ntitle={Denoise while Aggregating: Collaborative Learning in Open-Domain Question Answering},\nauthor={Haozhe Ji and Yankai Lin and Zhiyuan Liu and Maosong Sun},\nyear={2019},\nurl={https://openreview.net/forum?id=BkVVOi0cFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkVVOi0cFX", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "wc_review": "278;497;302", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 359.0, 98.07140255956371 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Qimxgn0seGUJ:scholar.google.com/&scioq=Denoise+while+Aggregating:+Collaborative+Learning+in+Open-Domain+Question+Answering&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "BkVvwj0qFm", "title": "Geometric Operator Convolutional Neural Network", "track": "main", "status": "Withdraw", "tldr": "Traditional image processing algorithms are combined with Convolutional Neural Networks\uff0ca new neural network.", "abstract": "The Convolutional Neural Network (CNN) has been successfully applied in many fields during recent decades; however it lacks the ability to utilize prior domain knowledge when dealing with many realistic problems. We present a framework called Geometric Operator Convolutional Neural Network (GO-CNN) that uses domain knowledge, wherein the kernel of the first convolutional layer is replaced with a kernel generated by a geometric operator function. This framework integrates many conventional geometric operators, which allows it to adapt to a diverse range of problems. Under certain conditions, we theoretically analyze the convergence and the bound of the generalization errors between GO-CNNs and common CNNs. Although the geometric operator convolution kernels have fewer trainable parameters than common convolution kernels, the experimental results indicate that GO-CNN performs more accurately than common CNN on CIFAR-10/100. 
Furthermore, GO-CNN reduces dependence on the amount of training examples and enhances adversarial stability.", "keywords": "Convolutional Neural Network;Geometric Operator;Image Classification;Theoretical Analysis", "primary_area": "", "supplementary_material": "", "author": "Yangling Ma;Yixin Luo;Zhouwang Yang", "authorids": "yangma@mail.ustc.edu.cn;seeing@mail.ustc.edu.cn;yangzw@ustc.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkVvwj0qFm", "pdf_size": 0, "rating": "2;3;5", "confidence": "5;4;5", "wc_review": "347;492;886", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 575.0, 227.7381537350882 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.18898223650461363, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11280613154224104955&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "Bke0rjR5F7", "title": "Stochastic Learning of Additive Second-Order Penalties with Applications to Fairness", "track": "main", "status": "Reject", "tldr": "We propose a method to stochastically optimize second-order penalties and show how this may apply to training fairness-aware classifiers.", "abstract": "Many notions of fairness may be expressed as linear constraints, and the resulting constrained objective is often optimized by transforming the problem into its Lagrangian dual with additive linear penalties. In non-convex settings, the resulting problem may be difficult to solve as the Lagrangian is not guaranteed to have a deterministic saddle-point equilibrium. In this paper, we propose to modify the linear penalties to second-order ones, and we argue that this results in a more practical training procedure in non-convex, large-data settings. For one, the use of second-order penalties allows training the penalized objective with a fixed value of the penalty coefficient, thus avoiding the instability and potential lack of convergence associated with two-player min-max games. Secondly, we derive a method for efficiently computing the gradients associated with the second-order penalties in stochastic mini-batch settings. 
Our resulting algorithm performs well empirically, learning an appropriately fair classifier on a number of standard benchmarks.", "keywords": "fairness", "primary_area": "", "supplementary_material": "", "author": "Heinrich Jiang;Yifan Wu;Ofir Nachum", "authorids": "heinrichj@google.com;yw4@andrew.cmu.edu;ofirnachum@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\njiang2019stochastic,\ntitle={Stochastic Learning of Additive Second-Order Penalties with Applications to Fairness},\nauthor={Heinrich Jiang and Yifan Wu and Ofir Nachum},\nyear={2019},\nurl={https://openreview.net/forum?id=Bke0rjR5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Bke0rjR5F7", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;3", "wc_review": "436;352;230", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "458;273;410", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 339.3333333333333, 84.57475325940295 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 380.3333333333333, 78.38508928503062 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NI-MdollS3gJ:scholar.google.com/&scioq=Stochastic+Learning+of+Additive+Second-Order+Penalties+with+Applications+to+Fairness&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Generative Code Modeling with Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1000", "id": "Bke4KsA5FX", "author_site": "Marc Brockschmidt, Miltiadis Allamanis, Alexander Gaunt, Oleksandr Polozov", "tldr": "Representing programs as graphs including semantics helps when generating programs", "abstract": "Generative models for source code are an interesting structured prediction problem, requiring to reason about both hard syntactic and semantic constraints as well as about natural, likely programs. We present a novel model for this problem that uses a graph to represent the intermediate state of the generated output. Our model generates code by interleaving grammar-driven expansion steps with graph augmentation and neural message passing steps. An experimental evaluation shows that our new model can generate semantically meaningful expressions, outperforming a range of strong baselines.", "keywords": "Generative Model;Source Code;Graph Learning", "primary_area": "", "supplementary_material": "", "author": "Marc Brockschmidt;Miltiadis Allamanis;Alexander L. Gaunt;Oleksandr Polozov", "authorids": "mabrocks@microsoft.com;miallama@microsoft.com;algaunt@microsoft.com;polozov@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbrockschmidt2018generative,\ntitle={Generative Code Modeling with Graphs},\nauthor={Marc Brockschmidt and Miltiadis Allamanis and Alexander L.
Gaunt and Oleksandr Polozov},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bke4KsA5FX},\n}", "github": "[![github](/images/github_icon.svg) Microsoft/graph-based-code-modelling](https://github.com/Microsoft/graph-based-code-modelling)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "5;4;4", "wc_review": "482;107;360", "wc_reply_reviewers": "221;0;9", "wc_reply_authors": "1826;108;632", "reply_reviewers": "3;0;1", "reply_authors": "5;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 316.3333333333333, 156.17582683913952 ], "wc_reply_reviewers_avg": [ 76.66666666666667, 102.12519549824887 ], "wc_reply_authors_avg": [ 855.3333333333334, 718.9294510280932 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 227, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2376600485661149991&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Bke4KsA5FX", "pdf": "https://openreview.net/pdf?id=Bke4KsA5FX", "email": ";;;", "author_num": 4 }, { "id": "Bke96sC5tm", "title": "SOLAR: Deep Structured Representations for Model-Based Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Model-based reinforcement learning (RL) methods can be broadly categorized as global model methods, which depend on learning models that provide sensible predictions in a wide range of states, or local model methods, which iteratively refit simple models that are used for policy improvement. While predicting future states that will result from the current actions is difficult, local model methods only attempt to understand system dynamics in the neighborhood of the current policy, making it possible to produce local improvements without ever learning to predict accurately far into the future. The main idea in this paper is that we can learn representations that make it easy to retrospectively infer simple dynamics given the data from the current policy, thus enabling local models to be used for policy learning in complex systems. 
We evaluate our approach against other model-based and model-free RL methods on a suite of robotics tasks, including manipulation tasks on a real Sawyer robotic arm directly from camera images.", "keywords": "model-based reinforcement learning;structured representation learning;robotics", "primary_area": "", "supplementary_material": "", "author": "Marvin Zhang*;Sharad Vikram*;Laura Smith;Pieter Abbeel;Matthew Johnson;Sergey Levine", "authorids": "marvin@cs.berkeley.edu;svikram@cs.ucsd.edu;smithlaura@berkeley.edu;pabbeel@cs.berkeley.edu;mattjj@google.com;svlevine@cs.berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nzhang*2019solar,\ntitle={{SOLAR}: Deep Structured Representations for Model-Based Reinforcement Learning},\nauthor={Marvin Zhang* and Sharad Vikram* and Laura Smith and Pieter Abbeel and Matthew Johnson and Sergey Levine},\nyear={2019},\nurl={https://openreview.net/forum?id=Bke96sC5tm},\n}", "github": "[![github](/images/github_icon.svg) sharadmv/parasol](https://github.com/sharadmv/parasol)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Bke96sC5tm", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;4", "wc_review": "973;450;906", "wc_reply_reviewers": "515;0;463", "wc_reply_authors": "1087;495;1559", "reply_reviewers": "1;0;1", "reply_authors": "3;2;3", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 776.3333333333334, 232.36800315208825 ], "wc_reply_reviewers_avg": [ 326.0, 231.49226048977678 ], "wc_reply_authors_avg": [ 1047.0, 435.29606782816984 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 320, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3160286257401504607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "BkeAf2CqY7", "title": "Efficient Federated Learning via Variational Dropout", "track": "main", "status": "Withdraw", "tldr": "a joint model and gradient sparsification method for federated learning", "abstract": "As an emerging field, federated learning has recently attracted considerable attention.\nCompared to distributed learning in the datacenter setting, federated learning\nhas more strict constraints on compute efficiency of the learned model and communication\ncost during the training process. In this work, we propose an efficient\nfederated learning framework based on variational dropout. Our approach is able\nto jointly learn a sparse model while reducing the amount of gradients exchanged\nduring the iterative training process.
We demonstrate the superior performance\nof our approach on achieving significant model compression and communication\nreduction ratios with no accuracy loss.", "keywords": "federated learning;communication efficient;variational dropout;sparse model", "primary_area": "", "supplementary_material": "", "author": "Wei Du;Xiao Zeng;Ming Yan;Mi Zhang", "authorids": "duwei1@msu.edu;zengxia6@msu.edu;myan@msu.edu;mizhang@msu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BkeAf2CqY7", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;4", "wc_review": "649;353;160", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 387.3333333333333, 201.10417422044947 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11525699970752120988&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BkeDEoCctQ", "title": "Deep Curiosity Search: Intra-Life Exploration Can Improve Performance on Challenging Deep Reinforcement Learning Problems", "track": "main", "status": "Reject", "tldr": "", "abstract": "Traditional exploration methods in reinforcement learning (RL) require agents to perform random actions to find rewards. But these approaches struggle on sparse-reward domains like Montezuma\u2019s Revenge where the probability that any random action sequence leads to reward is extremely low. Recent algorithms have performed well on such tasks by encouraging agents to visit new states or perform new actions in relation to all prior training episodes (which we call across-training novelty). But such algorithms do not consider whether an agent exhibits intra-life novelty: doing something new within the current episode, regardless of whether those behaviors have been performed in previous episodes. We hypothesize that across-training novelty might discourage agents from revisiting initially non-rewarding states that could become important stepping stones later in training\u2014a problem remedied by encouraging intra-life novelty. We introduce Curiosity Search for deep reinforcement learning, or Deep Curiosity Search (DeepCS), which encourages intra-life exploration by rewarding agents for visiting as many different states as possible within each episode, and show that DeepCS matches the performance of current state-of-the-art methods on Montezuma\u2019s Revenge. We further show that DeepCS improves exploration on Amidar, Freeway, Gravitar, and Tutankham (many of which are hard exploration games). Surprisingly, DeepCS also doubles A2C performance on Seaquest, a game we would not have expected to benefit from intra-life exploration because the arena is small and already easily navigated by naive exploration techniques. In one run, DeepCS achieves a maximum training score of 80,000 points on Seaquest\u2014higher than any methods other than Ape-X. 
The strong performance of DeepCS on these sparse- and dense-reward tasks suggests that encouraging intra-life novelty is an interesting, new approach for improving performance in Deep RL and motivates further research into hybridizing across-training and intra-life exploration methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Christopher Stanton;Jeff Clune", "authorids": "cstanto3@uwyo.edu;jeffclune@uwyo.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nstanton2019deep,\ntitle={Deep Curiosity Search: Intra-Life Exploration Can Improve Performance on Challenging Deep Reinforcement Learning Problems},\nauthor={Christopher Stanton and Jeff Clune},\nyear={2019},\nurl={https://openreview.net/forum?id=BkeDEoCctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BkeDEoCctQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "1;3;3", "wc_review": "173;487;465", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "20;20;20", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 2.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 375.0, 143.1176672066264 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 20.0, 0.0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17757191852784187426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "BkeK-nRcFX", "title": "The Nonlinearity Coefficient - Predicting Generalization in Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "We introduce the NLC, a metric that is cheap to compute in the networks randomly initialized state and is highly predictive of generalization, at least in fully-connected networks.", "abstract": "For a long time, designing neural architectures that exhibit high performance was considered a dark art that required expert hand-tuning. One of the few well-known guidelines for architecture design is the avoidance of exploding or vanishing gradients. However, even this guideline has remained relatively vague and circumstantial, because there exists no well-defined, gradient-based metric that can be computed {\\it before} training begins and can robustly predict the performance of the network {\\it after} training is complete.\n\nWe introduce what is, to the best of our knowledge, the first such metric: the nonlinearity coefficient (NLC). Via an extensive empirical study, we show that the NLC, computed in the network's randomly initialized state, is a powerful predictor of test error and that attaining a right-sized NLC is essential for attaining an optimal test error, at least in fully-connected feedforward networks. The NLC is also conceptually simple, cheap to compute, and is robust to a range of confounders and architectural design choices that comparable metrics are not necessarily robust to. 
Hence, we argue the NLC is an important tool for architecture search and design, as it can robustly predict poor training outcomes before training even begins.", "keywords": "deep learning;neural networks;nonlinearity;activation functions;exploding gradients;vanishing gradients;neural architecture search", "primary_area": "", "supplementary_material": "", "author": "George Philipp;Jaime G. Carbonell", "authorids": "george.philipp@email.de;jgc@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nphilipp2019the,\ntitle={The Nonlinearity Coefficient - Predicting Generalization in Deep Neural Networks},\nauthor={George Philipp and Jaime G. Carbonell},\nyear={2019},\nurl={https://openreview.net/forum?id=BkeK-nRcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BkeK-nRcFX", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;4;4", "wc_review": "346;438;548", "wc_reply_reviewers": "0;993;141", "wc_reply_authors": "868;1958;875", "reply_reviewers": "0;2;1", "reply_authors": "2;3;2", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 444.0, 82.57521823565874 ], "wc_reply_reviewers_avg": [ 378.0, 438.66388043694684 ], "wc_reply_authors_avg": [ 1233.6666666666667, 512.1889842192583 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5361734762676970880&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Critical Learning Periods in Deep Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1140", "id": "BkeStsCcKQ", "author_site": "Alessandro Achille, Matteo Rovere, Stefano Soatto", "tldr": "Sensory deficits in early training phases can lead to irreversible performance loss in both artificial and neuronal networks, suggesting information phenomena as the common cause, and point to the importance of the initial transient and forgetting.", "abstract": "Similar to humans and animals, deep artificial neural networks exhibit critical periods during which a temporary stimulus deficit can impair the development of a skill. The extent of the impairment depends on the onset and length of the deficit window, as in animal models, and on the size of the neural network. Deficits that do not affect low-level statistics, such as vertical flipping of the images, have no lasting effect on performance and can be overcome with further training. To better understand this phenomenon, we use the Fisher Information of the weights to measure the effective connectivity between layers of a network during training. Counterintuitively, information rises rapidly in the early phases of training, and then decreases, preventing redistribution of information resources in a phenomenon we refer to as a loss of \"Information Plasticity\". Our analysis suggests that the first few epochs are critical for the creation of strong connections that are optimal relative to the input data distribution. Once such strong connections are created, they do not appear to change during additional training. 
These findings suggest that the initial learning transient, under-scrutinized compared to asymptotic behavior, plays a key role in determining the outcome of the training process. Our findings, combined with recent theoretical results in the literature, also suggest that forgetting (decrease of information in the weights) is critical to achieving invariance and disentanglement in representation learning. Finally, critical periods are not restricted to biological systems, but can emerge naturally in learning systems, whether biological or artificial, due to fundamental constraints arising from learning dynamics and information processing.", "keywords": "Critical Period;Deep Learning;Information Theory;Artificial Neuroscience;Information Plasticity", "primary_area": "", "supplementary_material": "", "author": "Alessandro Achille;Matteo Rovere;Stefano Soatto", "authorids": "achille@cs.ucla.edu;matrovere@gmail.com;soatto@cs.ucla.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nachille2018critical,\ntitle={Critical Learning Periods in Deep Networks},\nauthor={Alessandro Achille and Matteo Rovere and Stefano Soatto},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkeStsCcKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "pdf_size": 0, "rating": "6;8;9", "confidence": "5;4;4", "wc_review": "193;761;230", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "568;958;85", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 7.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 394.6666666666667, 259.47682405606525 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 537.0, 357.07422197632803 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 211, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18279151376576816648&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BkeStsCcKQ", "pdf": "https://openreview.net/pdf?id=BkeStsCcKQ", "email": ";;", "author_num": 3 }, { "id": "BkeSusCcYm", "title": "Combining Global Sparse Gradients with Local Gradients", "track": "main", "status": "Withdraw", "tldr": "We improve gradient dropping (a technique of only exchanging large gradients on distributed training) by incorporating local gradients while doing a parameter update to reduce quality loss and further improve the training time.", "abstract": "Data-parallel neural network training is network-intensive, so gradient dropping was designed to exchange only large gradients. However, gradient dropping has been shown to slow convergence. We propose to improve convergence by having each node combine its locally computed gradient with the sparse global gradient exchanged over the network. We empirically confirm with machine translation tasks that gradient dropping with local gradients approaches convergence 48% faster than non-compressed multi-node training and 28% faster compared to vanilla gradient dropping.
We also show that gradient dropping with a local gradient update does not reduce the model's final quality.", "keywords": "Distributed training;stochastic gradient descent;machine translation", "primary_area": "", "supplementary_material": "", "author": "Alham Fikri Aji;Kenneth Heafield", "authorids": "a.fikri@ed.ac.uk;kheafiel@inf.ed.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkeSusCcYm", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;3;4", "wc_review": "293;278;366", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 312.3333333333333, 38.43898484033567 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1912255723249526637&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "CEM-RL: Combining evolutionary and gradient-based methods for policy search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1031", "id": "BkeU5j0ctQ", "author_site": "Alo\u00efs Pourchot, Olivier Sigaud", "tldr": "We propose a new combination of evolution strategy and deep reinforcement learning which takes the best of both worlds", "abstract": "Deep neuroevolution and deep reinforcement learning (deep RL) algorithms are two popular approaches to policy search. The former is widely applicable and rather stable, but suffers from low sample efficiency. By contrast, the latter is more sample efficient, but the most sample efficient variants are also rather unstable and highly sensitive to hyper-parameter setting. So far, these families of methods have mostly been compared as competing tools. However, an emerging approach consists in combining them so as to get the best of both worlds. Two previously existing combinations use either an ad hoc evolutionary algorithm or a goal exploration process together with the Deep Deterministic Policy Gradient (DDPG) algorithm, a sample efficient off-policy deep RL algorithm. In this paper, we propose a different combination scheme using the simple cross-entropy\nmethod (CEM) and Twin Delayed Deep Deterministic policy gradient (TD3), another off-policy deep RL algorithm which improves over DDPG. 
We evaluate the resulting method, CEM-RL, on a set of benchmarks classically used in deep RL.\nWe show that CEM-RL benefits from several advantages over its competitors and offers a satisfactory trade-off between performance and sample efficiency.", "keywords": "evolution strategy;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Pourchot;Sigaud", "authorids": "alois.pourchot@telecom-paristech.fr;olivier.sigaud@upmc.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\npourchot2018cemrl,\ntitle={{CEM}-{RL}: Combining evolutionary and gradient-based methods for policy search},\nauthor={Pourchot and Sigaud},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkeU5j0ctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;5;4", "wc_review": "346;287;492", "wc_reply_reviewers": "8;0;22", "wc_reply_authors": "157;285;906", "reply_reviewers": "1;0;1", "reply_authors": "1;1;2", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 375.0, 86.16650548018451 ], "wc_reply_reviewers_avg": [ 10.0, 9.092121131323903 ], "wc_reply_authors_avg": [ 449.3333333333333, 327.1129604416323 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 197, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11981496156929972562&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BkeU5j0ctQ", "pdf": "https://openreview.net/pdf?id=BkeU5j0ctQ", "email": ";", "author_num": 2 }, { "id": "BkeUasA5YQ", "title": "LIT: Block-wise Intermediate Representation Training for Model Compression", "track": "main", "status": "Reject", "tldr": "", "abstract": "Knowledge distillation (KD) is a popular method for reducing the computational overhead of deep network inference, in which the output of a teacher model is used to train\na smaller, faster student model. Hint training (i.e., FitNets) extends KD by regressing a\nstudent model\u2019s intermediate representation to a teacher model\u2019s intermediate representation. In this work, we introduce bLock-wise Intermediate representation Training (LIT),\na novel model compression technique that extends the use of intermediate representations in deep network compression, outperforming KD and hint training. LIT has two\nkey ideas: 1) LIT trains a student of the same width (but shallower depth) as the teacher\nby directly comparing the intermediate representations, and 2) LIT uses the intermediate\nrepresentation from the previous block in the teacher model as an input to the current student block during training, avoiding unstable intermediate representations in the student\nnetwork. We show that LIT provides substantial reductions in network depth without\nloss in accuracy \u2014 for example, LIT can compress a ResNeXt-110 to a ResNeXt-20\n(5.5\u00d7) on CIFAR10 and a VDCNN-29 to a VDCNN-9 (3.2\u00d7) on Amazon Reviews\nwithout loss in accuracy, outperforming KD and hint training in network size at a given\naccuracy.
We also show that applying LIT to identical student/teacher architectures\nincreases the accuracy of the student model above the teacher model, outperforming the\nrecently-proposed Born Again Networks procedure on ResNet, ResNeXt, and VDCNN.\nFinally, we show that LIT can effectively compress GAN generators.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Animesh Koratana*;Daniel Kang*;Peter Bailis;Matei Zaharia", "authorids": "koratana@stanford.edu;ddkang@stanford.edu;pbailis@cs.stanford.edu;matei@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkoratana*2019lit,\ntitle={{LIT}: Block-wise Intermediate Representation Training for Model Compression},\nauthor={Animesh Koratana* and Daniel Kang* and Peter Bailis and Matei Zaharia},\nyear={2019},\nurl={https://openreview.net/forum?id=BkeUasA5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BkeUasA5YQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;3", "wc_review": "375;520;214", "wc_reply_reviewers": "32;0;0", "wc_reply_authors": "382;321;493", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 369.6666666666667, 124.98088742772723 ], "wc_reply_reviewers_avg": [ 10.666666666666666, 15.084944665313014 ], "wc_reply_authors_avg": [ 398.6666666666667, 71.20081148101863 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8096800925314739470&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 3 }, { "id": "BkedwoC5t7", "title": "Formal Limitations on the Measurement of Mutual Information", "track": "main", "status": "Reject", "tldr": "We give a theoretical analysis of the measurement and optimization of mutual information.", "abstract": "Motivated by applications to unsupervised learning, we consider the problem of measuring mutual information. Recent analysis has shown that naive kNN estimators of mutual information have serious statistical limitations motivating more refined methods. In this paper we prove that serious statistical limitations are inherent to any measurement method. More specifically, we show that any distribution-free high-confidence lower bound on mutual information cannot be larger than $O(\\ln N)$ where $N$ is the size of the data sample. We also analyze the Donsker-Varadhan lower bound on KL divergence in particular and show that, when simple statistical considerations are taken into account, this bound can never produce a high-confidence value larger than $\\ln N$. While large high-confidence lower bounds are impossible, in practice one can use estimators without formal guarantees. We suggest expressing mutual information as a difference of entropies and using cross entropy as an entropy estimator. 
We observe that, although cross entropy is only an upper bound on entropy, cross-entropy estimates converge to the true cross entropy at the rate of $1/\\sqrt{N}$.", "keywords": "mutual information;predictive coding;unsupervised learning;predictive learning;generalization bounds;MINE;DIM;contrastive predictive coding", "primary_area": "", "supplementary_material": "", "author": "David McAllester;Karl Stratos", "authorids": "mcallester@ttic.edu;stratos@ttic.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmcallester2019formal,\ntitle={Formal Limitations on the Measurement of Mutual Information},\nauthor={David McAllester and Karl Stratos},\nyear={2019},\nurl={https://openreview.net/forum?id=BkedwoC5t7},\n}", "github": "[![github](/images/github_icon.svg) karlstratos/doe](https://github.com/karlstratos/doe) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BkedwoC5t7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BkedwoC5t7", "pdf_size": 0, "rating": "4;6;8", "confidence": "4;3;5", "wc_review": "680;396;353", "wc_reply_reviewers": "53;363;11", "wc_reply_authors": "236;190;31", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 476.3333333333333, 145.0800545292916 ], "wc_reply_reviewers_avg": [ 142.33333333333334, 156.9741663105394 ], "wc_reply_authors_avg": [ 152.33333333333334, 87.82684985558548 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 329, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1389531273282997728&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "title": "LanczosNet: Multi-Scale Deep Graph Convolutional Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1099", "id": "BkedznAqKQ", "author_site": "Renjie Liao, Zhizhen Zhao, Raquel Urtasun, Richard Zemel", "tldr": "", "abstract": "We propose Lanczos network (LanczosNet) which uses the Lanczos algorithm to construct low rank approximations of the graph Laplacian for graph convolution.\nRelying on the tridiagonal decomposition of the Lanczos algorithm, we not only efficiently exploit multi-scale information via fast approximated computation of matrix power but also design learnable spectral filters.\nBeing fully differentiable, LanczosNet facilitates both graph kernel learning as well as learning node embeddings. \nWe show the connection between our LanczosNet and graph based manifold learning, especially diffusion maps.\nWe benchmark our model against $8$ recent deep graph networks on citation datasets and QM8 quantum chemistry dataset. 
\nExperimental results show that our model achieves the state-of-the-art performance in most tasks.", "keywords": "Lanczos Network;Graph Neural Networks;Deep Graph Convolutional Networks;Deep Learning on Graph Structured Data;QM8 Quantum Chemistry Benchmark", "primary_area": "", "supplementary_material": "", "author": "Renjie Liao;Zhizhen Zhao;Raquel Urtasun;Richard Zemel", "authorids": "rjliao@cs.toronto.edu;zhizhenz@illinois.edu;urtasun@uber.com;zemel@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nliao2018lanczosnet,\ntitle={LanczosNet: Multi-Scale Deep Graph Convolutional Networks},\nauthor={Renjie Liao and Zhizhen Zhao and Raquel Urtasun and Richard Zemel},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkedznAqKQ},\n}", "github": "[![github](/images/github_icon.svg) lrjconan/LanczosNetwork](https://github.com/lrjconan/LanczosNetwork)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;3;4", "wc_review": "393;272;161", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "679;68;16", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 275.3333333333333, 94.74292703005445 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 254.33333333333334, 301.03414350461236 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 307, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4668385491596284189&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=BkedznAqKQ", "pdf": "https://openreview.net/pdf?id=BkedznAqKQ", "email": ";;;", "author_num": 4 }, { "id": "BkesGnCcFX", "title": "Learning Goal-Conditioned Value Functions with one-step Path rewards rather than Goal-Rewards", "track": "main", "status": "Reject", "tldr": "Do Goal-Conditioned Value Functions need Goal-Rewards to Learn?", "abstract": "Multi-goal reinforcement learning (MGRL) addresses tasks where the desired goal state can change for every trial. State-of-the-art algorithms model these problems such that the reward formulation depends on the goals, to associate them with high reward. This dependence introduces additional goal reward resampling steps in algorithms like Hindsight Experience Replay (HER) that reuse trials in which the agent fails to reach the goal by recomputing rewards as if reached states were pseudo-desired goals. We propose a reformulation of goal-conditioned value functions for MGRL that yields a similar algorithm, while removing the dependence of reward functions on the goal. Our formulation thus obviates the requirement of reward-recomputation that is needed by HER and its extensions. We also extend a closely related algorithm, Floyd-Warshall Reinforcement Learning, from tabular domains to deep neural networks for use as a baseline. Our results are competitive with HER while substantially improving sampling efficiency in terms of reward computation.
\n", "keywords": "Floyd-Warshall;Reinforcement learning;goal conditioned value functions;multi-goal", "primary_area": "", "supplementary_material": "", "author": "Vikas Dhiman;Shurjo Banerjee;Jeffrey M Siskind;Jason J Corso", "authorids": "dhiman@umich.edu;shurjo@umich.edu;qobi@purdue.edu;jjcorso@umich.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndhiman2019learning,\ntitle={Learning Goal-Conditioned Value Functions with one-step Path rewards rather than Goal-Rewards},\nauthor={Vikas Dhiman and Shurjo Banerjee and Jeffrey M Siskind and Jason J Corso},\nyear={2019},\nurl={https://openreview.net/forum?id=BkesGnCcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BkesGnCcFX", "pdf_size": 0, "rating": "1;3;4", "confidence": "4;4;3", "wc_review": "513;101;522", "wc_reply_reviewers": "658;0;0", "wc_reply_authors": "1428;257;795", "reply_reviewers": "2;0;0", "reply_authors": "3;1;1", "rating_avg": [ 2.6666666666666665, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 378.6666666666667, 196.37435904131905 ], "wc_reply_reviewers_avg": [ 219.33333333333334, 310.1841746804988 ], "wc_reply_authors_avg": [ 826.6666666666666, 478.58286174450035 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4212759347040209556&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BkesJ3R9YX", "title": "Where and when to look? Spatial-temporal attention for action recognition in videos", "track": "main", "status": "Reject", "tldr": "", "abstract": "Inspired by the observation that humans are able to process videos efficiently by only paying attention when and where it is needed, we propose a novel spatial-temporal attention mechanism for video-based action recognition. For spatial attention, we learn a saliency mask to allow the model to focus on the most salient parts of the feature maps. \nFor temporal attention, we employ a soft temporal attention mechanism to identify the most relevant frames from an input video. Further, we propose a set of regularizers that ensure that our attention mechanism attends to coherent regions in space and time. Our model is efficient, as it proposes a separable spatio-temporal mechanism for video attention, while being able to identify important parts of the video both spatially and temporally. We demonstrate the efficacy of our approach on three public video action recognition datasets. The proposed approach leads to state-of-the-art performance on all of them, including the new large-scale Moments in Time dataset. Furthermore, we quantitatively and qualitatively evaluate our model's ability to accurately localize discriminative regions spatially and critical frames temporally. This is despite our model only being trained with per video classification labels. 
", "keywords": "visual attention;video action recognition;network interpretability", "primary_area": "", "supplementary_material": "", "author": "Lili Meng;Bo Zhao;Bo Chang;Gao Huang;Frederick Tung;Leonid Sigal", "authorids": "lilimeng1103@gmail.com;bzhao03@cs.ubc.ca;bchang@stat.ubc.ca;gh349@cornell.edu;ftung@sfu.ca;lsigal@cs.ubc.ca", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nmeng2019where,\ntitle={Where and when to look? Spatial-temporal attention for action recognition in videos},\nauthor={Lili Meng and Bo Zhao and Bo Chang and Gao Huang and Frederick Tung and Leonid Sigal},\nyear={2019},\nurl={https://openreview.net/forum?id=BkesJ3R9YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkesJ3R9YX", "pdf_size": 0, "rating": "3;6;6", "confidence": "5;4;4", "wc_review": "325;367;684", "wc_reply_reviewers": "24;124;44", "wc_reply_authors": "531;544;767", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 458.6666666666667, 160.25465845196376 ], "wc_reply_reviewers_avg": [ 64.0, 43.20493798938573 ], "wc_reply_authors_avg": [ 614.0, 108.31743473082561 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7467388446020147581&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Bkeuz20cYm", "title": "Double Neural Counterfactual Regret Minimization", "track": "main", "status": "Reject", "tldr": "We proposed a double neural CFR which can match the performance of tabular based CFR and opens up the possibility for a purely neural approach to directly solve large imperfect information game.", "abstract": "Counterfactual regret minimization (CRF) is a fundamental and effective technique for solving imperfect information games. However, the original CRF algorithm only works for discrete state and action spaces, and the resulting strategy is maintained as a tabular representation. Such tabular representation limits the method from being directly applied to large games and continuing to improve from a poor strategy profile. In this paper, we propose a double neural representation for the Imperfect Information Games, where one neural network represents the cumulative regret, and the other represents the average strategy. Furthermore, we adopt the counterfactual regret minimization algorithm to optimize this double neural representation. To make neural learning efficient, we also developed several novel techniques including a robust sampling method, mini-batch Monte Carlo counterfactual regret minimization (MCCFR) and Monte Carlo counterfactual regret minimization plus (MCCFR+) which may be of independent interests. Experimentally, we demonstrate that the proposed double neural algorithm converges significantly better than the reinforcement learning counterpart. 
", "keywords": "Counterfactual Regret Minimization;Imperfect Information game", "primary_area": "", "supplementary_material": "", "author": "Hui Li;Kailiang Hu;Zhibang Ge;Tao Jiang;Yuan Qi;Le Song", "authorids": "ken.lh@antfin.com;hkl163251@antfin.com;zhibang.zg@antfin.com;lvshan.jt@antfin.com;yuan.qi@antfin.com;lsong@cc.gatech.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nli2019double,\ntitle={Double Neural Counterfactual Regret Minimization},\nauthor={Hui Li and Kailiang Hu and Zhibang Ge and Tao Jiang and Yuan Qi and Le Song},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkeuz20cYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Bkeuz20cYm", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;2", "wc_review": "791;518;190", "wc_reply_reviewers": "0;0;86", "wc_reply_authors": "1267;522;199", "reply_reviewers": "0;0;1", "reply_authors": "2;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 499.6666666666667, 245.699455071073 ], "wc_reply_reviewers_avg": [ 28.666666666666668, 40.54078878802872 ], "wc_reply_authors_avg": [ 662.6666666666666, 447.2108625196347 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9429260306571208094&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "BkewX2C9tX", "title": "Analyzing Federated Learning through an Adversarial Lens", "track": "main", "status": "Reject", "tldr": "Effective model poisoning attacks on federated learning able to cause high-confidence targeted misclassification of desired inputs", "abstract": "Federated learning distributes model training among a multitude of agents, who, guided by privacy concerns, perform training using their local data but share only model parameter updates, for iterative aggregation at the server. In this work, we explore the threat of model poisoning attacks on federated learning initiated by a single, non-colluding malicious agent where the adversarial objective is to cause the model to misclassify a set of chosen inputs with high confidence. We explore a number of strategies to carry out this attack, starting with simple boosting of the malicious agent's update to overcome the effects of other agents' updates. To increase attack stealth, we propose an alternating minimization strategy, which alternately optimizes for the training loss and the adversarial objective. We follow up by using parameter estimation for the benign agents' updates to improve on attack success. Finally, we use a suite of interpretability techniques to generate visual explanations of model decisions for both benign and malicious models and show that the explanations are nearly visually indistinguishable. 
Our results indicate that even a highly constrained adversary can carry out model poisoning attacks while simultaneously maintaining stealth, thus highlighting the vulnerability of the federated learning setting and the need to develop effective defense strategies.", "keywords": "federated learning;model poisoning", "primary_area": "", "supplementary_material": "", "author": "Arjun Nitin Bhagoji;Supriyo Chakraborty;Seraphin Calo;Prateek Mittal", "authorids": "abhagoji@princeton.edu;supriyo@us.ibm.com;scalo@us.ibm.com;pmittal@princeton.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbhagoji2019analyzing,\ntitle={Analyzing Federated Learning through an Adversarial Lens},\nauthor={Arjun Nitin Bhagoji and Supriyo Chakraborty and Seraphin Calo and Prateek Mittal},\nyear={2019},\nurl={https://openreview.net/forum?id=BkewX2C9tX},\n}", "github": "[![github](/images/github_icon.svg) inspire-group/ModelPoisoning](https://github.com/inspire-group/ModelPoisoning) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=BkewX2C9tX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkewX2C9tX", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;4", "wc_review": "367;319;343", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "758;454;125", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 343.0, 19.595917942265423 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 445.6666666666667, 258.48834059241864 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1464, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16839948122426603319&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12 }, { "id": "Bkf1tjR9KQ", "title": "DVOLVER: Efficient Pareto-Optimal Neural Network Architecture Search", "track": "main", "status": "Reject", "tldr": "Multi-objective Neural architecture search as an efficient way to find fast and accurate architecture for mobile devices.", "abstract": "Automatic search of neural network architectures is a standing research topic. In addition to the fact that it presents a faster alternative to hand-designed architectures, it can improve their efficiency and for instance generate Convolutional Neural Networks (CNN) adapted for mobile devices. In this paper, we present a multi-objective neural architecture search method to find a family of CNN models with the best accuracy and computational resources tradeoffs, in a search space inspired by the state-of-the-art findings in neural search. Our work, called Dvolver, evolves a population of architectures and iteratively improves an approximation of the optimal Pareto front. Applying Dvolver on the model accuracy and on the number of floating points operations as objective functions, we are able to find, in only 2.5 days 1 , a set of competitive mobile models on ImageNet. 
Amongst these models one architecture has the same Top-1 accuracy on ImageNet as NASNet-A mobile with 8% less floating point operations and another one has a Top-1 accuracy of 75.28% on ImageNet exceeding by 0.28% the best MobileNetV2 model for the same computational resources.", "keywords": "architecture search;Pareto optimality;multi-objective;optimization;cnn;deep learning", "primary_area": "", "supplementary_material": "", "author": "Guillaume Michel;Mohammed Amine Alaoui;Alice Lebois;Amal Feriani;Mehdi Felhi", "authorids": "guillaume.michel@netatmo.com;mohammed-amine.alaoui@netatmo.com;alice.lebois@netatmo.com;amal.feriani@netatmo.com;mehdi.felhi@netatmo.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmichel2019dvolver,\ntitle={{DVOLVER}: Efficient Pareto-Optimal Neural Network Architecture Search},\nauthor={Guillaume Michel and Mohammed Amine Alaoui and Alice Lebois and Amal Feriani and Mehdi Felhi},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkf1tjR9KQ},\n}", "github": "[![github](/images/github_icon.svg) guillaume-michel/dvolver](https://github.com/guillaume-michel/dvolver)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Bkf1tjR9KQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "wc_review": "325;534;114", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "597;694;206", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 324.3333333333333, 171.4649300067574 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 499.0, 210.93284871415042 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6606882693071594719&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "BkfPnoActQ", "title": "Towards Consistent Performance on Atari using Expert Demonstrations", "track": "main", "status": "Reject", "tldr": "Ape-X DQfD = Distributed (many actors + one learner + prioritized replay) DQN with demonstrations optimizing the unclipped 0.999-discounted return on Atari.", "abstract": "Despite significant advances in the field of deep Reinforcement Learning (RL), today's algorithms still fail to learn human-level policies consistently over a set of diverse tasks such as Atari 2600 games. We identify three key challenges that any algorithm needs to master in order to perform well on all games: processing diverse reward distributions, reasoning over long time horizons, and exploring efficiently. In this paper, we propose an algorithm that addresses each of these challenges and is able to learn human-level policies on nearly all Atari games. A new transformed Bellman operator allows our algorithm to process rewards of varying densities and scales; an auxiliary temporal consistency loss allows us to train stably using a discount factor of 0.999 (instead of 0.99) extending the effective planning horizon by an order of magnitude; and we ease the exploration problem by using human demonstrations that guide the agent towards rewarding states. 
When tested on a set of 42 Atari games, our algorithm exceeds the performance of an average human on 40 games using a common set of hyper parameters.", "keywords": "Reinforcement Learning;Atari;RL;Demonstrations", "primary_area": "", "supplementary_material": "", "author": "Tobias Pohlen;Bilal Piot;Todd Hester;Mohammad Gheshlaghi Azar;Dan Horgan;David Budden;Gabriel Barth-Maron;Hado van Hasselt;John Quan;Mel Ve\u010der\u00edk;Matteo Hessel;R\u00e9mi Munos;Olivier Pietquin", "authorids": "pohlen@google.com;piot@google.com;toddhester@google.com;mazar@google.com;horgan@google.com;budden@google.com;gabrielbm@google.com;hado@google.com;johnquan@google.com;vec@google.com;mtthss@google.com;munos@google.com;pietquin@google.com", "gender": ";;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;", "aff": ";;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;", "position": ";;;;;;;;;;;;", "bibtex": "@misc{\npohlen2019towards,\ntitle={Towards Consistent Performance on Atari using Expert Demonstrations},\nauthor={Tobias Pohlen and Bilal Piot and Todd Hester and Mohammad Gheshlaghi Azar and Dan Horgan and David Budden and Gabriel Barth-Maron and Hado van Hasselt and John Quan and Mel Ve\u010der\u00edk and Matteo Hessel and R\u00e9mi Munos and Olivier Pietquin},\nyear={2019},\nurl={https://openreview.net/forum?id=BkfPnoActQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer5;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkfPnoActQ", "pdf_size": 0, "rating": "5;6;7;7", "confidence": "4;4;4;1", "wc_review": "458;287;767;39", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "586;485;537;0", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;0", "rating_avg": [ 6.25, 0.82915619758885 ], "confidence_avg": [ 3.25, 1.299038105676658 ], "wc_review_avg": [ 387.75, 264.831432235677 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 402.0, 234.82653172075766 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.75, 0.4330127018922193 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 13, 0 ], "corr_rating_confidence": -0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Fpk_xPhBtWoJ:scholar.google.com/&scioq=Towards+Consistent+Performance+on+Atari+using+Expert+Demonstrations&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Excessive Invariance Causes Adversarial Vulnerability", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/900", "id": "BkfbpsAcF7", "author_site": "Joern-Henrik Jacobsen, Jens Behrmann, Richard Zemel, Matthias Bethge", "tldr": "We show deep networks are not only too sensitive to task-irrelevant changes of their input, but also too invariant to a wide range of task-relevant changes, thus making vast regions in input space vulnerable to adversarial attacks.", "abstract": "Despite their impressive performance, deep neural networks exhibit striking failures on out-of-distribution inputs. One core idea of adversarial example research is to reveal neural network errors under such distribution shifts. We decompose these errors into two complementary sources: sensitivity and invariance. 
We show deep networks are not only too sensitive to task-irrelevant changes of their input, as is well-known from epsilon-adversarial examples, but are also too invariant to a wide range of task-relevant changes, thus making vast regions in input space vulnerable to adversarial attacks. We show such excessive invariance occurs across various tasks and architecture types. On MNIST and ImageNet one can manipulate the class-specific content of almost any image without changing the hidden activations. We identify an insufficiency of the standard cross-entropy loss as a reason for these failures. Further, we extend this objective based on an information-theoretic analysis so it encourages the model to consider all task-dependent features in its decision. This provides the first approach tailored explicitly to overcome excessive invariance and resulting vulnerabilities.", "keywords": "Generalization;Adversarial Examples;Invariance;Information Theory;Invertible Networks", "primary_area": "", "supplementary_material": "", "author": "Joern-Henrik Jacobsen;Jens Behrmann;Richard Zemel;Matthias Bethge", "authorids": "j.jacobsen@vectorinstitute.ai;jensb@uni-bremen.de;zemel@cs.toronto.edu;matthias.bethge@uni-tuebingen.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\njacobsen2018excessive,\ntitle={Excessive Invariance Causes Adversarial Vulnerability},\nauthor={Joern-Henrik Jacobsen and Jens Behrmann and Richard Zemel and Matthias Bethge},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkfbpsAcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "2;4;4", "wc_review": "266;547;399", "wc_reply_reviewers": "0;63;0", "wc_reply_authors": "226;1446;270", "reply_reviewers": "0;1;0", "reply_authors": "1;4;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 404.0, 114.77223822278044 ], "wc_reply_reviewers_avg": [ 21.0, 29.698484809834994 ], "wc_reply_authors_avg": [ 647.3333333333334, 565.0282195509255 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 201, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3121934433504047296&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BkfbpsAcF7", "pdf": "https://openreview.net/pdf?id=BkfbpsAcF7", "email": ";;;", "author_num": 4 }, { "id": "BkfhZnC9t7", "title": "Zero-shot Learning for Speech Recognition with Universal Phonetic Model", "track": "main", "status": "Reject", "tldr": "We apply zero-shot learning for speech recognition to recognize unseen phonemes", "abstract": "There are more than 7,000 languages in the world, but due to the lack of training sets, only a small number of them have speech recognition systems. Multilingual speech recognition provides a solution if at least some audio training data is available. Often, however, phoneme inventories differ between the training languages and the target language, making this approach infeasible. 
In this work, we address the problem of building an acoustic model for languages with zero audio resources. Our model is able to recognize unseen phonemes in the target language, if only a small text corpus is available. We adopt the idea of zero-shot learning, and decompose phonemes into corresponding phonetic attributes such as vowel and consonant. Instead of predicting phonemes directly, we first predict distributions over phonetic attributes, and then compute phoneme distributions with a customized acoustic model. We extensively evaluate our English-trained model on 20 unseen languages, and find that on average, it achieves 9.9% better phone error rate over a traditional CTC based acoustic model trained on English.", "keywords": "zero-shot learning;speech recognition;acoustic modeling", "primary_area": "", "supplementary_material": "", "author": "Xinjian Li;Siddharth Dalmia;David R. Mortensen;Florian Metze;Alan W Black", "authorids": "xinjianl@andrew.cmu.edu;sdalmia@cs.cmu.edu;dmortens@cs.cmu.edu;fmetze@cs.cmu.edu;awb@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019zeroshot,\ntitle={Zero-shot Learning for Speech Recognition with Universal Phonetic Model},\nauthor={Xinjian Li and Siddharth Dalmia and David R. Mortensen and Florian Metze and Alan W Black},\nyear={2019},\nurl={https://openreview.net/forum?id=BkfhZnC9t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BkfhZnC9t7", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "wc_review": "175;780;259", "wc_reply_reviewers": "0;12;0", "wc_reply_authors": "524;584;317", "reply_reviewers": "0;1;0", "reply_authors": "1;3;3", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 404.6666666666667, 267.60709175123804 ], "wc_reply_reviewers_avg": [ 4.0, 5.656854249492381 ], "wc_reply_authors_avg": [ 475.0, 114.37657102746174 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5286028674201537244&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "BkfxKj09Km", "title": "DiffraNet: Automatic Classification of Serial Crystallography Diffraction Patterns", "track": "main", "status": "Reject", "tldr": "We introduce a new synthetic dataset for serial crystallography that can be used to train image classification models and explore computer vision and deep learning approaches to classify them.", "abstract": "Serial crystallography is the field of science that studies the structure and properties of crystals via diffraction patterns. In this paper, we introduce a new serial crystallography dataset generated through the use of a simulator; the synthetic images are labeled and they are both scalable and accurate. The resulting synthetic dataset is called DiffraNet, and it is composed of 25,000 512x512 grayscale labeled images. 
We explore several computer vision approaches for classification on DiffraNet such as standard feature extraction algorithms associated with Random Forests and Support Vector Machines but also an end-to-end CNN topology dubbed DeepFreak tailored to work on this new dataset. All implementations are publicly available and have been fine-tuned using off-the-shelf AutoML optimization tools for a fair comparison. Our best model achieves 98.5% accuracy. We believe that the DiffraNet dataset and its classification methods will have in the long term a positive impact in accelerating discoveries in many disciplines, including chemistry, geology, biology, materials science, metallurgy, and physics.", "keywords": "Serial Crystallography;Deep Learning;Image Classification", "primary_area": "", "supplementary_material": "", "author": "Artur Souza;Leonardo B. Oliveira;Sabine Hollatz;Matt Feldman;Kunle Olukotun;James M. Holton;Aina E. Cohen;Luigi Nardi", "authorids": "arturluis@dcc.ufmg.br;leob@dcc.ufmg.br;shollatz@slac.stanford.edu;mattfel@stanford.edu;kunle@stanford.edu;jmholton@slac.stanford.edu;acohen@slac.stanford.edu;lnardi@stanford.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nsouza2019diffranet,\ntitle={DiffraNet: Automatic Classification of Serial Crystallography Diffraction Patterns},\nauthor={Artur Souza and Leonardo B. Oliveira and Sabine Hollatz and Matt Feldman and Kunle Olukotun and James M. Holton and Aina E. Cohen and Luigi Nardi},\nyear={2019},\nurl={https://openreview.net/forum?id=BkfxKj09Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkfxKj09Km", "pdf_size": 0, "rating": "3;5;8", "confidence": "5;4;4", "wc_review": "213;510;117", "wc_reply_reviewers": "109;0;0", "wc_reply_authors": "783;817;187", "reply_reviewers": "1;0;0", "reply_authors": "2;2;1", "rating_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 280.0, 167.29016707505556 ], "wc_reply_reviewers_avg": [ 36.333333333333336, 51.383092766222454 ], "wc_reply_authors_avg": [ 595.6666666666666, 289.30414599325894 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.8029550685469661, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VeJKlVYkn7kJ:scholar.google.com/&scioq=DiffraNet:+Automatic+Classification+of+Serial+Crystallography+Diffraction+Patterns&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Hindsight policy gradients", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/891", "id": "Bkg2viA5FQ", "author_site": "Paulo Rauber, Avinash Ummadisingu, Filipe Mutz, J\u00fcrgen Schmidhuber", "tldr": "We introduce the capacity to exploit information about the degree to which an arbitrary goal has been achieved while another goal was intended to policy gradient methods.", "abstract": "A reinforcement learning agent that needs to pursue different goals across episodes requires a goal-conditional policy. 
In addition to their potential to generalize desirable behavior to unseen goals, such policies may also enable higher-level planning based on subgoals. In sparse-reward environments, the capacity to exploit information about the degree to which an arbitrary goal has been achieved while another goal was intended appears crucial to enable sample efficient learning. However, reinforcement learning agents have only recently been endowed with such capacity for hindsight. In this paper, we demonstrate how hindsight can be introduced to policy gradient methods, generalizing this idea to a broad class of successful algorithms. Our experiments on a diverse selection of sparse-reward environments show that hindsight leads to a remarkable increase in sample efficiency.", "keywords": "reinforcement learning;policy gradients;multi-goal reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Paulo Rauber;Avinash Ummadisingu;Filipe Mutz;J\u00fcrgen Schmidhuber", "authorids": "paulo@idsia.ch;avinash.ummadisingu@usi.ch;filipe.mutz@ifes.edu.br;juergen@idsia.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nrauber2018hindsight,\ntitle={Hindsight policy gradients},\nauthor={Paulo Rauber and Avinash Ummadisingu and Filipe Mutz and J\u00fcrgen Schmidhuber},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkg2viA5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;4", "wc_review": "296;605;688", "wc_reply_reviewers": "57;0;0", "wc_reply_authors": "724;1149;1134", "reply_reviewers": "1;0;0", "reply_authors": "1;2;2", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 529.6666666666666, 168.66600790385186 ], "wc_reply_reviewers_avg": [ 19.0, 26.870057685088806 ], "wc_reply_authors_avg": [ 1002.3333333333334, 196.90663326110226 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15003179791243238208&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Bkg2viA5FQ", "pdf": "https://openreview.net/pdf?id=Bkg2viA5FQ", "email": ";;;", "author_num": 4 }, { "title": "Adaptive Gradient Methods with Dynamic Bound of Learning Rate", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/974", "id": "Bkg3g2R9FX", "author_site": "Liangchen Luo, Yuanhao Xiong, Yan Liu, Xu Sun", "tldr": "Novel variants of optimization methods that combine the benefits of both adaptive and non-adaptive methods.", "abstract": "Adaptive optimization methods such as AdaGrad, RMSprop and Adam have been proposed to achieve a rapid training process with an element-wise scaling term on learning rates. Though prevailing, they are observed to generalize poorly compared with SGD or even fail to converge due to unstable and extreme learning rates. Recent work has put forward some algorithms such as AMSGrad to tackle this issue but they failed to achieve considerable improvement over existing methods. 
In our paper, we demonstrate that extreme learning rates can lead to poor performance. We provide new variants of Adam and AMSGrad, called AdaBound and AMSBound respectively, which employ dynamic bounds on learning rates to achieve a gradual and smooth transition from adaptive methods to SGD and give a theoretical proof of convergence. We further conduct experiments on various popular tasks and models, which is often insufficient in previous work. Experimental results show that new variants can eliminate the generalization gap between adaptive methods and SGD and maintain higher learning speed early in training at the same time. Moreover, they can bring significant improvement over their prototypes, especially on complex deep networks. The implementation of the algorithm can be found at https://github.com/Luolc/AdaBound .", "keywords": "Optimization;SGD;Adam;Generalization", "primary_area": "", "supplementary_material": "", "author": "Liangchen Luo;Yuanhao Xiong;Yan Liu;Xu Sun", "authorids": "luolc@pku.edu.cn;xiongyh@zju.edu.cn;yanliu.cs@usc.edu;xusun@pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nluo2018adaptive,\ntitle={Adaptive Gradient Methods with Dynamic Bound of Learning Rate},\nauthor={Liangchen Luo and Yuanhao Xiong and Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkg3g2R9FX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=Bkg3g2R9FX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;4;4", "wc_review": "551;210;310", "wc_reply_reviewers": "0;12;0", "wc_reply_authors": "1385;297;256", "reply_reviewers": "0;1;0", "reply_authors": "3;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 357.0, 143.12465429361453 ], "wc_reply_reviewers_avg": [ 4.0, 5.656854249492381 ], "wc_reply_authors_avg": [ 646.0, 522.8199180087411 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 877, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7531609261550586378&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Bkg3g2R9FX", "pdf": "https://openreview.net/pdf?id=Bkg3g2R9FX", "email": ";;;", "author_num": 4 }, { "id": "Bkg5aoAqKm", "title": "Fast Binary Functional Search on Graph", "track": "main", "status": "Reject", "tldr": "Efficient Search by Neural Network based searching measures.", "abstract": "The large-scale search is an essential task in modern information systems. Numerous learning based models are proposed to capture semantic level similarity measures for searching or ranking. However, these measures are usually complicated and beyond metric distances. As Approximate Nearest Neighbor Search (ANNS) techniques have specifications on metric distances, efficient searching by advanced measures is still an open question. 
In this paper, we formulate large-scale search as a general task, Optimal Binary Functional Search (OBFS), which contains ANNS as special cases. We analyze existing OBFS methods' limitations and explain they are not applicable for complicated searching measures. We propose a flexible graph-based solution for OBFS, Search on L2 Graph (SL2G). SL2G approximates gradient descent in Euclidean space, with accessible conditions. Experiments demonstrate SL2G's efficiency in searching by advanced matching measures (i.e., Neural Network based measures).", "keywords": "Binary Functional Search;Large-scale Search;Approximate Nearest Neighbor Search", "primary_area": "", "supplementary_material": "", "author": "Shulong Tan;Zhixin Zhou;Zhaozhuo Xu;Ping Li", "authorids": "laos1984@gmail.com;zhixin0825@gmail.com;zhaozhuoxu@gmail.com;pingli98@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ntan2019fast,\ntitle={Fast Binary Functional Search on Graph},\nauthor={Shulong Tan and Zhixin Zhou and Zhaozhuo Xu and Ping Li},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkg5aoAqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Bkg5aoAqKm", "pdf_size": 0, "rating": "4;5", "confidence": "4;5", "wc_review": "228;713", "wc_reply_reviewers": "35;238", "wc_reply_authors": "226;1227", "reply_reviewers": "1;1", "reply_authors": "1;3", "rating_avg": [ 4.5, 0.5 ], "confidence_avg": [ 4.5, 0.5 ], "wc_review_avg": [ 470.5, 242.5 ], "wc_reply_reviewers_avg": [ 136.5, 101.5 ], "wc_reply_authors_avg": [ 726.5, 500.5 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UbtCM7dBlJsJ:scholar.google.com/&scioq=Fast+Binary+Functional+Search+on+Graph&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Decoupled Weight Decay Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/935", "id": "Bkg6RiCqY7", "author_site": "Ilya Loshchilov, Frank Hutter", "tldr": "", "abstract": "L$_2$ regularization and weight decay regularization are equivalent for standard stochastic gradient descent (when rescaled by the learning rate), but as we demonstrate this is \emph{not} the case for adaptive gradient algorithms, such as Adam. While common implementations of these algorithms employ L$_2$ regularization (often calling it ``weight decay'' in what may be misleading due to the inequivalence we expose), we propose a simple modification to recover the original formulation of weight decay regularization by \emph{decoupling} the weight decay from the optimization steps taken w.r.t. the loss function. We provide empirical evidence that our proposed modification (i) decouples the optimal choice of weight decay factor from the setting of the learning rate for both standard SGD and Adam and (ii) substantially improves Adam's generalization performance, allowing it to compete with SGD with momentum on image classification datasets (on which it was previously typically outperformed by the latter).
Our proposed decoupled weight decay has already been adopted by many researchers, and the community has implemented it in TensorFlow and PyTorch; the complete source code for our experiments is available at \\url{https://github.com/loshchil/AdamW-and-SGDW}", "keywords": "optimization;regularization;weight decay;Adam", "primary_area": "", "supplementary_material": "", "author": "Ilya Loshchilov;Frank Hutter", "authorids": "ilya.loshchilov@gmail.com;fh@cs.uni-freiburg.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nloshchilov2018decoupled,\ntitle={Decoupled Weight Decay Regularization},\nauthor={Ilya Loshchilov and Frank Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkg6RiCqY7},\n}", "github": "[![github](/images/github_icon.svg) loshchil/AdamW-and-SGDW](https://github.com/loshchil/AdamW-and-SGDW) + [![Papers with Code](/images/pwc_icon.svg) 19 community implementations](https://paperswithcode.com/paper/?openreview=Bkg6RiCqY7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "wc_review": "399;336;692", "wc_reply_reviewers": "0;0;207", "wc_reply_authors": "494;458;708", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 475.6666666666667, 155.11787632492766 ], "wc_reply_reviewers_avg": [ 69.0, 97.58073580374356 ], "wc_reply_authors_avg": [ 553.3333333333334, 110.34894149419327 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 27890, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5602734827563786057&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Bkg6RiCqY7", "pdf": "https://openreview.net/pdf?id=Bkg6RiCqY7", "email": ";", "author_num": 2 }, { "title": "Optimistic mirror descent in saddle-point problems: Going the extra (gradient) mile", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/840", "id": "Bkg8jjC9KQ", "author_site": "Panayotis Mertikopoulos, Bruno Lecouat, Houssam Zenati, Chuan-Sheng Foo, Vijay Chandrasekhar, Georgios Piliouras", "tldr": "We show how the inclusion of an extra-gradient step in first-order GAN training methods can improve stability and lead to improved convergence results.", "abstract": "Owing to their connection with generative adversarial networks (GANs), saddle-point problems have recently attracted considerable interest in machine learning and beyond. By necessity, most theoretical guarantees revolve around convex-concave (or even linear) problems; however, making theoretical inroads towards efficient GAN training depends crucially on moving beyond this classic framework. To make piecemeal progress along these lines, we analyze the behavior of mirror descent (MD) in a class of non-monotone problems whose solutions coincide with those of a naturally associated variational inequality \u2013 a property which we call coherence. 
We first show that ordinary, \u201cvanilla\u201d MD converges under a strict version of this condition, but not otherwise; in particular, it may fail to converge even in bilinear models with a unique solution. We then show that this deficiency is mitigated by optimism: by taking an \u201cextra-gradient\u201d step, optimistic mirror descent (OMD) converges in all coherent problems. Our analysis generalizes and extends the results of Daskalakis et al. [2018] for optimistic gradient descent (OGD) in bilinear problems, and makes concrete headway for provable convergence beyond convex-concave games. We also provide stochastic analogues of these results, and we validate our analysis by numerical experiments in a wide array of GAN models (including Gaussian mixture models, and the CelebA and CIFAR-10 datasets).", "keywords": "Mirror descent;extra-gradient;generative adversarial networks;saddle-point problems", "primary_area": "", "supplementary_material": "", "author": "Panayotis Mertikopoulos;Bruno Lecouat;Houssam Zenati;Chuan-Sheng Foo;Vijay Chandrasekhar;Georgios Piliouras", "authorids": "panayotis.mertikopoulos@imag.fr;bruno_lecouat@i2r.a-star.edu.sg;houssam_zenati@i2r.a-star.edu.sg;foocs@i2r.a-star.edu.sg;vijay@i2r.a-star.edu.sg;georgios@sutd.edu.sg", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nmertikopoulos2018optimistic,\ntitle={Optimistic mirror descent in saddle-point problems: Going the extra(-gradient) mile},\nauthor={Panayotis Mertikopoulos and Bruno Lecouat and Houssam Zenati and Chuan-Sheng Foo and Vijay Chandrasekhar and Georgios Piliouras},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkg8jjC9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;5;3", "wc_review": "183;523;66", "wc_reply_reviewers": "0;158;0", "wc_reply_authors": "626;1043;35", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 257.3333333333333, 193.83211521543296 ], "wc_reply_reviewers_avg": [ 52.666666666666664, 74.481914284983 ], "wc_reply_authors_avg": [ 568.0, 413.55289867198366 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 366, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4411409906934175363&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=Bkg8jjC9KQ", "pdf": "https://openreview.net/pdf?id=Bkg8jjC9KQ", "email": ";;;;;", "author_num": 6 }, { "id": "Bkg93jC5YX", "title": "BLISS in Non-Isometric Embedding Spaces", "track": "main", "status": "Reject", "tldr": "A novel method to test for isometry between word embedding spaces, and a semi-supervised method for learning better mappings between them", "abstract": "Recent work on bilingual lexicon induction (BLI) has frequently depended either on aligned bilingual lexicons or on distribution matching, often with an assumption about the isometry of the two spaces. 
We propose a technique to quantitatively estimate this assumption of the isometry between two embedding spaces and empirically show that this assumption weakens as the languages in question become increasingly etymologically distant. We then propose Bilingual Lexicon Induction with Semi-Supervision (BLISS) --- a novel semi-supervised approach that relaxes the isometric assumption while leveraging both limited aligned bilingual lexicons and a larger set of unaligned word embeddings, as well as a novel hubness filtering technique. Our proposed method improves over strong baselines for 11 of 14 on the MUSE dataset, particularly for languages whose embedding spaces do not appear to be isometric. In addition, we also show that adding supervision stabilizes the learning procedure, and is effective even with minimal supervision.", "keywords": "bilingual lexicon induction;semi-supervised methods;embeddings", "primary_area": "", "supplementary_material": "", "author": "Barun Patra;Joel Ruben Antony Moniz;Sarthak Garg;Matthew R Gormley;Graham Neubig", "authorids": "bpatra@andrew.cmu.edu;jrmoniz@andrew.cmu.edu;sarthakg@andrew.cmu.edu;mgormley@andrew.cmu.edu;gneubig@andrew.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\npatra2019bliss,\ntitle={{BLISS} in Non-Isometric Embedding Spaces},\nauthor={Barun Patra and Joel Ruben Antony Moniz and Sarthak Garg and Matthew R Gormley and Graham Neubig},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkg93jC5YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Bkg93jC5YX", "pdf_size": 0, "rating": "4;6;6", "confidence": "5;4;5", "wc_review": "493;228;578", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "672;219;814", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 433.0, 149.05256343540916 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 568.3333333333334, 253.72732520474725 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16855197828505939369&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "DialogWAE: Multimodal Response Generation with Conditional Wasserstein Auto-Encoder", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/743", "id": "BkgBvsC9FQ", "author_site": "Xiaodong Gu, Kyunghyun Cho, Jung-Woo Ha, Sunghun Kim", "tldr": "", "abstract": "Variational autoencoders (VAEs) have shown a promise in data-driven conversation modeling. However, most VAE conversation models match the approximate posterior distribution over the latent variables to a simple prior such as standard normal distribution, thereby restricting the generated responses to a relatively simple (e.g., single-modal) scope. In this paper, we propose DialogWAE, a conditional Wasserstein autoencoder (WAE) specially designed for dialogue modeling. Unlike VAEs that impose a simple distribution over the latent variables, DialogWAE models the distribution of data by training a GAN within the latent variable space. 
Specifically, our model samples from the prior and posterior distributions over the latent variables by transforming context-dependent random noise using neural networks and minimizes the Wasserstein distance between the two distributions. We further develop a Gaussian mixture prior network to enrich the latent space. Experiments on two popular datasets show that DialogWAE outperforms the state-of-the-art approaches in generating more coherent, informative and diverse responses.", "keywords": "dialogue;GAN;VAE;WAE;chatbot", "primary_area": "", "supplementary_material": "", "author": "Xiaodong Gu;Kyunghyun Cho;Jung-Woo Ha;Sunghun Kim", "authorids": "guxiaodong1987@126.com;kyunghyun.cho@nyu.edu;jungwoo.ha@navercorp.com;hunkim@cse.ust.hk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ngu2018dialogwae,\ntitle={Dialog{WAE}: Multimodal Response Generation with Conditional Wasserstein Auto-Encoder},\nauthor={Xiaodong Gu and Kyunghyun Cho and Jung-Woo Ha and Sunghun Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgBvsC9FQ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=BkgBvsC9FQ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;3;4", "wc_review": "390;269;542", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "290;395;376", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 400.3333333333333, 111.69104211569022 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 353.6666666666667, 45.68247901426639 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 168, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3591710081490434640&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BkgBvsC9FQ", "pdf": "https://openreview.net/pdf?id=BkgBvsC9FQ", "email": ";;;", "author_num": 4 }, { "id": "BkgFqiAqFX", "title": "Recovering the Lowest Layer of Deep Networks with High Threshold Activations", "track": "main", "status": "Reject", "tldr": "We provably recover the lowest layer in a deep neural network assuming that the lowest layer uses a \"high threshold\" activation and the above network is a \"well-behaved\" polynomial.", "abstract": "Giving provable guarantees for learning neural networks is a core challenge of machine learning theory. Most prior work gives parameter recovery guarantees for one hidden layer networks, however, the networks used in practice have multiple non-linear layers. 
In this work, we show how we can strengthen such results to deeper networks -- we address the problem of uncovering the lowest layer in a deep neural network under the assumption that the lowest layer uses a high threshold before applying the activation, the upper network can be modeled as a well-behaved polynomial and the input distribution is Gaussian.", "keywords": "Deep Learning;Parameter Recovery;Non-convex optimization;high threshold activation", "primary_area": "", "supplementary_material": "", "author": "Surbhi Goel;Rina Panigrahy", "authorids": "surbhi@cs.utexas.edu;rinap@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngoel2019recovering,\ntitle={Recovering the Lowest Layer of Deep Networks with High Threshold Activations},\nauthor={Surbhi Goel and Rina Panigrahy},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgFqiAqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer5", "site": "https://openreview.net/forum?id=BkgFqiAqFX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "wc_review": "252;329;213", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "436;426;42", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 264.6666666666667, 48.19635763093399 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 301.3333333333333, 183.42179683874966 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BP4XSUC-vc0J:scholar.google.com/&scioq=Recovering+the+Lowest+Layer+of+Deep+Networks+with+High+Threshold+Activations&hl=en&as_sdt=0,5", "gs_version_total": 4 }, { "id": "BkgGmh09FQ", "title": "Understanding Opportunities for Efficiency in Single-image Super Resolution Networks", "track": "main", "status": "Reject", "tldr": "We build an understanding of resource-efficient techniques on Super-Resolution", "abstract": "A successful application of convolutional architectures is to increase the resolution of single low-resolution images -- an image restoration task called super-resolution (SR). Naturally, SR is of value to resource constrained devices like mobile phones, electronic photograph frames and televisions to enhance image quality. However, SR demands perhaps the most extreme amounts of memory and compute operations of any mainstream vision task known today, preventing SR from being deployed to devices that require them. In this paper, we perform an early systematic study of system resource efficiency for SR, within the context of a variety of architectural and low-precision approaches originally developed for discriminative neural networks. We present a rich set of insights, representative SR architectures, and efficiency trade-offs; for example, the prioritization of ways to compress models to reach a specific memory and computation target and techniques to compact SR models so that they are suitable for DSPs and FPGAs. As a result of doing so, we manage to achieve better and comparable performance with previous models in the existing literature, highlighting the practicality of using existing efficiency techniques in SR tasks.
Collectively, we believe these results provide the foundation for further research into the little explored area of resource efficiency for SR. ", "keywords": "Super-Resolution;Resource-Efficiency", "primary_area": "", "supplementary_material": "", "author": "Royson Lee;Nic Lane;Marko Stankovic;Sourav Bhattacharya", "authorids": "rs@roysonlee.com;nicholas.d.lane@gmail.com;marko.stankovic996@gmail.com;bsourav@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlee2019understanding,\ntitle={Understanding Opportunities for Efficiency in Single-image Super Resolution Networks},\nauthor={Royson Lee and Nic Lane and Marko Stankovic and Sourav Bhattacharya},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgGmh09FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BkgGmh09FQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;5;4", "wc_review": "141;273;667", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "509;543;769", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 360.3333333333333, 223.44176472231464 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 607.0, 115.38919649025496 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wZVfv042-2kJ:scholar.google.com/&scioq=Understanding+Opportunities+for+Efficiency+in+Single-image+Super+Resolution+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "No Training Required: Exploring Random Encoders for Sentence Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1087", "id": "BkgPajAcY7", "author_site": "John Wieting, Douwe Kiela", "tldr": "", "abstract": "We explore various methods for computing sentence representations from pre-trained word embeddings without any training, i.e., using nothing but random parameterizations. Our aim is to put sentence embeddings on more solid footing by 1) looking at how much modern sentence embeddings gain over random methods---as it turns out, surprisingly little; and by 2) providing the field with more appropriate baselines going forward---which are, as it turns out, quite strong.
We also make important observations about proper experimental protocol for sentence classification evaluation, together with recommendations for future research.", "keywords": "sentence embeddings", "primary_area": "", "supplementary_material": "", "author": "John Wieting;Douwe Kiela", "authorids": "jwieting@cs.cmu.edu;dkiela@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nwieting2018no,\ntitle={No Training Required: Exploring Random Encoders for Sentence Classification},\nauthor={John Wieting and Douwe Kiela},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgPajAcY7},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/randsent](https://github.com/facebookresearch/randsent)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;4", "wc_review": "403;681;275", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "841;529;135", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 453.0, 169.47762880883914 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 501.6666666666667, 288.87059771153974 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12787240152315433650&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BkgPajAcY7", "pdf": "https://openreview.net/pdf?id=BkgPajAcY7", "email": ";", "author_num": 2 }, { "id": "BkgVx3A9Km", "title": "A More Globally Accurate Dimensionality Reduction Method Using Triplets", "track": "main", "status": "Reject", "tldr": "A new dimensionality reduction method using triplets which is significantly faster than t-SNE and provides more accurate results globally", "abstract": "We first show that the commonly used dimensionality reduction (DR) methods such as t-SNE and LargeVis\npoorly capture the global structure of the data in the low-dimensional embedding. We show this via a number of tests for the DR methods that can be easily applied by any practitioner to the dataset at hand. Surprisingly enough, t-SNE performs the best w.r.t. the commonly used measures that reward the local neighborhood accuracy such as precision-recall while having the worst performance in our tests for global structure. We then contrast the performance of these two DR methods\nagainst our new method called TriMap. The main idea behind TriMap is to capture higher orders of structure with triplet information (instead of pairwise information used by t-SNE and LargeVis), and to minimize a robust loss function for satisfying the chosen triplets. We provide compelling experimental evidence on large natural datasets for the clear advantage of the TriMap DR results. Like LargeVis, TriMap is fast and provides comparable runtime on large datasets.", "keywords": "Dimensionality Reduction;Visualization;Triplets;t-SNE;LargeVis", "primary_area": "", "supplementary_material": "", "author": "Ehsan Amid;Manfred K. 
Warmuth", "authorids": "eamid@ucsc.edu;manfred@ucsc.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\namid2019a,\ntitle={A More Globally Accurate Dimensionality Reduction Method Using Triplets},\nauthor={Ehsan Amid and Manfred K. Warmuth},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgVx3A9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BkgVx3A9Km", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;5", "wc_review": "779;227;521", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 509.0, 225.51274908527898 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3572130037885300027&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Neural Graph Evolution: Towards Efficient Automatic Robot Design", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/919", "id": "BkgWHnR5tm", "author_site": "Tingwu Wang, Yuhao Zhou, Sanja Fidler, Jimmy Ba", "tldr": "Automatic robotic design search with graph neural networks", "abstract": "Despite the recent successes in robotic locomotion control, the design of robot relies heavily on human engineering. Automatic robot design has been a long studied subject, but the recent progress has been slowed due to the large combinatorial search space and the difficulty in evaluating the found candidates. To address the two challenges, we formulate automatic robot design as a graph search problem and perform evolution search in graph space. We propose Neural Graph Evolution (NGE), which performs selection on current candidates and evolves new ones iteratively. Different from previous approaches, NGE uses graph neural networks to parameterize the control policies, which reduces evaluation cost on new candidates with the help of skill transfer from previously evaluated designs. In addition, NGE applies Graph Mutation with Uncertainty (GM-UC) by incorporating model uncertainty, which reduces the search space by balancing exploration and exploitation. We show that NGE significantly outperforms previous methods by an order of magnitude. As shown in experiments, NGE is the first algorithm that can automatically discover kinematically preferred robotic graph structures, such as a fish with two symmetrical flat side-fins and a tail, or a cheetah with athletic front and back legs. 
Instead of using thousands of cores for weeks, NGE efficiently solves the search problem within a day on a single 64 CPU-core Amazon EC2\nmachine.\n", "keywords": "Reinforcement learning;graph neural networks;robotics;deep learning;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Tingwu Wang;Yuhao Zhou;Sanja Fidler;Jimmy Ba", "authorids": "tingwuwang@cs.toronto.edu;henryzhou@cs.toronto.edu;fidler@cs.toronto.edu;jba@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwang2018neural,\ntitle={Neural Graph Evolution: Automatic Robot Design},\nauthor={Tingwu Wang and Yuhao Zhou and Sanja Fidler and Jimmy Ba},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgWHnR5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;4;4", "wc_review": "662;423;187", "wc_reply_reviewers": "0;41;35", "wc_reply_authors": "619;783;399", "reply_reviewers": "0;1;1", "reply_authors": "1;2;1", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 424.0, 193.91922717117728 ], "wc_reply_reviewers_avg": [ 25.333333333333332, 18.080068829760823 ], "wc_reply_authors_avg": [ 600.3333333333334, 157.32203349252202 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2252025967426248193&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BkgWHnR5tm", "pdf": "https://openreview.net/pdf?id=BkgWHnR5tm", "email": ";;;", "author_num": 4 }, { "id": "BkgYIiAcFQ", "title": "DecayNet: A Study on the Cell States of Long Short Term Memories", "track": "main", "status": "Reject", "tldr": "We present an LSTM reformulation with a monotonically decreasing forget gate to increase LSTM interpretability and modelling power without introducing new learnable parameters.", "abstract": "It is unclear whether the extensively applied long-short term memory (LSTM) is an optimised architecture for recurrent neural networks. Its complicated design makes the network hard to analyse, and its utility on real-world data is not immediately clear. This paper studies LSTMs as systems of difference equations, and takes a theoretical mathematical approach to study consecutive transitions in network variables. Our study shows that the cell state propagation is predominantly controlled by the forget gate. Hence, we introduce DecayNets, LSTMs with monotonically decreasing forget gates, to calibrate cell state dynamics. With recurrent batch normalisation, DecayNet outperforms the previous state of the art for permuted sequential MNIST. The Decay mechanism is also beneficial for LSTM-based optimisers, and decreases optimisee neural network losses more rapidly.\n\nEdit status: Revised paper.", "keywords": "Long short term memory;Recurrent neural network;Dynamical systems;Difference equation", "primary_area": "", "supplementary_material": "", "author": "Nicholas I.H. Kuo;Mehrtash T. 
Harandi;Hanna Suominen;Nicolas Fourrier;Christian Walder;Gabriela Ferraro", "authorids": "u6424547@anu.edu.au;mehrtash.harandi@monash.edu;hanna.suominen@anu.edu.au;nicolas.fourrier@devinci.fr;christian.walder@data61.csiro.au;gabriela.ferraro@csiro.au", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nkuo2019decaynet,\ntitle={DecayNet: A Study on the Cell States of Long Short Term Memories},\nauthor={Nicholas I.H. Kuo and Mehrtash T. Harandi and Hanna Suominen and Nicolas Fourrier and Christian Walder and Gabriela Ferraro},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgYIiAcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkgYIiAcFQ", "pdf_size": 0, "rating": "4;4;8", "confidence": "4;4;3", "wc_review": "396;1135;217", "wc_reply_reviewers": "152;373;0", "wc_reply_authors": "1219;382;132", "reply_reviewers": "2;2;0", "reply_authors": "3;1;1", "rating_avg": [ 5.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 582.6666666666666, 397.33640938406614 ], "wc_reply_reviewers_avg": [ 175.0, 153.1426350389292 ], "wc_reply_authors_avg": [ 577.6666666666666, 464.8342595903285 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2450095190199363600&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BkgiM20cYX", "title": "A Self-Supervised Method for Mapping Human Instructions to Robot Policies", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we propose a modular approach which separates the instruction-to-action mapping procedure into two separate stages. The two stages are bridged via an intermediate representation called a goal, which stands for the result after a robot performs a specific task. \nThe first stage maps an input instruction to a goal, while the second stage maps the goal to an appropriate policy selected from a set of robot policies. The policy is selected with an aim to guide the robot to reach the goal as close as possible. We implement the above two stages as a framework consisting of two distinct modules: an instruction-goal mapping module and a goal-policy mapping module. Given a human instruction in the evaluation phase, the instruction-goal mapping module first translates the instruction to a robot-interpretable goal. Once a goal is derived by the instruction-goal mapping module, the goal-policy mapping module then follows up to search through the goal-policy pairs to look for policy to be mapped by the instruction. Our experimental results show that the proposed method is able to learn an effective instruction-to-action mapping procedure in an environment with a given instruction set more efficiently than the baselines. In addition to the impressive data-efficiency, the results also show that our method can be adapted to a new instruction set and a new robot action space much faster than the baselines. The evidence suggests that our modular approach does lead to better adaptability and efficiency. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hsin-Wei Yu;Po-Yu Wu;Chih-An Tsao;You-An Shen;Shih-Hsuan Lin;Zhang-Wei Hong;Yi-Hsiang Chang;Chun-Yi Lee", "authorids": "hsinweiyo@gmail.com;;;;;;;", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nyu2019a,\ntitle={A Self-Supervised Method for Mapping Human Instructions to Robot Policies},\nauthor={Hsin-Wei Yu and Po-Yu Wu and Chih-An Tsao and You-An Shen and Shih-Hsuan Lin and Zhang-Wei Hong and Yi-Hsiang Chang and Chun-Yi Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgiM20cYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkgiM20cYX", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;5;5", "wc_review": "703;484;508", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 565.0, 98.07140255956371 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3T1wZ4YB8MYJ:scholar.google.com/&scioq=A+Self-Supervised+Method+for+Mapping+Human+Instructions+to+Robot+Policies&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "id": "BkgosiRcKm", "title": "Deep Recurrent Gaussian Process with Variational Sparse Spectrum Approximation", "track": "main", "status": "Reject", "tldr": "Modeling time-series with several Gaussian Processes in a row via a specific Variational Sparse Spectrum Approximation", "abstract": "Modeling sequential data has become more and more important in practice. Some applications are autonomous driving, virtual sensors and weather forecasting. To model such systems, so called recurrent models are frequently used. In this paper we introduce several new Deep Recurrent Gaussian Process (DRGP) models based on the Sparse Spectrum Gaussian Process (SSGP) and the improved version, called Variational Sparse Spectrum Gaussian Process (VSSGP). We follow the recurrent structure given by an existing DRGP based on a specific variational sparse Nystr\u00f6m approximation, the recurrent Gaussian Process (RGP). Similar to previous work, we also variationally integrate out the input-space and hence can propagate uncertainty through the Gaussian Process (GP) layers. Our approach can deal with a larger class of covariance functions than the RGP, because its spectral nature allows variational integration in all stationary cases. Furthermore, we combine the (Variational) Sparse Spectrum ((V)SS) approximations with a well known inducing-input regularization framework. For the DRGP extension of these combined approximations and the simple (V)SS approximations an optimal variational distribution exists. 
We improve over current state of the art methods in prediction accuracy for experimental data-sets used for their evaluation and introduce a new data-set for engine control, named Emission.", "keywords": "Deep Gaussian Process Model;Recurrent Model;State-Space Model;Nonlinear system identification;Dynamical modeling", "primary_area": "", "supplementary_material": "", "author": "Roman F\u00f6ll;Bernard Haasdonk;Markus Hanselmann;Holger Ulmer", "authorids": "foell@mathematik.uni-stuttgart.de;haasdonk@mathematik.uni-stuttgart.de;markus.hanselmann@etas.com;holger.ulmer@etas.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nf\u00f6ll2019deep,\ntitle={Deep Recurrent Gaussian Process with Variational Sparse Spectrum Approximation},\nauthor={Roman F\u00f6ll and Bernard Haasdonk and Markus Hanselmann and Holger Ulmer},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgosiRcKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkgosiRcKm", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;2", "wc_review": "449;542;675", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "834;845;570", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 555.3333333333334, 92.7445715692059 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 749.6666666666666, 127.12286270463792 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3215182388457267610&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Function Space Particle Optimization for Bayesian Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1119", "id": "BkgtDsCcKQ", "author_site": "Ziyu Wang, Tongzheng Ren, Jun Zhu, Bo Zhang", "tldr": "", "abstract": "While Bayesian neural networks (BNNs) have drawn increasing attention, their posterior inference remains challenging, due to the high-dimensional and over-parameterized nature. To address this issue, several highly flexible and scalable variational inference procedures based on the idea of particle optimization have been proposed. These methods directly optimize a set of particles to approximate the target posterior. However, their application to BNNs often yields sub-optimal performance, as such methods have a particular failure mode on over-parameterized models. In this paper, we propose to solve this issue by performing particle optimization directly in the space of regression functions. 
We demonstrate through extensive experiments that our method successfully overcomes this issue, and outperforms strong baselines in a variety of tasks including prediction, defense against adversarial examples, and reinforcement learning.", "keywords": "Bayesian neural networks;uncertainty estimation;variational inference", "primary_area": "", "supplementary_material": "", "author": "Ziyu Wang;Tongzheng Ren;Jun Zhu;Bo Zhang", "authorids": "wzy196@gmail.com;rtz19970824@gmail.com;dcszj@mail.tsinghua.edu.cn;dcszb@mail.tsinghua.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwang2018function,\ntitle={Function Space Particle Optimization for Bayesian Neural Networks},\nauthor={Ziyu Wang and Tongzheng Ren and Jun Zhu and Bo Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgtDsCcKQ},\n}", "github": "[![github](/images/github_icon.svg) thu-ml/fpovi](https://github.com/thu-ml/fpovi)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;3;4", "wc_review": "400;291;910", "wc_reply_reviewers": "29;0;431", "wc_reply_authors": "975;711;1717", "reply_reviewers": "1;0;1", "reply_authors": "3;1;3", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 533.6666666666666, 269.8028086502354 ], "wc_reply_reviewers_avg": [ 153.33333333333334, 196.696607890313 ], "wc_reply_authors_avg": [ 1134.3333333333333, 425.871133351654 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3265058804151062573&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=BkgtDsCcKQ", "pdf": "https://openreview.net/pdf?id=BkgtDsCcKQ", "email": ";;;", "author_num": 4 }, { "title": "Structured Adversarial Attack: Towards General Implementation and Better Interpretability", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/859", "id": "BkgzniCqY7", "author_site": "Kaidi Xu, Sijia Liu, Pu Zhao, Pin-Yu Chen, Huan Zhang, Quanfu Fan, Deniz Erdogmus, Yanzhi Wang, Xue Lin", "tldr": "", "abstract": "When generating adversarial examples to attack deep neural networks (DNNs), Lp norm of the added perturbation is usually used to measure the similarity between original image and adversarial example. However, such adversarial attacks perturbing the raw input spaces may fail to capture structural information hidden in the input. This work develops a more general attack model, i.e., the structured attack (StrAttack), which explores group sparsity in adversarial perturbation by sliding a mask through images aiming for extracting key spatial structures. An ADMM (alternating direction method of multipliers)-based framework is proposed that can split the original problem into a sequence of analytically solvable subproblems and can be generalized to implement other attacking methods. Strong group sparsity is achieved in adversarial perturbations even with the same level of Lp-norm distortion (p\u2208 {1,2,\u221e}) as the state-of-the-art attacks. 
We demonstrate the effectiveness of StrAttack by extensive experimental results on MNIST, CIFAR-10 and ImageNet. We also show that StrAttack provides better interpretability (i.e., better correspondence with discriminative image regions) through adversarial saliency map (Papernot et al., 2016b) and class activation map (Zhou et al., 2016).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kaidi Xu;Sijia Liu;Pu Zhao;Pin-Yu Chen;Huan Zhang;Quanfu Fan;Deniz Erdogmus;Yanzhi Wang;Xue Lin", "authorids": "xu.kaid@husky.neu.edu;sijia.liu@ibm.com;zhao.pu@husky.neu.edu;pin-yu.chen@ibm.com;ecezhang@ucdavis.edu;qfan@us.ibm.com;erdogmus@ece.neu.edu;yanz.wang@northeastern.edu;xue.lin@northeastern.edu", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nxu2018structured,\ntitle={Structured Adversarial Attack: Towards General Implementation and Better Interpretability},\nauthor={Kaidi Xu and Sijia Liu and Pu Zhao and Pin-Yu Chen and Huan Zhang and Quanfu Fan and Deniz Erdogmus and Yanzhi Wang and Xue Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkgzniCqY7},\n}", "github": "[![github](/images/github_icon.svg) KaidiXu/StrAttack](https://github.com/KaidiXu/StrAttack)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;3;2", "wc_review": "229;201;173", "wc_reply_reviewers": "38;0;18", "wc_reply_authors": "890;129;390", "reply_reviewers": "1;0;1", "reply_authors": "2;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 201.0, 22.861904265976328 ], "wc_reply_reviewers_avg": [ 18.666666666666668, 15.520595635763755 ], "wc_reply_authors_avg": [ 469.6666666666667, 315.74286303185943 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 193, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2416957312060244972&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BkgzniCqY7", "pdf": "https://openreview.net/pdf?id=BkgzniCqY7", "email": ";;;;;;;;", "author_num": 9 }, { "title": "Spherical CNNs on Unstructured Grids", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/708", "id": "Bkl-43C9FQ", "author_site": "Chiyu Jiang, Jingwei Huang, Karthik Kashinath, Mr Prabhat, Philip Marcus, Matthias Niessner", "tldr": "We present a new CNN kernel for unstructured grids for spherical signals, and show significant accuracy and parameter efficiency gain on tasks such as 3D classification and omnidirectional image segmentation.", "abstract": "We present an efficient convolution kernel for Convolutional Neural Networks (CNNs) on unstructured grids using parameterized differential operators while focusing on spherical signals such as panorama images or planetary signals. \nTo this end, we replace conventional convolution kernels with linear combinations of differential operators that are weighted by learnable parameters. 
Differential operators can be efficiently estimated on unstructured grids using one-ring neighbors, and learnable parameters can be optimized through standard back-propagation. As a result, we obtain extremely efficient neural networks that match or outperform state-of-the-art network architectures in terms of performance but with a significantly lower number of network parameters. We evaluate our algorithm in an extensive series of experiments on a variety of computer vision and climate science tasks, including shape classification, climate pattern segmentation, and omnidirectional image semantic segmentation. Overall, we present (1) a novel CNN approach on unstructured grids using parameterized differential operators for spherical signals, and (2) we show that our unique kernel parameterization allows our model to achieve the same or higher accuracy with significantly fewer network parameters.", "keywords": "Spherical CNN;unstructured grid;panoramic;semantic segmentation;parameter efficiency", "primary_area": "", "supplementary_material": "", "author": "Chiyu Max Jiang;Jingwei Huang;Karthik Kashinath;Prabhat;Philip Marcus;Matthias Niessner", "authorids": "chiyu.jiang@berkeley.edu;jingweih@stanford.edu;kkashinath@lbl.gov;prabhat@lbl.gov;pmarcus@me.berkeley.edu;niessner@tum.de", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\njiang2018spherical,\ntitle={Spherical {CNN}s on Unstructured Grids},\nauthor={Chiyu Max Jiang and Jingwei Huang and Karthik Kashinath and Prabhat and Philip Marcus and Matthias Niessner},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkl-43C9FQ},\n}", "github": "[![github](/images/github_icon.svg) maxjiang93/ugscnn](https://github.com/maxjiang93/ugscnn)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;5;3", "wc_review": "284;585;488", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "159;515;442", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 452.3333333333333, 125.44409998968553 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 372.0, 153.5339267610474 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 215, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8988090417232263617&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=Bkl-43C9FQ", "pdf": "https://openreview.net/pdf?id=Bkl-43C9FQ", "email": ";;;;;", "author_num": 6 }, { "id": "Bkl2SjCcKQ", "title": "TequilaGAN: How To Easily Identify GAN Samples", "track": "main", "status": "Reject", "tldr": "We show strategies to easily identify fake samples generated with the Generative Adversarial Network framework.", "abstract": "In this paper we show strategies to easily identify fake samples generated with the Generative Adversarial Network framework. One strategy is based on the statistical analysis and comparison of raw pixel values and features extracted from them. 
The other strategy learns formal specifications from the real data and shows that fake samples violate the specifications of the real data. We show that fake samples produced with GANs have a universal signature that can be used to identify fake samples. We provide results on MNIST, CIFAR10, music and speech data.", "keywords": "Generative Adversarial Networks;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Rafael Valle;Wilson Cai;Anish P. Doshi", "authorids": "rafaelvalle@berkeley.edu;wcai@berkeley.edu;apdoshi@berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nvalle2019tequilagan,\ntitle={Tequila{GAN}: How To Easily Identify {GAN} Samples},\nauthor={Rafael Valle and Wilson Cai and Anish P. Doshi},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkl2SjCcKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Bkl2SjCcKQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;5", "wc_review": "478;648;361", "wc_reply_reviewers": "586;0;0", "wc_reply_authors": "527;258;350", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 495.6666666666667, 117.83132954449009 ], "wc_reply_reviewers_avg": [ 195.33333333333334, 276.24304918354454 ], "wc_reply_authors_avg": [ 378.3333333333333, 111.63133769491233 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17496982026447234223&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "Bkl87h09FX", "title": "Looking for ELMo's friends: Sentence-Level Pretraining Beyond Language Modeling", "track": "main", "status": "Reject", "tldr": "We compare many tasks and task combinations for pretraining sentence-level BiLSTMs for NLP tasks. Language modeling is the best single pretraining task, but simple baselines also do well.", "abstract": "Work on the problem of contextualized word representation\u2014the development of reusable neural network components for sentence understanding\u2014has recently seen a surge of progress centered on the unsupervised pretraining task of language modeling with methods like ELMo (Peters et al., 2018). This paper contributes the first large-scale systematic study comparing different pretraining tasks in this context, both as complements to language modeling and as potential alternatives. The primary results of the study support the use of language modeling as a pretraining task and set a new state of the art among comparable models using multitask learning with language models. However, a closer look at these results reveals worryingly strong baselines and strikingly varied results across target tasks, suggesting that the widely-used paradigm of pretraining and freezing sentence encoders may not be an ideal platform for further work.\n", "keywords": "natural language processing;transfer learning;multitask learning", "primary_area": "", "supplementary_material": "", "author": "Samuel R. Bowman;Ellie Pavlick;Edouard Grave;Benjamin Van Durme;Alex Wang;Jan Hula;Patrick Xia;Raghavendra Pappagari;R. 
Thomas McCoy;Roma Patel;Najoung Kim;Ian Tenney;Yinghui Huang;Katherin Yu;Shuning Jin;Berlin Chen", "authorids": "bowman@nyu.edu;ellie_pavlick@brown.edu;egrave@fb.com;vandurme@cs.jhu.edu;alexwang@nyu.edu;jan.hula21@gmail.com;paxia@jhu.edu;raghu1991.p@gmail.com;tom.mccoy@jhu.edu;romapatel@brown.edu;n.kim@jhu.edu;iftenney@google.com;huangyi@us.ibm.com;yukatherin@fb.com;jinxx596@d.umn.edu;bchen6@swarthmore.edu", "gender": ";;;;;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;;;;", "aff": ";;;;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;;;", "position": ";;;;;;;;;;;;;;;", "bibtex": "@misc{\nbowman2019looking,\ntitle={Looking for {ELM}o's friends: Sentence-Level Pretraining Beyond Language Modeling},\nauthor={Samuel R. Bowman and Ellie Pavlick and Edouard Grave and Benjamin Van Durme and Alex Wang and Jan Hula and Patrick Xia and Raghavendra Pappagari and R. Thomas McCoy and Roma Patel and Najoung Kim and Ian Tenney and Yinghui Huang and Katherin Yu and Shuning Jin and Berlin Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkl87h09FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Bkl87h09FX", "pdf_size": 0, "rating": "5;7;8", "confidence": "3;4;4", "wc_review": "226;204;156", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "339;102;176", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 195.33333333333334, 29.227080289043965 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 205.66666666666666, 99.00280579637909 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 16, 0 ], "corr_rating_confidence": 0.9449111825230683, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7055120592593216368&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "BklACjAqFm", "title": "Successor Uncertainties: exploration and uncertainty in temporal difference learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider the problem of balancing exploration and exploitation in sequential decision making problems. This trade-off naturally lends itself to probabilistic modelling. For a probabilistic approach to be effective, considering uncertainty about all immediate and long-term consequences of agent's actions is vital. An estimate of such uncertainty can be leveraged to guide exploration even in situations where the agent needs to perform a potentially long sequence of actions before reaching an under-explored area of the environment. This observation was made by the authors of the Uncertainty Bellman Equation model (O'Donoghue et al., 2018), which explicitly considers full marginal uncertainty for each decision the agent faces. However, their model still considers a fully factorised posterior over the consequences of each action, meaning that dependencies vital for correlated long-term exploration are ignored. We go a step beyond and develop Successor Uncertainties, a probabilistic model for the state-action value function of a Markov Decision Process with a non-factorised covariance. 
We demonstrate how this leads to greatly improved performance on classic tabular exploration benchmarks and show strong performance of our method on a subset of Atari baselines. Overall, Successor Uncertainties provides a better probabilistic model for temporal difference learning at a similar computational cost to its predecessors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "David Janz;Jiri Hron;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato;Katja Hofmann;Sebastian Tschiatschek", "authorids": "david.janz93@gmail.com;jh2084@cam.ac.uk;jmh233@cam.ac.uk;katja.hofmann@microsoft.com;sebastian.tschiatschek@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\njanz2019successor,\ntitle={Successor Uncertainties: exploration and uncertainty in temporal difference learning},\nauthor={David Janz and Jiri Hron and Jos\u00e9 Miguel Hern\u00e1ndez-Lobato and Katja Hofmann and Sebastian Tschiatschek},\nyear={2019},\nurl={https://openreview.net/forum?id=BklACjAqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BklACjAqFm", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;3;4", "wc_review": "611;186;381", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "61;61;70", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 392.6666666666667, 173.70153200885196 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 64.0, 4.242640687119285 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6430587896486635247&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 9 }, { "id": "BklAEsR5t7", "title": "Large-scale classification of structured objects using a CRF with deep class embedding", "track": "main", "status": "Reject", "tldr": "We present a technique for ultrafine-grained, large-scale structured classification, based on CRF modeling with factorized pairwise potentials, learned as neighboring class embedding in a whitened space.", "abstract": "This paper presents a novel deep learning architecture for classifying structured objects in ultrafine-grained datasets, where classes may not be clearly distinguishable by their appearance but rather by their context. We model sequences of images as linear-chain CRFs, and jointly learn the parameters from both local-visual features and neighboring class information. The visual features are learned by convolutional layers, whereas class-structure information is reparametrized by factorizing the CRF pairwise potential matrix. This forms a context-based semantic similarity space, learned alongside the visual similarities, and dramatically increases the learning capacity of contextual information. This new parametrization, however, forms a highly nonlinear objective function which is challenging to optimize. To overcome this, we develop a novel surrogate likelihood which allows for a local likelihood approximation of the original CRF with integrated batch-normalization. 
This model overcomes the difficulties of existing CRF methods to learn the contextual relationships thoroughly when there is a large number of classes and the data is sparse. The performance of the proposed method is illustrated on a huge dataset that contains images of retail-store product displays, and shows significantly improved results compared to linear CRF parametrization, unnormalized likelihood optimization, and RNN modeling.", "keywords": "large-scale structure prediction;likelihood approximation;deep class embedding", "primary_area": "", "supplementary_material": "", "author": "Eran Goldman;Jacob Goldberger", "authorids": "eg4000@gmail.com;jacob.goldberger@biu.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngoldman2019largescale,\ntitle={Large-scale classification of structured objects using a {CRF} with deep class embedding},\nauthor={Eran Goldman and Jacob Goldberger},\nyear={2019},\nurl={https://openreview.net/forum?id=BklAEsR5t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BklAEsR5t7", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;3;4", "wc_review": "499;152;681", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 444.0, 219.43715881014015 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12419771015060741275&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "BklAyh05YQ", "title": "Neural Network Bandit Learning by Last Layer Marginalization", "track": "main", "status": "Withdraw", "tldr": "This paper proposes a new method for neural network learning in online bandit settings by marginalizing over the last layer", "abstract": "We propose a new method for training neural networks online in a bandit setting. Similar to prior work, we model the uncertainty only in the last layer of the network, treating the rest of the network as a feature extractor. This allows us to successfully balance between exploration and exploitation due to the efficient, closed-form uncertainty estimates available for linear models. To train the rest of the network, we take advantage of the posterior we have over the last layer, optimizing over all values in the last layer distribution weighted by probability. 
We derive a closed form, differential approximation to this objective and show empirically over various models and datasets that training the rest of the network in this fashion leads to both better online and offline performance when compared to other methods.", "keywords": "Bandit learning;online learning;contextual bandits;neural network learning in online settings", "primary_area": "", "supplementary_material": "", "author": "Noah Weber;Janez Starc;Arpit Mittal;Roi Blanco;Lluis Marquez", "authorids": "nwweber@cs.stonybrook.edu;janez.j.starc@gmail.com;arpit.mittal@yahoo.com;roiblan@amazon.es;lluismv@amazon.es", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=BklAyh05YQ", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:AAuYgy2aH34J:scholar.google.com/&scioq=Neural+Network+Bandit+Learning+by+Last+Layer+Marginalization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Optimal Transport Maps For Distribution Preserving Operations on Latent Spaces of Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1114", "id": "BklCusRct7", "author_site": "Eirikur Agustsson, Alexander Sage, Radu Timofte, Luc Van Gool", "tldr": "We propose a framework for modifying the latent space operations such that the distribution mismatch between the resulting outputs and the prior distribution the generative model was trained on is fully eliminated.", "abstract": "Generative models such as Variational Auto Encoders (VAEs) and Generative Adversarial Networks (GANs) are typically trained for a fixed prior distribution in the latent space, such as uniform or Gaussian. After a trained model is obtained, one can sample the Generator in various forms for exploration and understanding, such as interpolating between two samples, sampling in the vicinity of a sample or exploring differences between a pair of samples applied to a third sample. However, the latent space operations commonly used in the literature so far induce a distribution mismatch between the resulting outputs and the prior distribution the model was trained on. Previous works have attempted to reduce this mismatch with heuristic modification to the operations or by changing the latent distribution and re-training models. In this paper, we propose a framework for modifying the latent space operations such that the distribution mismatch is fully eliminated. Our approach is based on optimal transport maps, which adapt the latent space operations such that they fully match the prior distribution, while minimally modifying the original operation. 
Our matched operations are readily obtained for the commonly used operations and distributions and require no adjustment to the training procedure.", "keywords": "generative models;optimal transport;distribution preserving operations", "primary_area": "", "supplementary_material": "", "author": "Eirikur Agustsson;Alexander Sage;Radu Timofte;Luc Van Gool", "authorids": "aeirikur@vision.ee.ethz.ch;alexander.sage@gmail.com;radu.timofte@vision.ee.ethz.ch;vangool@vision.ee.ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nagustsson2018optimal,\ntitle={Optimal Transport Maps For Distribution Preserving Operations on Latent Spaces of Generative Models},\nauthor={Eirikur Agustsson and Alexander Sage and Radu Timofte and Luc Van Gool},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BklCusRct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;3;5", "wc_review": "237;310;407", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "482;280;288", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 318.0, 69.6323679524592 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 350.0, 93.39521757920298 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11631258824141665090&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=BklCusRct7", "pdf": "https://openreview.net/pdf?id=BklCusRct7", "email": ";;;", "author_num": 4 }, { "title": "Deep Lagrangian Networks: Using Physics as Model Prior for Deep Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/916", "id": "BklHpjCqKm", "author_site": "Michael Lutter, Christian Ritter, Jan Peters", "tldr": "This paper introduces a physics prior for Deep Learning and applies the resulting network topology for model-based control.", "abstract": "Deep learning has achieved astonishing results on many tasks with large amounts of data and generalization within the proximity of training data. For many important real-world applications, these requirements are unfeasible and additional prior knowledge on the task domain is required to overcome the resulting problems. In particular, learning physics models for model-based control requires robust extrapolation from fewer samples \u2013 often collected online in real-time \u2013 and model errors may lead to drastic damages of the system.\nDirectly incorporating physical insight has enabled us to obtain a novel deep model learning approach that extrapolates well while requiring fewer samples. As a first example, we propose Deep Lagrangian Networks (DeLaN) as a deep network structure upon which Lagrangian Mechanics have been imposed. DeLaN can learn the equations of motion of a mechanical system (i.e., system dynamics) with a deep network efficiently while ensuring physical plausibility.\nThe resulting DeLaN network performs very well at robot tracking control. 
The proposed method not only outperforms previous model learning approaches in learning speed but also exhibits substantially improved and more robust extrapolation to novel trajectories, and it learns online in real time.", "keywords": "Deep Model Learning;Robot Control", "primary_area": "", "supplementary_material": "", "author": "Michael Lutter;Christian Ritter;Jan Peters", "authorids": "michael@robot-learning.de;ritter@stud.tu-darmstadt.de;peters@ias.tu-darmstadt.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlutter2018deep,\ntitle={Deep Lagrangian Networks: Using Physics as Model Prior for Deep Learning},\nauthor={Michael Lutter and Christian Ritter and Jan Peters},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BklHpjCqKm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=BklHpjCqKm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "4;7;7", "confidence": "5;3;4", "wc_review": "632;215;424", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "910;755;740", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 423.6666666666667, 170.23970029213385 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 801.6666666666666, 76.84761255599003 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 511, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7347447340504722974&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "openreview": "https://openreview.net/forum?id=BklHpjCqKm", "pdf": "https://openreview.net/pdf?id=BklHpjCqKm", "email": ";;", "author_num": 3 }, { "id": "BklKFo09YX", "title": "Mol-CycleGAN - a generative model for molecular optimization", "track": "main", "status": "Reject", "tldr": "We introduce Mol-CycleGAN - a new generative model for optimization of molecules to augment drug design.", "abstract": "Designing a molecule with desired properties is one of the biggest challenges in drug development, as it requires optimization of chemical compound structures with respect to many complex properties. To augment the compound design process we introduce Mol-CycleGAN -- a CycleGAN-based model that generates optimized compounds with a chemical scaffold of interest. Namely, given a molecule our model generates a structurally similar one with an optimized value of the considered property. We evaluate the performance of the model on selected optimization objectives related to structural properties (presence of halogen groups, number of aromatic rings) and to a physicochemical property (penalized logP). In the task of optimization of penalized logP of drug-like molecules our model significantly outperforms previous results. 
", "keywords": "generative adversarial networks;drug design;deep learning;molecule optimization", "primary_area": "", "supplementary_material": "", "author": "\u0141ukasz Maziarka;Agnieszka Pocha;Jan Kaczmarczyk;Micha\u0142 Warcho\u0142", "authorids": "l.maziarka@gmail.com;lamiane.chan@gmail.com;jan.kaczmarczyk@ardigen.com;michal.warchol@ardigen.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmaziarka2019molcyclegan,\ntitle={Mol-Cycle{GAN} - a generative model for molecular optimization},\nauthor={\u0141ukasz Maziarka and Agnieszka Pocha and Jan Kaczmarczyk and Micha\u0142 Warcho\u0142},\nyear={2019},\nurl={https://openreview.net/forum?id=BklKFo09YX},\n}", "github": "[![github](/images/github_icon.svg) ardigen/mol-cycle-gan](https://github.com/ardigen/mol-cycle-gan)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BklKFo09YX", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;4;5", "wc_review": "436;184;594", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 404.6666666666667, 168.84180630269137 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 318, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15110682066018366615&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 21 }, { "id": "BklMYjC9FQ", "title": "microGAN: Promoting Variety through Microbatch Discrimination", "track": "main", "status": "Reject", "tldr": "We use microbatch discrimination on multi-adversarial GANs to mitigate mode collapse.", "abstract": "We propose to tackle the mode collapse problem in generative adversarial networks (GANs) by using multiple discriminators and assigning a different portion of each minibatch, called microbatch, to each discriminator. We gradually change each discriminator's task from distinguishing between real and fake samples to discriminating samples coming from inside or outside its assigned microbatch by using a diversity parameter $\\alpha$. The generator is then forced to promote variety in each minibatch to make the microbatch discrimination harder to achieve by each discriminator. Thus, all models in our framework benefit from having variety in the generated set to reduce their respective losses. 
We show evidence that our solution promotes sample diversity since early training stages on multiple datasets.", "keywords": "adversarial training;gans", "primary_area": "", "supplementary_material": "", "author": "Goncalo Mordido;Haojin Yang;Christoph Meinel", "authorids": "goncalo.mordido@hpi.de;haojin.yang@hpi.de;christoph.meinel@hpi.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmordido2019microgan,\ntitle={micro{GAN}: Promoting Variety through Microbatch Discrimination},\nauthor={Goncalo Mordido and Haojin Yang and Christoph Meinel},\nyear={2019},\nurl={https://openreview.net/forum?id=BklMYjC9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BklMYjC9FQ", "pdf_size": 0, "rating": "3;3;6", "confidence": "3;3;3", "wc_review": "502;160;609", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 423.6666666666667, 191.489483320161 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZcKIL1UGDAsJ:scholar.google.com/&scioq=microGAN:+Promoting+Variety+through+Microbatch+Discrimination&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Accumulation Bit-Width Scaling For Ultra-Low Precision Training Of Deep Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/664", "id": "BklMjsRqY7", "author_site": "Charbel Sakr, Naigang Wang, Chia-Yu Chen, Jungwook Choi, Ankur Agrawal, Naresh Shanbhag, Kailash Gopalakrishnan", "tldr": "We present an analytical framework to determine accumulation bit-width requirements in all three deep learning training GEMMs and verify the validity and tightness of our method via benchmarking experiments.", "abstract": "Efforts to reduce the numerical precision of computations in deep learning training have yielded systems that aggressively quantize weights and activations, yet employ wide high-precision accumulators for partial sums in inner-product operations to preserve the quality of convergence. The absence of any framework to analyze the precision requirements of partial sum accumulations results in conservative design choices. This imposes an upper-bound on the reduction of complexity of multiply-accumulate units. We present a statistical approach to analyze the impact of reduced accumulation precision on deep learning training. Observing that a bad choice for accumulation precision results in loss of information that manifests itself as a reduction in variance in an ensemble of partial sums, we derive a set of equations that relate this variance to the length of accumulation and the minimum number of bits needed for accumulation. We apply our analysis to three benchmark networks: CIFAR-10 ResNet 32, ImageNet ResNet 18 and ImageNet AlexNet. In each case, with accumulation precision set in accordance with our proposed equations, the networks successfully converge to the single precision floating-point baseline. 
We also show that reducing accumulation precision further degrades the quality of the trained network, proving that our equations produce tight bounds. Overall this analysis enables precise tailoring of computation hardware to the application, yielding area- and power-optimal systems.", "keywords": "reduced precision floating-point;partial sum accumulation bit-width;deep learning;training", "primary_area": "", "supplementary_material": "", "author": "Charbel Sakr;Naigang Wang;Chia-Yu Chen;Jungwook Choi;Ankur Agrawal;Naresh Shanbhag;Kailash Gopalakrishnan", "authorids": "sakr2@illinois.edu;nwang@us.ibm.com;cchen@us.ibm.com;choij@us.ibm.com;ankuragr@us.ibm.com;shanbhag@illinois.edu;kailash@us.ibm.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nsakr2018accumulation,\ntitle={Accumulation Bit-Width Scaling For Ultra-Low Precision Training Of Deep Networks},\nauthor={Charbel Sakr and Naigang Wang and Chia-Yu Chen and Jungwook Choi and Ankur Agrawal and Naresh Shanbhag and Kailash Gopalakrishnan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BklMjsRqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;4", "wc_review": "238;208;661", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "600;412;1104", "reply_reviewers": "0;0;0", "reply_authors": "2;2;3", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 369.0, 206.83810093887442 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 705.3333333333334, 292.16129487360615 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16539260459869643539&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12, "openreview": "https://openreview.net/forum?id=BklMjsRqY7", "pdf": "https://openreview.net/pdf?id=BklMjsRqY7", "email": ";;;;;;", "author_num": 7 }, { "id": "BklUAoAcY7", "title": "Unsupervised Learning of Sentence Representations Using Sequence Consistency", "track": "main", "status": "Reject", "tldr": "Good sentence encoders can be learned by training them to distinguish between consistent and inconsistent (pairs of) sequences that are generated in an unsupervised manner.", "abstract": "Computing universal distributed representations of sentences is a fundamental task in natural language processing. We propose ConsSent, a simple yet surprisingly powerful unsupervised method to learn such representations by enforcing consistency constraints on sequences of tokens. We consider two classes of such constraints \u2013 sequences that form a sentence and between two sequences that form a sentence when merged. We learn sentence encoders by training them to distinguish between consistent and inconsistent examples, the latter being generated by randomly perturbing consistent examples in six different ways. 
Extensive evaluation on several transfer learning and linguistic probing tasks shows improved performance over strong unsupervised and supervised baselines, substantially surpassing them in several cases. Our best results are achieved by training sentence encoders in a multitask setting and by an ensemble of encoders trained on the individual tasks.", "keywords": "sentence representation;unsupervised learning;LSTM", "primary_area": "", "supplementary_material": "", "author": "Siddhartha Brahma", "authorids": "sidbrahma@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nbrahma2019unsupervised,\ntitle={Unsupervised Learning of Sentence Representations Using Sequence Consistency},\nauthor={Siddhartha Brahma},\nyear={2019},\nurl={https://openreview.net/forum?id=BklUAoAcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BklUAoAcY7", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "wc_review": "305;362;435", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "909;1030;1081", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 367.3333333333333, 53.206098230267635 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1006.6666666666666, 72.13105356841038 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4395349413841527218&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Deep Convolutional Networks as shallow Gaussian Processes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/751", "id": "Bklfsi0cKm", "author_site": "Adri\u00e0 Garriga-Alonso, Carl Edward Rasmussen, Laurence Aitchison", "tldr": "We show that CNNs and ResNets with appropriate priors on the parameters are Gaussian processes in the limit of infinitely many convolutional filters.", "abstract": "We show that the output of a (residual) CNN with an appropriate prior over the weights and biases is a GP in the limit of infinitely many convolutional filters, extending similar results for dense networks. For a CNN, the equivalent kernel can be computed exactly and, unlike \"deep kernels\", has very few parameters: only the hyperparameters of the original CNN. Further, we show that this kernel has two properties that allow it to be computed efficiently; the cost of evaluating the kernel for a pair of images is similar to a single forward pass through the original CNN with only one filter per layer. 
The kernel equivalent to a 32-layer ResNet obtains 0.84% classification error on MNIST, a new record for GP with a comparable number of parameters.", "keywords": "Gaussian process;CNN;ResNet;Bayesian", "primary_area": "", "supplementary_material": "", "author": "Adri\u00e0 Garriga-Alonso;Carl Edward Rasmussen;Laurence Aitchison", "authorids": "ag919@cam.ac.uk;cer54@cam.ac.uk;laurence.aitchison@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngarriga-alonso2018deep,\ntitle={Deep Convolutional Networks as shallow Gaussian Processes},\nauthor={Adri\u00e0 Garriga-Alonso and Carl Edward Rasmussen and Laurence Aitchison},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bklfsi0cKm},\n}", "github": "[![github](/images/github_icon.svg) rhaps0dy/convnets-as-gps](https://github.com/rhaps0dy/convnets-as-gps) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=Bklfsi0cKm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;5;8", "confidence": "4;5;3", "wc_review": "467;303;410", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "516;316;460", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 393.3333333333333, 67.98202376772521 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 430.6666666666667, 84.243034660967 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 321, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3896108923568012105&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Bklfsi0cKm", "pdf": "https://openreview.net/pdf?id=Bklfsi0cKm", "email": ";;", "author_num": 3 }, { "title": "Unsupervised Domain Adaptation for Distance Metric Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/694", "id": "BklhAj09K7", "author_site": "Kihyuk Sohn, Wenling Shang, Xiang Yu, Manmohan Chandraker", "tldr": "A new theory of unsupervised domain adaptation for distance metric learning and its application to face recognition across diverse ethnicity variations.", "abstract": "Unsupervised domain adaptation is a promising avenue to enhance the performance of deep neural networks on a target domain, using labels only from a source domain. However, the two predominant methods, domain discrepancy reduction learning and semi-supervised learning, are not readily applicable when source and target domains do not share a common label space. This paper addresses the above scenario by learning a representation space that retains discriminative power on both the (labeled) source and (unlabeled) target domains while keeping representations for the two domains well-separated. Inspired by a theoretical analysis, we first reformulate the disjoint classification task, where the source and target domains correspond to non-overlapping class labels, to a verification one. 
To handle both within and cross domain verifications, we propose a Feature Transfer Network (FTN) to separate the target feature space from the original source space while aligned with a transformed source space. Moreover, we present a non-parametric multi-class entropy minimization loss to further boost the discriminative power of FTNs on the target domain. In experiments, we first illustrate how FTN works in a controlled setting of adapting from MNIST-M to MNIST with disjoint digit classes between the two domains and then demonstrate the effectiveness of FTNs through state-of-the-art performances on a cross-ethnicity face recognition problem.\n", "keywords": "domain adaptation;distance metric learning;face recognition", "primary_area": "", "supplementary_material": "", "author": "Kihyuk Sohn;Wenling Shang;Xiang Yu;Manmohan Chandraker", "authorids": "kihyuk.sohn@gmail.com;wendyshang1208@gmail.com;xiangyu@nec-labs.com;manu@nec-labs.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsohn2018unsupervised,\ntitle={Unsupervised Domain Adaptation for Distance Metric Learning},\nauthor={Kihyuk Sohn and Wenling Shang and Xiang Yu and Manmohan Chandraker},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BklhAj09K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;8;8", "confidence": "4;4;5", "wc_review": "138;170;251", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "619;131;114", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 186.33333333333334, 47.55581516024676 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 288.0, 234.1552191745182 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8422183633615722259&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BklhAj09K7", "pdf": "https://openreview.net/pdf?id=BklhAj09K7", "email": ";;;", "author_num": 4 }, { "title": "A comprehensive, application-oriented study of catastrophic forgetting in DNNs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/656", "id": "BkloRs0qK7", "author_site": "Benedikt Pf\u00fclb, Alexander RT Gepperth", "tldr": "We check DNN models for catastrophic forgetting using a new evaluation scheme that reflects typical application conditions, with surprising results.", "abstract": "We present a large-scale empirical study of catastrophic forgetting (CF) in modern Deep Neural Network (DNN) models that perform sequential (or: incremental) learning.\nA new experimental protocol is proposed that takes into account typical constraints encountered in application scenarios.\nAs the investigation is empirical, we evaluate CF behavior on the hitherto largest number of visual classification datasets, from each of which we construct a representative number of Sequential Learning Tasks (SLTs) in close alignment to previous works on CF.\nOur results clearly indicate that there is no model that avoids CF for all investigated 
datasets and SLTs under application conditions. We conclude with a discussion of potential solutions and workarounds to CF, notably for the EWC and IMM models.", "keywords": "incremental learning;deep neural networks;catastrophic forgetting;sequential learning", "primary_area": "", "supplementary_material": "", "author": "B. Pf\u00fclb;A. Gepperth", "authorids": "benedikt.pfuelb@cs.hs-fulda.de;alexander.gepperth@cs.hs-fulda.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\npf\u00fclb2018a,\ntitle={A comprehensive, application-oriented study of catastrophic forgetting in {DNN}s},\nauthor={B. Pf\u00fclb and A. Gepperth},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkloRs0qK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;5;3", "wc_review": "115;364;642", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 373.6666666666667, 215.25540385525488 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 115, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4360661774443932111&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 18, "openreview": "https://openreview.net/forum?id=BkloRs0qK7", "pdf": "https://openreview.net/pdf?id=BkloRs0qK7", "email": ";", "author_num": 2 }, { "id": "BklpOo09tQ", "title": "EFFICIENT TWO-STEP ADVERSARIAL DEFENSE FOR DEEP NEURAL NETWORKS", "track": "main", "status": "Reject", "tldr": "We proposed a time-efficient defense method against one-step and iterative adversarial attacks.", "abstract": "In recent years, deep neural networks have demonstrated outstanding performance in many machine learning tasks. However, researchers have discovered that these state-of-the-art models are vulnerable to adversarial examples: legitimate examples altered by small perturbations that are unnoticeable to human eyes. Adversarial training, which augments the training data with adversarial examples during the training process, is a well-known defense to improve the robustness of the model against adversarial attacks. However, this robustness is only effective against the same attack method used for adversarial training. Madry et al. (2017) suggest that iterative multi-step adversarial attacks, and particularly projected gradient descent (PGD), may be considered the universal first-order adversary, and that applying adversarial training with PGD implies resistance against many other first-order attacks. However, the computational cost of adversarial training with PGD and other multi-step adversarial examples is much higher than that of adversarial training with simpler attack techniques. In this paper, we show how strong adversarial examples can be generated at a cost similar to that of only two runs of the fast gradient sign method (FGSM), allowing defense against adversarial attacks with a robustness level comparable to that of adversarial training with multi-step adversarial examples.
We empirically demonstrate the effectiveness of the proposed two-step defense approach against different attack methods and its improvements over existing defense strategies.", "keywords": "Adversarial Examples;Adversarial Training;FGSM;IFGSM;Robustness", "primary_area": "", "supplementary_material": "", "author": "Ting-Jui Chang;Yukun He;Peng Li", "authorids": "tingjui.chang@tamu.edu;dominiche@tamu.edu;pli@tamu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchang2019efficient,\ntitle={{EFFICIENT} {TWO}-{STEP} {ADVERSARIAL} {DEFENSE} {FOR} {DEEP} {NEURAL} {NETWORKS}},\nauthor={Ting-Jui Chang and Yukun He and Peng Li},\nyear={2019},\nurl={https://openreview.net/forum?id=BklpOo09tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BklpOo09tQ", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;3", "wc_review": "803;196;263", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 420.6666666666667, 271.7306673078244 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5746445656884778961&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Learning deep representations by mutual information estimation and maximization", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/649", "id": "Bklr3j0cKX", "author_site": "R Devon Hjelm, Alex Fedorov, Samuel Lavoie, Karan Grewal, Philip Bachman, Adam Trischler, Yoshua Bengio", "tldr": "We learn deep representation by maximizing mutual information, leveraging structure in the objective, and are able to compete with fully supervised classifiers with comparable architectures", "abstract": "This work investigates unsupervised learning of representations by maximizing mutual information between an input and the output of a deep neural network encoder. Importantly, we show that structure matters: incorporating knowledge about locality in the input into the objective can significantly improve a representation's suitability for downstream tasks. We further control characteristics of the representation by matching to a prior distribution adversarially. Our method, which we call Deep InfoMax (DIM), outperforms a number of popular unsupervised learning methods and compares favorably with fully-supervised learning on several classification tasks with some standard architectures.
DIM opens new avenues for unsupervised learning of representations and is an important step towards flexible formulations of representation learning objectives for specific end-goals.", "keywords": "representation learning;unsupervised learning;deep learning", "primary_area": "", "supplementary_material": "", "author": "R Devon Hjelm;Alex Fedorov;Samuel Lavoie-Marchildon;Karan Grewal;Phil Bachman;Adam Trischler;Yoshua Bengio", "authorids": "devon.hjelm@microsoft.com;eidos92@gmail.com;samuel.lavoie-marchildon@umontreal.ca;karang@cs.toronto.edu;phil.bachman@gmail.com;adam.trischler@microsoft.com;yoshua.umontreal@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nhjelm2018learning,\ntitle={Learning deep representations by mutual information estimation and maximization},\nauthor={R Devon Hjelm and Alex Fedorov and Samuel Lavoie-Marchildon and Karan Grewal and Phil Bachman and Adam Trischler and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bklr3j0cKX},\n}", "github": "[![github](/images/github_icon.svg) rdevon/DIM](https://github.com/rdevon/DIM) + [![Papers with Code](/images/pwc_icon.svg) 8 community implementations](https://paperswithcode.com/paper/?openreview=Bklr3j0cKX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;9", "confidence": "4;5;3", "wc_review": "200;688;795", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "161;1293;380", "reply_reviewers": "0;0;0", "reply_authors": "1;3;2", "rating_avg": [ 7.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 561.0, 258.97618938170103 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 611.3333333333334, 490.23282450507355 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 3384, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9102831258285751412&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Bklr3j0cKX", "pdf": "https://openreview.net/pdf?id=Bklr3j0cKX", "email": ";;;;;;", "author_num": 7 }, { "title": "Posterior Attention Models for Sequence to Sequence Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/858", "id": "BkltNhC9FX", "author_site": "Shiv Shankar, Sunita Sarawagi", "tldr": "Computing attention based on posterior distribution leads to more meaningful attention and better performance", "abstract": "Modern neural architectures critically rely on attention for mapping structured inputs to sequences. In this paper we show that prevalent attention architectures do not adequately model the dependence among the attention and output tokens across a predicted sequence.\nWe present an alternative architecture called Posterior Attention Models that after a principled factorization of the full joint distribution of the attention and output variables, proposes two major changes. First, the position where attention is marginalized is changed from the input to the output. 
Second, the attention propagated to the next decoding stage is a posterior attention distribution conditioned on the output. Empirically on five translation and two morphological inflection tasks the proposed posterior attention models yield better BLEU score and alignment accuracy than existing attention models.", "keywords": "posterior inference;attention;seq2seq learning;translation", "primary_area": "", "supplementary_material": "", "author": "Shiv Shankar;Sunita Sarawagi", "authorids": "sshankar@umass.edu;sunita@iitb.ac.in", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nshankar2018posterior,\ntitle={Posterior Attention Models for Sequence to Sequence Learning},\nauthor={Shiv Shankar and Sunita Sarawagi},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkltNhC9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;5;4", "wc_review": "415;660;341", "wc_reply_reviewers": "0;127;0", "wc_reply_authors": "324;286;55", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 8.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 472.0, 136.32559065218337 ], "wc_reply_reviewers_avg": [ 42.333333333333336, 59.86837414046102 ], "wc_reply_authors_avg": [ 221.66666666666666, 118.86780706127102 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5382157066482592082&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BkltNhC9FX", "pdf": "https://openreview.net/pdf?id=BkltNhC9FX", "email": ";", "author_num": 2 }, { "id": "Bklzkh0qFm", "title": "Relational Graph Attention Networks", "track": "main", "status": "Reject", "tldr": "We propose a new model for relational graphs and evaluate it on relational transductive and inductive tasks.", "abstract": "We investigate Relational Graph Attention Networks, a class of models that extends non-relational graph attention mechanisms to incorporate relational information, opening up these methods to a wider variety of problems. A thorough evaluation of these models is performed, and comparisons are made against established benchmarks. To provide a meaningful comparison, we retrain Relational Graph Convolutional Networks, the spectral counterpart of Relational Graph Attention Networks, and evaluate them under the same conditions. We find that Relational Graph Attention Networks perform worse than anticipated, although some configurations are marginally beneficial for modelling molecular properties. We provide insights as to why this may be, and suggest both modifications to evaluation strategies, as well as directions to investigate for future work.", "keywords": "RGCN;attention;graph convolutional networks;semi-supervised learning;graph classification;molecules", "primary_area": "", "supplementary_material": "", "author": "Dan Busbridge;Dane Sherburn;Pietro Cavallo;Nils Y. 
Hammerla", "authorids": "dan.busbridge@gmail.com;danesherbs@gmail.com;p.cavallo85@gmail.com;nils.hammerla@babylonhealth.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbusbridge2019relational,\ntitle={Relational Graph Attention Networks},\nauthor={Dan Busbridge and Dane Sherburn and Pietro Cavallo and Nils Y. Hammerla},\nyear={2019},\nurl={https://openreview.net/forum?id=Bklzkh0qFm},\n}", "github": "[![github](/images/github_icon.svg) Babylonpartners/rgat](https://github.com/Babylonpartners/rgat) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Bklzkh0qFm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Bklzkh0qFm", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;5;4", "wc_review": "144;161;317", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "802;483;736", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 207.33333333333334, 77.85599241905247 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 673.6666666666666, 137.48777723451963 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 258, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16466335675086615136&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 4 }, { "title": "Generative Question Answering: Learning to Answer the Whole Question", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1020", "id": "Bkx0RjA9tX", "author_site": "Mike Lewis, Angela Fan", "tldr": "Question answering models that model the joint distribution of questions and answers can learn more than discriminative models", "abstract": "Discriminative question answering models can overfit to superficial biases in datasets, because their loss function saturates when any clue makes the answer likely. We introduce generative models of the joint distribution of questions and answers, which are trained to explain the whole question, not just to answer it.Our question answering (QA) model is implemented by learning a prior over answers, and a conditional language model to generate the question given the answer\u2014allowing scalable and interpretable many-hop reasoning as the question is generated word-by-word. Our model achieves competitive performance with specialised discriminative models on the SQUAD and CLEVR benchmarks, indicating that it is a more general architecture for language understanding and reasoning than previous work. The model greatly improves generalisation both from biased training data and to adversarial testing data, achieving a new state-of-the-art on ADVERSARIAL SQUAD. 
We will release our code.", "keywords": "Question answering;question generation;reasoning;squad;clevr", "primary_area": "", "supplementary_material": "", "author": "Mike Lewis;Angela Fan", "authorids": "mikelewis@fb.com;angelafan@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nlewis2018generative,\ntitle={Generative Question Answering: Learning to Answer the Whole Question},\nauthor={Mike Lewis and Angela Fan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkx0RjA9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "wc_review": "336;648;498", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "420;785;313", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 494.0, 127.40486646906389 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 506.0, 202.06104688105194 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9070819050812325363&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Bkx0RjA9tX", "pdf": "https://openreview.net/pdf?id=Bkx0RjA9tX", "email": ";", "author_num": 2 }, { "id": "Bkx8OiRcYX", "title": "Countdown Regression: Sharp and Calibrated Survival Predictions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Personalized probabilistic forecasts of time to event (such as mortality) can be crucial in decision making, especially in the clinical setting. Inspired by ideas from the meteorology literature, we approach this problem through the paradigm of maximizing sharpness of prediction distributions, subject to calibration. In regression problems, it has been shown that optimizing the continuous ranked probability score (CRPS) instead of maximum likelihood leads to sharper prediction distributions while maintaining calibration. We introduce the Survival-CRPS, a generalization of the CRPS to the time to event setting, and present right-censored and interval-censored variants. To holistically evaluate the quality of predicted distributions over time to event, we present the scale agnostic Survival-AUPRC evaluation metric, an analog to area under the precision-recall curve. We apply these ideas by building a recurrent neural network for mortality prediction, using an Electronic Health Record dataset covering millions of patients. 
We demonstrate signi\ufb01cant bene\ufb01ts in models trained by the Survival-CRPS objective instead of maximum likelihood.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Anand Avati;Tony Duan;Sharon Zhou;Kenneth Jung;Nigam Shah;Andrew Ng", "authorids": ";;sharonz@cs.stanford.edu;;;", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\navati2019countdown,\ntitle={Countdown Regression: Sharp and Calibrated Survival Predictions},\nauthor={Anand Avati and Tony Duan and Sharon Zhou and Kenneth Jung and Nigam Shah and Andrew Ng},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkx8OiRcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=Bkx8OiRcYX", "pdf_size": 0, "rating": "4;4;4;5", "confidence": "4;4;5;3", "wc_review": "291;202;239;252", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "584;654;426;271", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "rating_avg": [ 4.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "wc_review_avg": [ 246.0, 31.804087787578503 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 483.75, 148.0141462833874 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.816496580927726, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12854697734415524962&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "BkxAUjRqY7", "title": "An Information-Theoretic Metric of Transferability for Task Transfer Learning", "track": "main", "status": "Reject", "tldr": "We present a provable and easily-computable evaluation function that estimates the performance of transferred representations from one learning task to another in task transfer learning.", "abstract": "An important question in task transfer learning is to determine task transferability, i.e. given a common input domain, estimating to what extent representations learned from a source task can help in learning a target task. Typically, transferability is either measured experimentally or inferred through task relatedness, which is often defined without a clear operational meaning. In this paper, we present a novel metric, H-score, an easily-computable evaluation function that estimates the performance of transferred representations from one task to another in classification problems. Inspired by a principled information theoretic approach, H-score has a direct connection to the asymptotic error probability of the decision function based on the transferred feature. This formulation of transferability can further be used to select a suitable set of source tasks in task transfer learning problems or to devise efficient transfer learning policies. Experiments using both synthetic and real image data show that not only our formulation of transferability is meaningful in practice, but also it can generalize to inference problems beyond classification, such as recognition tasks for 3D indoor-scene understanding.", "keywords": "transfer learning;task transfer learning;H-score;transferability", "primary_area": "", "supplementary_material": "", "author": "Yajie Bao;Yang Li;Shao-Lun Huang;Lin Zhang;Amir R. Zamir;Leonidas J. 
Guibas", "authorids": "byjem123@163.com;tori2011@gmail.com;shaolun.huang@sz.tsinghua.edu.cn;linzhang@tsinghua.edu.cn;zamir@cs.stanford.edu;guibas@cs.stanford.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nbao2019an,\ntitle={An Information-Theoretic Metric of Transferability for Task Transfer Learning},\nauthor={Yajie Bao and Yang Li and Shao-Lun Huang and Lin Zhang and Amir R. Zamir and Leonidas J. Guibas},\nyear={2019},\nurl={https://openreview.net/forum?id=BkxAUjRqY7},\n}", "github": "[![github](/images/github_icon.svg) YaojieBao/An-Information-theoretic-Metric-of-Transferability](https://github.com/YaojieBao/An-Information-theoretic-Metric-of-Transferability)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BkxAUjRqY7", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;3", "wc_review": "273;243;180", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "285;296;29", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 232.0, 38.7556447501522 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 203.33333333333334, 123.35405231374534 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7433259101936485977&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BkxSHsC5FQ", "title": "SupportNet: solving catastrophic forgetting in class incremental learning with support data", "track": "main", "status": "Reject", "tldr": "", "abstract": "A plain well-trained deep learning model often does not have the ability to learn new knowledge without forgetting the previously learned knowledge, which is known as catastrophic forgetting. Here we propose a novel method, SupportNet, to efficiently and effectively solve the catastrophic forgetting problem in the class incremental learning scenario. SupportNet combines the strength of deep learning and support vector machine (SVM), where SVM is used to identify the support data from the old data, which are fed to the deep learning model together with the new data for further training so that the model can review the essential information of the old data when learning the new information. Two powerful consolidation regularizers are applied to stabilize the learned representation and ensure the robustness of the learned model. 
We validate our method with comprehensive experiments on various tasks, which show that SupportNet drastically outperforms the state-of-the-art incremental learning methods and even reaches similar performance as the deep learning model trained from scratch on both old and new data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Li;Zhongxiao Li;Lizhong Ding;Yijie Pan;Chao Huang;Yuhui Hu;Wei Chen;Xin Gao", "authorids": "yu.li@kaust.edu.sa;zhongxiao.li@kaust.edu.sa;lizhong.ding@inceptioniai.org;pyj@nbicc.com;chuang@ict.ac.cn;huyh@sustc.edu.cn;chenw@sustc.edu.cn;xin.gao@kaust.edu.sa", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nli2019supportnet,\ntitle={SupportNet: solving catastrophic forgetting in class incremental learning with support data},\nauthor={Yu Li and Zhongxiao Li and Lizhong Ding and Yijie Pan and Chao Huang and Yuhui Hu and Wei Chen and Xin Gao},\nyear={2019},\nurl={https://openreview.net/forum?id=BkxSHsC5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=BkxSHsC5FQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "wc_review": "246;344;733", "wc_reply_reviewers": "95;0;0", "wc_reply_authors": "1707;1442;985", "reply_reviewers": "1;0;0", "reply_authors": "3;2;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 441.0, 210.31563581119372 ], "wc_reply_reviewers_avg": [ 31.666666666666668, 44.78342947514801 ], "wc_reply_authors_avg": [ 1378.0, 298.2090988998603 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17605365223382089863&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Diversity and Depth in Per-Example Routing Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1002", "id": "BkxWJnC9tX", "author_site": "Prajit Ramachandran, Quoc V Le", "tldr": "Per-example routing models benefit from architectural diversity, but still struggle to scale to a large number of routing decisions.", "abstract": "Routing models, a form of conditional computation where examples are routed through a subset of components in a larger network, have shown promising results in recent works. Surprisingly, routing models to date have lacked important properties, such as architectural diversity and large numbers of routing decisions. Both architectural diversity and routing depth can increase the representational power of a routing network. In this work, we address both of these deficiencies. We discuss the significance of architectural diversity in routing models, and explain the tradeoffs between capacity and optimization when increasing routing depth. In our experiments, we find that adding architectural diversity to routing models significantly improves performance, cutting the error rates of a strong baseline by 35% on an Omniglot setup. However, when scaling up routing depth, we find that modern routing techniques struggle with optimization. 
We conclude by discussing both the positive and negative results, and suggest directions for future research.", "keywords": "conditional computation;routing models;depth", "primary_area": "", "supplementary_material": "", "author": "Prajit Ramachandran;Quoc V. Le", "authorids": "prajitram@gmail.com;qvl@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nramachandran2018diversity,\ntitle={Diversity and Depth in Per-Example Routing Models},\nauthor={Prajit Ramachandran and Quoc V. Le},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkxWJnC9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;4;5", "wc_review": "425;235;449", "wc_reply_reviewers": "0;0;144", "wc_reply_authors": "345;163;899", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 369.6666666666667, 95.72646214895626 ], "wc_reply_reviewers_avg": [ 48.0, 67.88225099390856 ], "wc_reply_authors_avg": [ 469.0, 313.0026623954925 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15364434217579695622&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BkxWJnC9tX", "pdf": "https://openreview.net/pdf?id=BkxWJnC9tX", "email": ";", "author_num": 2 }, { "id": "Bkx_Dj09tQ", "title": "Causal importance of orientation selectivity for generalization in image recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "Although both our brain and deep neural networks (DNNs) can perform high-level sensory-perception tasks such as image or speech recognition, the inner mechanism of these hierarchical information-processing systems is poorly understood in both neuroscience and machine learning. Recently, Morcos et al. (2018) examined the effect of class-selective units in DNNs, i.e., units with high-level selectivity, on network generalization, concluding that hidden units that are selectively activated by specific input patterns may harm the network's performance. In this study, we revisit their hypothesis, considering units with selectivity for lower-level features, and argue that selective units are not always harmful to the network performance. Specifically, by using DNNs trained for image classification (7-layer CNNs and VGG16 trained on CIFAR-10 and ImageNet, respectively), we analyzed the orientation selectivity of individual units. Orientation selectivity is a low-level selectivity widely studied in visual neuroscience, in which, when images of bars with several orientations are presented to the eye, many neurons in the visual cortex respond selectively to a specific orientation. We found that orientation-selective units exist in both lower and higher layers of these DNNs, as in our brain. In particular, units in the lower layers become more orientation-selective as the generalization performance improves during the course of training of the DNNs. 
Consistently, networks that generalize better are more orientation-selective in the lower layers. We finally reveal that ablating these selective units in the lower layers substantially degrades the generalization performance, at least by disrupting the shift-invariance of the higher layers. These results suggest to the machine-learning community that, contrary to the triviality of units with high-level selectivity, lower-layer units with selectivity for low-level features can be indispensable for generalization, and for neuroscientists, orientation selectivity can play a causally important role in object recognition.", "keywords": "deep learning;generalization;selectivity;neuroscience", "primary_area": "", "supplementary_material": "", "author": "Jumpei Ukita", "authorids": "i.love.ny517@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nukita2019causal,\ntitle={Causal importance of orientation selectivity for generalization in image recognition},\nauthor={Jumpei Ukita},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkx_Dj09tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Bkx_Dj09tQ", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;2", "wc_review": "375;528;173", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "699;626;274", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 358.6666666666667, 145.38760454576433 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 533.0, 185.54963397071594 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:a67S8Q4p7dkJ:scholar.google.com/&scioq=Causal+importance+of+orientation+selectivity+for+generalization+in+image+recognition&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Selfless Sequential Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/917", "id": "Bkxbrn0cYX", "author_site": "Rahaf Aljundi, Marcus Rohrbach, Tinne Tuytelaars", "tldr": "A regularization strategy for improving the performance of sequential learning", "abstract": "Sequential learning, also called lifelong learning, studies the problem of learning tasks in a sequence with access restricted to only the data of the current task. In this paper we look at a scenario with fixed model capacity, and postulate that the learning process should not be selfish, i.e. it should account for future tasks to be added and thus leave enough capacity for them. To achieve Selfless Sequential Learning we study different regularization strategies and activation functions. We find that\nimposing sparsity at the level of the representation (i.e. neuron activations) is more beneficial for sequential learning than encouraging parameter sparsity. In particular, we propose a novel regularizer, that encourages representation sparsity by means of neural inhibition. It results in few active neurons which in turn leaves more free neurons to be utilized by upcoming tasks. 
As neural inhibition over an entire layer can be too drastic, especially for complex tasks requiring strong representations,\nour regularizer only inhibits other neurons in a local neighbourhood, inspired by lateral inhibition processes in the brain. We combine our novel regularizer with state-of-the-art lifelong learning methods that penalize changes to important previously learned parts of the network. We show that our new regularizer leads to increased sparsity which translates in consistent performance improvement on diverse datasets.", "keywords": "Lifelong learning;Continual Learning;Sequential learning;Regularization", "primary_area": "", "supplementary_material": "", "author": "Rahaf Aljundi;Marcus Rohrbach;Tinne Tuytelaars", "authorids": "rahaf.aljundi@gmail.com;mrf@fb.com;tinne.tuytelaars@esat.kuleuven.be", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\naljundi2018selfless,\ntitle={Selfless Sequential Learning},\nauthor={Rahaf Aljundi and Marcus Rohrbach and Tinne Tuytelaars},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkxbrn0cYX},\n}", "github": "[![github](/images/github_icon.svg) rahafaljundi/Selfless-Sequential-Learning](https://github.com/rahafaljundi/Selfless-Sequential-Learning)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;4;4", "wc_review": "303;492;950", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "323;1208;464", "reply_reviewers": "0;0;0", "reply_authors": "1;4;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 581.6666666666666, 271.63987597961795 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 665.0, 388.2499195106163 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 150, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11518728044683719539&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Bkxbrn0cYX", "pdf": "https://openreview.net/pdf?id=Bkxbrn0cYX", "email": ";;", "author_num": 3 }, { "id": "Bkxdqj0cFQ", "title": "Calibration of neural network logit vectors to combat adversarial attacks", "track": "main", "status": "Reject", "tldr": "This paper uses principles from the field of calibration in machine learning on the logits of a neural network to defend against adversarial attacks", "abstract": "Adversarial examples remain an issue for contemporary neural networks. This paper draws on Background Check (Perello-Nieto et al., 2016), a technique in model calibration, to assist two-class neural networks in detecting adversarial examples, using the one dimensional difference between logit values as the underlying measure. This method interestingly tends to achieve the highest average recall on image sets that are generated with large perturbation vectors, which is unlike the existing literature on adversarial attacks (Cubuk et al., 2017). 
The proposed method does not need knowledge of the attack parameters or methods at training time, unlike a great deal of the literature that uses deep learning based methods to detect adversarial examples, such as Metzen et al. (2017), imbuing the proposed method with additional flexibility.", "keywords": "Adversarial attacks;calibration;probability;adversarial defence", "primary_area": "", "supplementary_material": "", "author": "Oliver Goldstein", "authorids": "og14775@my.bristol.ac.uk", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ngoldstein2019calibration,\ntitle={Calibration of neural network logit vectors to combat adversarial attacks},\nauthor={Oliver Goldstein},\nyear={2019},\nurl={https://openreview.net/forum?id=Bkxdqj0cFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Bkxdqj0cFQ", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;5", "wc_review": "240;468;449", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "25;25;25", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 385.6666666666667, 103.29354072523391 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 25.0, 0.0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sAgZVW0RjQgJ:scholar.google.com/&scioq=Calibration+of+neural+network+logit+vectors+to+combat+adversarial+attacks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BkxgbhCqtQ", "title": "Predictive Uncertainty through Quantization", "track": "main", "status": "Reject", "tldr": "A novel tractable and flexible variational distribution through quantization of latent variables, applied to the deep variational information bottleneck objective for improved uncertainty.", "abstract": "High-risk domains require reliable confidence estimates from predictive models. \nDeep latent variable models provide these, but suffer from the rigid variational distributions used for tractable inference, which err on the side of overconfidence.\nWe propose Stochastic Quantized Activation Distributions (SQUAD), which imposes a flexible yet tractable distribution over discretized latent variables.\nThe proposed method is scalable, self-normalizing and sample efficient. We demonstrate that the model fully utilizes the flexible distribution, learns interesting non-linearities, and provides predictive uncertainty of competitive quality.\n", "keywords": "variational inference;information bottleneck;bayesian deep learning;latent variable models;amortized variational inference;uncertainty;learning non-linearities", "primary_area": "", "supplementary_material": "", "author": "Bastiaan S. Veeling;Rianne van den Berg;Max Welling", "authorids": "basveeling@gmail.com;;", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nveeling2019predictive,\ntitle={Predictive Uncertainty through Quantization},\nauthor={Bastiaan S. 
Veeling and Rianne van den Berg and Max Welling},\nyear={2019},\nurl={https://openreview.net/forum?id=BkxgbhCqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkxgbhCqtQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "wc_review": "360;247;257", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "702;547;621", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 288.0, 51.07510809255979 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 623.3333333333334, 63.299991223450334 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9493003212982218974&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "BkxkH30cFm", "title": "Object-Oriented Model Learning through Multi-Level Abstraction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Object-based approaches for learning action-conditioned dynamics has demonstrated promise for generalization and interpretability. However, existing approaches suffer from structural limitations and optimization difficulties for common environments with multiple dynamic objects. In this paper, we present a novel self-supervised learning framework, called Multi-level Abstraction Object-oriented Predictor (MAOP), for learning object-based dynamics models from raw visual observations. MAOP employs a three-level learning architecture that enables efficient dynamics learning for complex environments with a dynamic background. We also design a spatial-temporal relational reasoning mechanism to support instance-level dynamics learning and handle partial observability. Empirical results show that MAOP significantly outperforms previous methods in terms of sample efficiency and generalization over novel environments that have multiple controllable and uncontrollable dynamic objects and different static object layouts. 
In addition, MAOP learns semantically and visually interpretable disentangled representations.", "keywords": "action-conditioned dynamics learning;deep learning;generalization;interpretability;sample efficiency", "primary_area": "", "supplementary_material": "", "author": "Guangxiang Zhu;Jianhao Wang;ZhiZhou Ren;Chongjie Zhang", "authorids": "guangxiangzhu@outlook.com;jh-wang15@mails.tsinghua.edu.cn;rzz16@mails.tsinghua.edu.cn;chongjie@tsinghua.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhu2019objectoriented,\ntitle={Object-Oriented Model Learning through Multi-Level Abstraction},\nauthor={Guangxiang Zhu and Jianhao Wang and ZhiZhou Ren and Chongjie Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=BkxkH30cFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BkxkH30cFm", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;4;3", "wc_review": "543;1178;1398", "wc_reply_reviewers": "0;383;0", "wc_reply_authors": "1203;816;1414", "reply_reviewers": "0;1;0", "reply_authors": "2;2;2", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 1039.6666666666667, 362.4990421443283 ], "wc_reply_reviewers_avg": [ 127.66666666666667, 180.54793146296512 ], "wc_reply_authors_avg": [ 1144.3333333333333, 247.631895271097 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Opjb2yVA7McJ:scholar.google.com/&scioq=Object-Oriented+Model+Learning+through+Multi-Level+Abstraction&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "M^3RL: Mind-aware Multi-agent Management Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1037", "id": "BkzeUiRcY7", "author_site": "Tianmin Shu, Yuandong Tian", "tldr": "We propose Mind-aware Multi-agent Management Reinforcement Learning (M^3RL) for training a manager to motivate self-interested workers to achieve optimal collaboration by assigning suitable contracts to them.", "abstract": "Most of the prior work on multi-agent reinforcement learning (MARL) achieves optimal collaboration by directly learning a policy for each agent to maximize a common reward. In this paper, we aim to address this from a different angle. In particular, we consider scenarios where there are self-interested agents (i.e., worker agents) which have their own minds (preferences, intentions, skills, etc.) and can not be dictated to perform tasks they do not want to do. For achieving optimal coordination among these agents, we train a super agent (i.e., the manager) to manage them by first inferring their minds based on both current and past observations and then initiating contracts to assign suitable tasks to workers and promise to reward them with corresponding bonuses so that they will agree to work together. The objective of the manager is to maximize the overall productivity as well as minimize payments made to the workers for ad-hoc worker teaming. 
To train the manager, we propose Mind-aware Multi-agent Management Reinforcement Learning (M^3RL), which consists of agent modeling and policy learning. We have evaluated our approach in two environments, Resource Collection and Crafting, to simulate multi-agent management problems with various task settings and multiple designs for the worker agents. The experimental results have validated the effectiveness of our approach in modeling worker agents' minds online, and in achieving optimal ad-hoc teaming with good generalization and fast adaptation.", "keywords": "Multi-agent Reinforcement Learning;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Tianmin Shu;Yuandong Tian", "authorids": "tianmin.shu@ucla.edu;yuandong@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nshu2018mrl,\ntitle={M^3{RL}: Mind-aware Multi-agent Management Reinforcement Learning},\nauthor={Tianmin Shu and Yuandong Tian},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BkzeUiRcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "1;3;4", "wc_review": "513;228;313", "wc_reply_reviewers": "0;0;33", "wc_reply_authors": "648;416;321", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 351.3333333333333, 119.46640624971616 ], "wc_reply_reviewers_avg": [ 11.0, 15.556349186104045 ], "wc_reply_authors_avg": [ 461.6666666666667, 137.34708668997033 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=841802929654227287&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BkzeUiRcY7", "pdf": "https://openreview.net/pdf?id=BkzeUiRcY7", "email": ";", "author_num": 2 }, { "id": "By40DoAqtX", "title": "Learning Discriminators as Energy Networks in Adversarial Learning", "track": "main", "status": "Reject", "tldr": "We propose a novel adversarial learning framework for structured prediction, in which discriminative models can be used to refine structured prediction models at the inference stage. ", "abstract": "We propose a novel adversarial learning framework in this work. Existing adversarial learning methods involve two separate networks, i.e., the structured prediction models and the discriminative models, in the training. The information captured by discriminative models complements that in the structured prediction models, but few existing studies have investigated how to utilize such information to improve structured prediction models at the inference stage. In this work, we propose to refine the predictions of structured prediction models by effectively integrating discriminative models into the prediction. Discriminative models are treated as energy-based models. 
Similar to adversarial learning, discriminative models are trained to estimate scores which measure the quality of predicted outputs, while structured prediction models are trained to predict contrastive outputs with maximal energy scores. In this way, the gradient vanishing problem is ameliorated, and thus we are able to perform inference by following the gradient ascent directions of discriminative models to refine structured prediction models. The proposed method is able to handle a range of tasks, \\emph{e.g.}, multi-label classification and image segmentation. Empirical results on these two tasks validate the effectiveness of our learning method.", "keywords": "adversarial learning;structured prediction;energy networks", "primary_area": "", "supplementary_material": "", "author": "Pingbo Pan;Yan Yan;Tianbao Yang;Yi Yang", "authorids": "pingbo.pan@student.uts.edu.au;yan.yan-3@student.uts.edu.au;tianbao-yang@uiowa.edu;yi.yang@uts.edu.au", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\npan2019learning,\ntitle={Learning Discriminators as Energy Networks in Adversarial Learning},\nauthor={Pingbo Pan and Yan Yan and Tianbao Yang and Yi Yang},\nyear={2019},\nurl={https://openreview.net/forum?id=By40DoAqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=By40DoAqtX", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;5", "wc_review": "417;345;591", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "88;218;531", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 451.0, 103.26664514740469 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 279.0, 185.92650877878245 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16132307574404107322&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "By41BjA9YQ", "title": "Laplacian Smoothing Gradient Descent", "track": "main", "status": "Reject", "tldr": "We propose a simple surrogate for gradient descent to improve training of deep neural nets and other optimization problems.", "abstract": "We propose a class of very simple modifications of gradient descent and stochastic gradient descent. We show that when applied to a large variety of machine learning problems, ranging from softmax regression to deep neural nets, the proposed surrogates can dramatically reduce the variance and improve the generalization accuracy. The methods only involve multiplying the usual (stochastic) gradient by the inverse of a positive definite matrix coming from the discrete Laplacian or its high-order generalizations. The theory of Hamilton-Jacobi partial differential equations demonstrates that the implicit version of the new algorithm is almost the same as doing gradient descent on a new function which (i) has the same global minima as the original function and (ii) is ``more convex\". We show that optimization algorithms with these surrogates converge uniformly in the discrete Sobolev $H_\sigma^p$ sense and reduce the optimality gap for convex optimization problems. 
We implement our algorithm in both the PyTorch and TensorFlow platforms, which only involves changing a few lines of code. The code will be available on GitHub.", "keywords": "Laplacian Smoothing;Nonconvex Optimization;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Stanley J. Osher;Bao Wang;Penghang Yin;Xiyang Luo;Minh Pham;Alex T. Lin", "authorids": "sjo@math.ucla.edu;wangbaonj@gmail.com;yph@g.ucla.edu;xylmath@gmail.com;minhrose@ucla.edu;atlin@math.ucla.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nosher2019laplacian,\ntitle={Laplacian Smoothing Gradient Descent},\nauthor={Stanley J. Osher and Bao Wang and Penghang Yin and Xiyang Luo and Minh Pham and Alex T. Lin},\nyear={2019},\nurl={https://openreview.net/forum?id=By41BjA9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=By41BjA9YQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "wc_review": "455;370;777", "wc_reply_reviewers": "0;0;166", "wc_reply_authors": "776;622;782", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 534.0, 175.29594024582164 ], "wc_reply_reviewers_avg": [ 55.333333333333336, 78.25315045131126 ], "wc_reply_authors_avg": [ 726.6666666666666, 74.05103345366327 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11069536981587879245&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "ByEtPiAcY7", "title": "Characterizing the Accuracy/Complexity Landscape of Explanations of Deep Networks through Knowledge Extraction", "track": "main", "status": "Reject", "tldr": "Systematically examines how well we can explain the hidden features of a deep network in terms of logical rules.", "abstract": "Knowledge extraction techniques are used to convert neural networks into symbolic descriptions with the objective of producing more comprehensible learning models. The central challenge is to find an explanation which is more comprehensible than the original model while still representing that model faithfully. The distributed nature of deep networks has led many to believe that the hidden features of a neural network cannot be explained by logical descriptions simple enough to be understood by humans, and that decompositional knowledge extraction should be abandoned in favour of other methods. In this paper we examine this question systematically by proposing a knowledge extraction method using \\textit{M-of-N} rules which allows us to map the complexity/accuracy landscape of rules describing hidden features in a Convolutional Neural Network (CNN). Experiments reported in this paper show that the shape of this landscape reveals an optimal trade-off between comprehensibility and accuracy, showing that each latent variable has an optimal \\textit{M-of-N} rule to describe its behaviour. 
We find that the rules with optimal tradeoff in the first and final layer have a high degree of explainability whereas the rules with the optimal tradeoff in the second and third layer are less explainable. The results shed light on the feasibility of rule extraction from deep networks, and point to the value of decompositional knowledge extraction as a method of explainability.", "keywords": "Deep Networks;Explainability;Knowledge Extraction", "primary_area": "", "supplementary_material": "", "author": "Simon Odense;Artur d'Avila Garcez", "authorids": "simon.odense@city.ac.uk;a.garcez@city.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nodense2019characterizing,\ntitle={Characterizing the Accuracy/Complexity Landscape of Explanations of Deep Networks through Knowledge Extraction},\nauthor={Simon Odense and Artur d'Avila Garcez},\nyear={2019},\nurl={https://openreview.net/forum?id=ByEtPiAcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=ByEtPiAcY7", "pdf_size": 0, "rating": "4;4;4;5", "confidence": "4;5;2;3", "wc_review": "345;421;207;157", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "203;401;147;159", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "rating_avg": [ 4.25, 0.4330127018922193 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "wc_review_avg": [ 282.5, 105.52132485900658 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 227.5, 102.31690964840563 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.2581988897471611, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PeUwCwA5uy4J:scholar.google.com/&scioq=Characterizing+the+Accuracy/Complexity+Landscape+of+Explanations+of+Deep+Networks+through+Knowledge+Extraction&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "ByGOuo0cYm", "title": "Meta-Learning with Domain Adaptation for Few-Shot Learning under Domain Shift", "track": "main", "status": "Reject", "tldr": "Meta Learning for Few Shot learning assumes that training tasks and test tasks are drawn from the same distribution. What do you do if they are not? Meta Learning with task-level Domain Adaptation!", "abstract": "Few-Shot Learning (learning with limited labeled data) aims to overcome the limitations of traditional machine learning approaches which require thousands of labeled examples to train an effective model. Considered as a hallmark of human intelligence, the community has recently witnessed several contributions on this topic, in particular through meta-learning, where a model learns how to learn an effective model for few-shot learning. The main idea is to acquire prior knowledge from a set of training tasks, which is then used to perform (few-shot) test tasks. Most existing work assumes that both training and test tasks are drawn from the same distribution, and a large amount of labeled data is available in the training tasks. This is a very strong assumption which restricts the usage of meta-learning strategies in the real world where ample training tasks following the same distribution as test tasks may not be available. 
In this paper, we propose a novel meta-learning paradigm wherein a few-shot learning model is learnt, which simultaneously overcomes domain shift between the training and test tasks via adversarial domain adaptation. We demonstrate the efficacy of the proposed method through extensive experiments.", "keywords": "Meta-Learning;Few-Shot Learning;Domain Adaptation", "primary_area": "", "supplementary_material": "", "author": "Doyen Sahoo;Hung Le;Chenghao Liu;Steven C. H. Hoi", "authorids": "doyens@smu.edu.sg;hungle.2018@phdis.smu.edu.sg;chliu@smu.edu.sg;chhoi@smu.edu.sg", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsahoo2019metalearning,\ntitle={Meta-Learning with Domain Adaptation for Few-Shot Learning under Domain Shift},\nauthor={Doyen Sahoo and Hung Le and Chenghao Liu and Steven C. H. Hoi},\nyear={2019},\nurl={https://openreview.net/forum?id=ByGOuo0cYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByGOuo0cYm", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;3;3", "wc_review": "701;592;252", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "943;321;392", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 515.0, 191.21889725303475 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 552.0, 277.99400473151695 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14402130912100420617&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ByGUFsAqYm", "title": "Downsampling leads to Image Memorization in Convolutional Autoencoders", "track": "main", "status": "Reject", "tldr": "We identify downsampling as a mechanism for memorization in convolutional autoencoders.", "abstract": "Memorization of data in deep neural networks has become a subject of significant research interest. \nIn this paper, we link memorization of images in deep convolutional autoencoders to downsampling through strided convolution. To analyze this mechanism in a simpler setting, we train linear convolutional autoencoders and show that linear combinations of training data are stored as eigenvectors in the linear operator corresponding to the network when downsampling is used. On the other hand, networks without downsampling do not memorize training data. We provide further evidence that the same effect happens in nonlinear networks. Moreover, downsampling in nonlinear networks causes the model to memorize not only linear combinations of images, but also individual training images. Since convolutional autoencoder components are building blocks of deep convolutional networks, we envision that our findings will shed light on the important phenomenon of memorization in over-parameterized deep networks. 
\n", "keywords": "Memorization in Deep Learning;Convolutional Autoencoders", "primary_area": "", "supplementary_material": "", "author": "Adityanarayanan Radhakrishnan;Caroline Uhler;Mikhail Belkin", "authorids": "aradha@mit.edu;cuhler@mit.edu;mbelkin@cse.ohio-state.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nradhakrishnan2019downsampling,\ntitle={Downsampling leads to Image Memorization in Convolutional Autoencoders},\nauthor={Adityanarayanan Radhakrishnan and Caroline Uhler and Mikhail Belkin},\nyear={2019},\nurl={https://openreview.net/forum?id=ByGUFsAqYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByGUFsAqYm", "pdf_size": 0, "rating": "3;5;5", "confidence": "3;2;2", "wc_review": "273;251;467", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "426;282;576", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 330.3333333333333, 97.05439482865037 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 428.0, 120.03332870498926 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15061698911385786172&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ByGVui0ctm", "title": "Three continual learning scenarios and a case for generative replay", "track": "main", "status": "Reject", "tldr": "A newly introduced structured comparison of recent methods for continual learning that turns into an argument for and extension of generative replay.", "abstract": "Standard artificial neural networks suffer from the well-known issue of catastrophic forgetting, making continual or lifelong learning problematic. Recently, numerous methods have been proposed for continual learning, but due to differences in evaluation protocols it is difficult to directly compare their performance. To enable more meaningful comparisons, we identified three distinct continual learning scenarios based on whether task identity is known and, if it is not, whether it needs to be inferred. Performing the split and permuted MNIST task protocols according to each of these scenarios, we found that regularization-based approaches (e.g., elastic weight consolidation) failed when task identity needed to be inferred. In contrast, generative replay combined with distillation (i.e., using class probabilities as \u201csoft targets\u201d) achieved superior performance in all three scenarios. In addition, we reduced the computational cost of generative replay by integrating the generative model into the main model.", "keywords": "continual learning;generative models;replay;distillation;variational autoencoder", "primary_area": "", "supplementary_material": "", "author": "Gido M. van de Ven;Andreas S. 
Tolias", "authorids": "gidovandeven@gmail.com;astolias@bcm.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nven2019three,\ntitle={Three continual learning scenarios and a case for generative replay},\nauthor={Gido M. van de Ven and Andreas S. Tolias},\nyear={2019},\nurl={https://openreview.net/forum?id=ByGVui0ctm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByGVui0ctm", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;5", "wc_review": "353;175;422", "wc_reply_reviewers": "0;0;41", "wc_reply_authors": "337;230;568", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 316.6666666666667, 104.05874409304688 ], "wc_reply_reviewers_avg": [ 13.666666666666666, 19.3275853524323 ], "wc_reply_authors_avg": [ 378.3333333333333, 141.04924278027477 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15715207111084256325&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ByG_3s09KX", "title": "Dopamine: A Research Framework for Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "In this paper we introduce Dopamine, a new research framework for deep RL that is open-source, TensorFlow-based, and provides compact yet reliable implementations of some state-of-the-art deep RL agents.", "abstract": "Deep reinforcement learning (deep RL) research has grown significantly in recent years. A number of software offerings now exist that provide stable, comprehensive implementations for benchmarking. At the same time, recent deep RL research\nhas become more diverse in its goals. In this paper we introduce Dopamine, a new research framework for deep RL that aims to support some of that diversity. Dopamine is open-source, TensorFlow-based, and provides compact yet reliable\nimplementations of some state-of-the-art deep RL agents. We complement this offering with a taxonomy of the different research objectives in deep RL research. While by no means exhaustive, our analysis highlights the heterogeneity of research\nin the field, and the value of frameworks such as ours.", "keywords": "reinforcement learning;software;framework;reproducibility", "primary_area": "", "supplementary_material": "", "author": "Pablo Samuel Castro;Subhodeep Moitra;Carles Gelada;Saurabh Kumar;Marc G. Bellemare", "authorids": "psc@google.com;smoitra@google.com;cgel@google.com;kumasaurabh@google.com;bellemare@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ncastro2019dopamine,\ntitle={Dopamine: A Research Framework for Deep Reinforcement Learning},\nauthor={Pablo Samuel Castro and Subhodeep Moitra and Carles Gelada and Saurabh Kumar and Marc G. 
Bellemare},\nyear={2019},\nurl={https://openreview.net/forum?id=ByG_3s09KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByG_3s09KX", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;2;4", "wc_review": "279;448;203", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 310.0, 102.39466131916579 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 308, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4057790091359596156&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "ByGq7hRqKX", "title": "Cross-Task Knowledge Transfer for Visually-Grounded Navigation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent efforts on training visual navigation agents conditioned on language using deep reinforcement learning have been successful in learning policies for two different tasks: learning to follow navigational instructions and embodied question answering. In this paper, we aim to learn a multitask model capable of jointly learning both tasks, and transferring knowledge of words and their grounding in visual objects across tasks. The proposed model uses a novel Dual-Attention unit to disentangle the knowledge of words in the textual representations and visual objects in the visual representations, and align them with each other. This disentangled task-invariant alignment of representations facilitates grounding and knowledge transfer across both tasks. We show that the proposed model outperforms a range of baselines on both tasks in simulated 3D environments. 
We also show that this disentanglement of representations makes our model modular, interpretable, and allows for zero-shot transfer to instructions containing new words by leveraging object detectors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Devendra Singh Chaplot;Lisa Lee;Ruslan Salakhutdinov;Devi Parikh;Dhruv Batra", "authorids": "chaplot@cs.cmu.edu;lslee@cs.cmu.edu;rsalakhu@cs.cmu.edu;parikh@gatech.edu;dbatra@gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nchaplot2019crosstask,\ntitle={Cross-Task Knowledge Transfer for Visually-Grounded Navigation},\nauthor={Devendra Singh Chaplot and Lisa Lee and Ruslan Salakhutdinov and Devi Parikh and Dhruv Batra},\nyear={2019},\nurl={https://openreview.net/forum?id=ByGq7hRqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByGq7hRqKX", "pdf_size": 0, "rating": "5;5;7", "confidence": "5;3;4", "wc_review": "225;475;220", "wc_reply_reviewers": "0;254;0", "wc_reply_authors": "362;797;235", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 306.6666666666667, 119.04714285619048 ], "wc_reply_reviewers_avg": [ 84.66666666666667, 119.73674828092204 ], "wc_reply_authors_avg": [ 464.6666666666667, 240.6468135855717 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mg0E7XFoRiUJ:scholar.google.com/&scioq=Cross-Task+Knowledge+Transfer+for+Visually-Grounded+Navigation&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "The Deep Weight Prior", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/889", "id": "ByGuynAct7", "author_site": "Andrei Atanov, Arsenii Ashukha, Kirill Struminsky, Dmitry P. Vetrov, Max Welling", "tldr": "The generative model for kernels of convolutional neural networks, that acts as a prior distribution while training on new datasets.", "abstract": "Bayesian inference is known to provide a general framework for incorporating prior knowledge or specific properties into machine learning models via carefully choosing a prior distribution. In this work, we propose a new type of prior distributions for convolutional neural networks, deep weight prior (DWP), that exploit generative models to encourage a specific structure of trained convolutional filters e.g., spatial correlations of weights. We define DWP in the form of an implicit distribution and propose a method for variational inference with such type of implicit priors. 
In experiments, we show that DWP improves the performance of Bayesian neural networks when training data are limited, and initialization of weights with samples from DWP accelerates training of conventional convolutional neural networks.\n", "keywords": "deep learning;variational inference;prior distributions", "primary_area": "", "supplementary_material": "", "author": "Andrei Atanov;Arsenii Ashukha;Kirill Struminsky;Dmitriy Vetrov;Max Welling", "authorids": "andrewatanov@yandex.ru;ars.ashuha@gmail.com;k.struminsky@gmail.com;vetrovd@yandex.ru;m.welling@uva.nl", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\natanov2018the,\ntitle={The Deep Weight Prior},\nauthor={Andrei Atanov and Arsenii Ashukha and Kirill Struminsky and Dmitriy Vetrov and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByGuynAct7},\n}", "github": "[![github](/images/github_icon.svg) bayesgroup/deep-weight-prior](https://github.com/bayesgroup/deep-weight-prior) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=ByGuynAct7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;3;4", "wc_review": "245;326;412", "wc_reply_reviewers": "86;0;16", "wc_reply_authors": "384;389;198", "reply_reviewers": "2;0;1", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 327.6666666666667, 68.18764958227814 ], "wc_reply_reviewers_avg": [ 34.0, 37.345236197762446 ], "wc_reply_authors_avg": [ 323.6666666666667, 88.88319426203259 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15422497541572460475&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=ByGuynAct7", "pdf": "https://openreview.net/pdf?id=ByGuynAct7", "email": ";;;;", "author_num": 5 }, { "title": "Efficient Multi-Objective Neural Architecture Search via Lamarckian Evolution", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1090", "id": "ByME42AqK7", "author_site": "Thomas Elsken, Jan Hendrik Metzen, Frank Hutter", "tldr": "We propose a method for efficient Multi-Objective Neural Architecture Search based on Lamarckian inheritance and evolutionary algorithms.", "abstract": "Architecture search aims at automatically finding neural architectures that are competitive with architectures designed by human experts. While recent approaches have achieved state-of-the-art predictive performance for image recognition, they are problematic under resource constraints for two reasons: (1) the neural architectures found are solely optimized for high predictive performance, without penalizing excessive resource consumption; (2)most architecture search methods require vast computational resources. 
We address the first shortcoming by proposing LEMONADE, an evolutionary algorithm for multi-objective architecture search that allows approximating the Pareto-front of architectures under multiple objectives, such as predictive performance and number of parameters, in a single run of the method. We address the second shortcoming by proposing a Lamarckian inheritance mechanism for LEMONADE which generates children networks that are warmstarted with the predictive performance of their trained parents. This is accomplished by using (approximate) network morphism operators for generating children. The combination of these two contributions allows finding models that are on par or even outperform different-sized NASNets, MobileNets, MobileNets V2 and Wide Residual Networks on CIFAR-10 and ImageNet64x64 within only one week on eight GPUs, which is about 20-40x less compute power than previous architecture search methods that yield state-of-the-art performance.", "keywords": "Neural Architecture Search;AutoML;AutoDL;Deep Learning;Evolutionary Algorithms;Multi-Objective Optimization", "primary_area": "", "supplementary_material": "", "author": "Thomas Elsken;Jan Hendrik Metzen;Frank Hutter", "authorids": "thomas.elsken@de.bosch.com;janhendrik.metzen@de.bosch.com;fh@cs.uni-freiburg.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nelsken2018efficient,\ntitle={Efficient Multi-Objective Neural Architecture Search via Lamarckian Evolution},\nauthor={Thomas Elsken and Jan Hendrik Metzen and Frank Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByME42AqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;3", "wc_review": "255;463;290", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "542;714;456", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 336.0, 90.93220918171221 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 570.6666666666666, 107.26084508751966 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 694, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7285962348545895359&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ByME42AqK7", "pdf": "https://openreview.net/pdf?id=ByME42AqK7", "email": ";;", "author_num": 3 }, { "title": "Quaternion Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/714", "id": "ByMHvs0cFQ", "author_site": "Titouan Parcollet, Mirco Ravanellu, Mohamed Morchid, Georges Linar\u00e8s, Chiheb Trabelsi, Renato De Mori, Yoshua Bengio", "tldr": "", "abstract": "Recurrent neural networks (RNNs) are powerful architectures to model sequential data, due to their capability to learn short and long-term dependencies between the basic elements of a sequence. Nonetheless, popular tasks such as speech or images recognition, involve multi-dimensional input features that are characterized by strong internal dependencies between the dimensions of the input vector. 
We propose a novel quaternion recurrent neural network (QRNN), alongside with a quaternion long-short term memory neural network (QLSTM), that take into account both the external relations and these internal structural dependencies with the quaternion algebra. Similarly to capsules, quaternions allow the QRNN to code internal dependencies by composing and processing multidimensional features as single entities, while the recurrent operation reveals correlations between the elements composing the sequence. We show that both QRNN and QLSTM achieve better performances than RNN and LSTM in a realistic application of automatic speech recognition. Finally, we show that QRNN and QLSTM reduce by a maximum factor of 3.3x the number of free parameters needed, compared to real-valued RNNs and LSTMs to reach better results, leading to a more compact representation of the relevant information.", "keywords": "Quaternion recurrent neural networks;quaternion numbers;recurrent neural networks;speech recognition", "primary_area": "", "supplementary_material": "", "author": "Titouan Parcollet;Mirco Ravanelli;Mohamed Morchid;Georges Linar\u00e8s;Chiheb Trabelsi;Renato De Mori;Yoshua Bengio", "authorids": "titouan.parcollet@alumni.univ-avignon.fr;mirco.ravanelli@gmail.com;mohamed.morchid@univ-avignon.fr;georges.linares@univ-avignon.fr;chiheb.trabelsi@polymtl.ca;rdemori@cs.mcgill.ca;yoshua.bengio@mila.quebec", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nparcollet2018quaternion,\ntitle={Quaternion Recurrent Neural Networks},\nauthor={Titouan Parcollet and Mirco Ravanelli and Mohamed Morchid and Georges Linar\u00e8s and Chiheb Trabelsi and Renato De Mori and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByMHvs0cFQ},\n}", "github": "[![github](/images/github_icon.svg) mravanelli/pytorch-kaldi](https://github.com/mravanelli/pytorch-kaldi) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=ByMHvs0cFQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;5;4", "wc_review": "564;330;127", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "835;860;114", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 340.3333333333333, 178.55406899747638 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 603.0, 345.9258109286826 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 183, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13606271573268587298&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=ByMHvs0cFQ", "pdf": "https://openreview.net/pdf?id=ByMHvs0cFQ", "email": ";;;;;;", "author_num": 7 }, { "title": "Adversarial Audio Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/892", "id": "ByMVTsR5KQ", "author_site": "Chris Donahue, Julian McAuley, Miller Puckette", "tldr": "Learning to 
synthesize raw waveform audio with GANs", "abstract": "Audio signals are sampled at high temporal resolutions, and learning to synthesize audio requires capturing structure across a range of timescales. Generative adversarial networks (GANs) have seen wide success at generating images that are both locally and globally coherent, but they have seen little application to audio generation. In this paper we introduce WaveGAN, a first attempt at applying GANs to unsupervised synthesis of raw-waveform audio. WaveGAN is capable of synthesizing one second slices of audio waveforms with global coherence, suitable for sound effect generation. Our experiments demonstrate that\u2014without labels\u2014WaveGAN learns to produce intelligible words when trained on a small-vocabulary speech dataset, and can also synthesize audio from other domains such as drums, bird vocalizations, and piano. We compare WaveGAN to a method which applies GANs designed for image generation on image-like audio feature representations, finding both approaches to be promising.", "keywords": "audio;waveform;spectrogram;GAN;adversarial;WaveGAN;SpecGAN", "primary_area": "", "supplementary_material": "", "author": "Chris Donahue;Julian McAuley;Miller Puckette", "authorids": "cdonahue@ucsd.edu;jmcauley@eng.ucsd.edu;msp@ucsd.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ndonahue2018adversarial,\ntitle={Adversarial Audio Synthesis},\nauthor={Chris Donahue and Julian McAuley and Miller Puckette},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByMVTsR5KQ},\n}", "github": "[![github](/images/github_icon.svg) chrisdonahue/wavegan](https://github.com/chrisdonahue/wavegan) + [![Papers with Code](/images/pwc_icon.svg) 20 community implementations](https://paperswithcode.com/paper/?openreview=ByMVTsR5KQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;3", "wc_review": "39;445;35", "wc_reply_reviewers": "477;0;0", "wc_reply_authors": "1243;771;191", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 173.0, 192.33997677723335 ], "wc_reply_reviewers_avg": [ 159.0, 224.8599564173221 ], "wc_reply_authors_avg": [ 735.0, 430.2309457334126 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 917, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5918610073287101746&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ByMVTsR5KQ", "pdf": "https://openreview.net/pdf?id=ByMVTsR5KQ", "email": ";;", "author_num": 3 }, { "id": "ByN7Yo05YX", "title": "Adaptive Neural Trees", "track": "main", "status": "Reject", "tldr": "We propose a framework to combine decision trees and neural networks, and show on image classification tasks that it enjoys the complementary benefits of the two approaches, while addressing the limitations of prior work.", "abstract": "Deep neural networks and decision trees operate on largely separate paradigms; typically, the former 
performs representation learning with pre-specified architectures, while the latter is characterised by learning hierarchies over pre-specified features with data-driven architectures. We unite the two via adaptive neural trees (ANTs), a model that incorporates representation learning into edges, routing functions and leaf nodes of a decision tree, along with a backpropagation-based training algorithm that adaptively grows the architecture from primitive modules (e.g., convolutional layers). ANTs allow increased interpretability via hierarchical clustering, e.g., learning meaningful class associations, such as separating natural vs. man-made objects. We demonstrate this on classification and regression tasks, achieving over 99% and 90% accuracy on the MNIST and CIFAR-10 datasets, and outperforming standard neural networks, random forests and gradient boosted trees on the SARCOS dataset. Furthermore, ANT optimisation naturally adapts the architecture to the size and complexity of the training data.", "keywords": "neural networks;decision trees;computer vision", "primary_area": "", "supplementary_material": "", "author": "Ryutaro Tanno;Kai Arulkumaran;Daniel C. Alexander;Antonio Criminisi;Aditya Nori", "authorids": "ryutaro.tanno.15@ucl.ac.uk;kailash.arulkumaran13@imperial.ac.uk;d.alexander@ucl.ac.uk;antcrim@microsoft.com;adityan@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntanno2019adaptive,\ntitle={Adaptive Neural Trees},\nauthor={Ryutaro Tanno and Kai Arulkumaran and Daniel C. Alexander and Antonio Criminisi and Aditya Nori},\nyear={2019},\nurl={https://openreview.net/forum?id=ByN7Yo05YX},\n}", "github": "[![github](/images/github_icon.svg) rtanno21609/AdaptiveNeuralTrees](https://github.com/rtanno21609/AdaptiveNeuralTrees)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ByN7Yo05YX", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;4", "wc_review": "147;135;498", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "618;135;1136", "reply_reviewers": "0;0;0", "reply_authors": "2;2;3", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 260.0, 168.36270370839262 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 629.6666666666666, 408.7397976980248 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 221, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10252139245277017232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13 }, { "id": "ByWMz305FQ", "title": "The Missing Ingredient in Zero-Shot Neural Machine Translation", "track": "main", "status": "Withdraw", "tldr": "Simple similarity constraints on top of multilingual NMT enables high quality translation between unseen language pairs for the first time.", "abstract": "Multilingual Neural Machine Translation (NMT) systems are capable of translating between multiple source and target languages within a single system. An important indicator of generalization within these systems is the quality of zero-shot translation - translating between language pairs that the system has never seen during training. 
However, until now, the zero-shot performance of multilingual models has lagged far behind the quality that can be achieved by using a two step translation process that pivots through an intermediate language (usually English). In this work, we diagnose why multilingual models under-perform in zero shot settings. We propose explicit language invariance losses that guide an NMT encoder towards learning language agnostic representations. Our proposed strategies significantly improve zero-shot translation performance on WMT English-French-German and on the IWSLT 2017 shared task, and for the first time, match the performance of pivoting approaches while maintaining performance on supervised directions.", "keywords": "Machine Translation;Multi-lingual processing;Zero-Shot translation", "primary_area": "", "supplementary_material": "", "author": "Naveen Arivazhagan;Ankur Bapna;Orhan Firat;Roee Aharoni;Melvin Johnson;Wolfgang Macherey", "authorids": "naveenariva@gmail.com;ankurbpn@google.com;orhanf@google.com;roee.aharoni@gmail.com;melvinp@google.com;wmach@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ByWMz305FQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;3;5", "wc_review": "594;583;206", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "508;486;375", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 461.0, 180.36814205027082 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 456.3333333333333, 58.20843772360002 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.8660254037844387, "gs_citation": 121, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12714943355185138183&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "Bye5OiR5F7", "title": "Wasserstein proximal of GANs", "track": "main", "status": "Reject", "tldr": "We propose the Wasserstein proximal method for training GANs. ", "abstract": "We introduce a new method for training GANs by applying the Wasserstein-2 metric proximal on the generators. \nThe approach is based on the gradient operator induced by optimal transport, which connects the geometry of sample space and parameter space in implicit deep generative models. From this theory, we obtain an easy-to-implement regularizer for the parameter updates. Our experiments demonstrate that this method improves the speed and stability in training GANs in terms of wall-clock time and Fr\\'echet Inception Distance (FID) learning curves. 
", "keywords": "Optimal transport;Wasserstein gradient;Generative adversarial network;Unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Alex Tong Lin;Wuchen Li;Stanley Osher;Guido Montufar", "authorids": "atlin@math.ucla.edu;wcli@math.ucla.edu;sjo@math.ucla.edu;montufar@math.ucla.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlin2019wasserstein,\ntitle={Wasserstein proximal of {GAN}s},\nauthor={Alex Tong Lin and Wuchen Li and Stanley Osher and Guido Montufar},\nyear={2019},\nurl={https://openreview.net/forum?id=Bye5OiR5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Bye5OiR5F7", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;3;3", "wc_review": "507;260;592", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1168;638;672", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 453.0, 140.81429851640303 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 826.0, 242.22854222132176 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.7559289460184546, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3911333707513070155&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "title": "Preconditioner on Matrix Lie Group for SGD", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/762", "id": "Bye5SiAqKX", "author_site": "XI-LIN LI", "tldr": "We propose a new framework for preconditioner learning, derive new forms of preconditioners and learning methods, and reveal the relationship to methods like RMSProp, Adam, Adagrad, ESGD, KFAC, batch normalization, etc.", "abstract": "We study two types of preconditioners and preconditioned stochastic gradient descent (SGD) methods in a unified framework. We call the first one the Newton type due to its close relationship to the Newton method, and the second one the Fisher type as its preconditioner is closely related to the inverse of Fisher information matrix. Both preconditioners can be derived from one framework, and efficiently estimated on any matrix Lie groups designated by the user using natural or relative gradient descent minimizing certain preconditioner estimation criteria. Many existing preconditioners and methods, e.g., RMSProp, Adam, KFAC, equilibrated SGD, batch normalization, etc., are special cases of or closely related to either the Newton type or the Fisher type ones. 
Experimental results on relatively large scale machine learning problems are reported for performance study.", "keywords": "preconditioner;stochastic gradient descent;Newton method;Fisher information;natural gradient;Lie group", "primary_area": "", "supplementary_material": "", "author": "Xi-Lin Li", "authorids": "lixilinx@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nli2018learning,\ntitle={Learning Preconditioner on Matrix Lie Group},\nauthor={Xi-Lin Li},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bye5SiAqKX},\n}", "github": "[![github](/images/github_icon.svg) lixilinx/psgd_torch](https://github.com/lixilinx/psgd_torch) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Bye5SiAqKX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;7;8", "confidence": "5;3;5", "wc_review": "912;81;368", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "729;52;782", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 453.6666666666667, 344.6199194603946 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 521.0, 332.33818117493917 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=631643897928454629&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Bye5SiAqKX", "pdf": "https://openreview.net/pdf?id=Bye5SiAqKX", "email": "", "author_num": 1 }, { "id": "Bye9LiR9YX", "title": "Remember and Forget for Experience Replay", "track": "main", "status": "Reject", "tldr": "ReF-ER is an Experience Replay algorithm to regulate the pace at which the control policy is allowed to deviate from past behaviors; it is shown to enhance the stability and performance of off-policy RL methods.", "abstract": "Experience replay (ER) is crucial for attaining high data-efficiency in off-policy deep reinforcement learning (RL). ER entails the recall of experiences obtained in past iterations to compute gradient estimates for the current policy. However, the accuracy of such updates may deteriorate when the policy diverges from past behaviors, possibly undermining the effectiveness of ER. Previous off-policy RL algorithms mitigated this issue by tuning their hyper-parameters in order to abate policy changes. We propose ReF-ER, a method for active management of experiences in the Replay Memory (RM). ReF-ER forgets experiences that would be too unlikely with the current policy and constrains policy changes within a trust region of the behaviors in the RM. We couple ReF-ER with Q-learning, deterministic policy gradient and off-policy gradient methods to show that ReF-ER reliably improves the performance of continuous-action off-policy RL. We complement ReF-ER with a novel off-policy actor-critic algorithm (RACER) for continuous-action control. 
RACER employs a computationally efficient closed-form approximation of the action values and is shown to be highly competitive with state-of-the-art algorithms on benchmark problems, while being robust to large hyper-parameter variations.", "keywords": "reinforcement learning;experience replay;policy gradients", "primary_area": "", "supplementary_material": "", "author": "Guido Novati;Petros Koumoutsakos", "authorids": "novatig@ethz.ch;petros@ethz.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnovati2019remember,\ntitle={Remember and Forget for Experience Replay},\nauthor={Guido Novati and Petros Koumoutsakos},\nyear={2019},\nurl={https://openreview.net/forum?id=Bye9LiR9YX},\n}", "github": "[![github](/images/github_icon.svg) cselab/smarties](https://github.com/cselab/smarties) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Bye9LiR9YX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=Bye9LiR9YX", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;3", "wc_review": "260;806;917", "wc_reply_reviewers": "44;92;12", "wc_reply_authors": "701;736;830", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 661.0, 287.14804543997855 ], "wc_reply_reviewers_avg": [ 49.333333333333336, 32.87687468250121 ], "wc_reply_authors_avg": [ 755.6666666666666, 54.46915538989832 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 129, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13050806613216384530&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "ByeDojRcYQ", "title": "COLLABORATIVE MULTIAGENT REINFORCEMENT LEARNING IN HOMOGENEOUS SWARMS", "track": "main", "status": "Reject", "tldr": "Novel policy gradient for multiagent systems via distributed learning. ", "abstract": "A deep reinforcement learning solution is developed for a collaborative multiagent system. Individual agents choose actions in response to the state of the environment, their own state, and possibly partial information about the state of other agents. Actions are chosen to maximize a collaborative long term discounted reward that encompasses the individual rewards collected by each agent. The paper focuses on developing a scalable approach that applies to large swarms of homogeneous agents. This is accomplished by forcing the policies of all agents to be the same resulting in a constrained formulation in which the experiences of each agent inform the learning process of the whole team, thereby enhancing the sample efficiency of the learning process. A projected coordinate policy gradient descent algorithm is derived to solve the constrained reinforcement learning problem. 
Experimental evaluations in collaborative navigation, a multi-predator-multi-prey game, and a multiagent survival game show marked improvements relative to methods that do not exploit the policy equivalence that naturally arises in homogeneous swarms.", "keywords": "Reinforcement Learning;Multi Agent;policy gradient", "primary_area": "", "supplementary_material": "", "author": "Arbaaz Khan;Clark Zhang;Vijay Kumar;Alejandro Ribeiro", "authorids": "arbaazk@seas.upenn.edu;;vijay.kumar@seas.upenn.edu;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkhan2019collaborative,\ntitle={{COLLABORATIVE} {MULTIAGENT} {REINFORCEMENT} {LEARNING} {IN} {HOMOGENEOUS} {SWARMS}},\nauthor={Arbaaz Khan and Clark Zhang and Vijay Kumar and Alejandro Ribeiro},\nyear={2019},\nurl={https://openreview.net/forum?id=ByeDojRcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ByeDojRcYQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "wc_review": "488;516;398", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 467.3333333333333, 50.34105900974097 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10168526232775771312&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ByeLBj0qFQ", "title": "Unsupervised Image to Sequence Translation with Canvas-Drawer Networks", "track": "main", "status": "Reject", "tldr": "Recreate images as interpretable high-level sequences without the need for paired data.", "abstract": "Encoding images as a series of high-level constructs, such as brush strokes or discrete shapes, can often be key to both human and machine understanding. In many cases, however, data is only available in pixel form. We present a method for generating images directly in a high-level domain (e.g. brush strokes), without the need for real pairwise data. Specifically, we train a \u201dcanvas\u201d network to imitate the mapping of high-level constructs to pixels, followed by a high-level \u201ddrawing\u201d network which is optimized through this mapping towards solving a desired image recreation or translation task. We successfully discover sequential vector representations of symbols, large sketches, and 3D objects, utilizing only pixel data. 
We display applications of our method in image segmentation, and present several ablation studies comparing various configurations.", "keywords": "image;translation;unsupervised;model-based", "primary_area": "", "supplementary_material": "", "author": "Kevin Frans;Chin-Yi Cheng", "authorids": "kevinfrans2@gmail.com;chin-yi.cheng@autodesk.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nfrans2019unsupervised,\ntitle={Unsupervised Image to Sequence Translation with Canvas-Drawer Networks},\nauthor={Kevin Frans and Chin-Yi Cheng},\nyear={2019},\nurl={https://openreview.net/forum?id=ByeLBj0qFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByeLBj0qFQ", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;5", "wc_review": "470;686;179", "wc_reply_reviewers": "56;132;0", "wc_reply_authors": "284;786;202", "reply_reviewers": "1;1;0", "reply_authors": "1;2;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 445.0, 207.73540863319377 ], "wc_reply_reviewers_avg": [ 62.666666666666664, 54.09456740026879 ], "wc_reply_authors_avg": [ 424.0, 258.15240976343154 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4958493557225234324&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "ByeLmn0qtX", "title": "Variational Domain Adaptation", "track": "main", "status": "Reject", "tldr": "This paper proposes variational domain adaptation, a uni\ufb01ed, scalable, simple framework for learning multiple distributions through variational inference", "abstract": "This paper proposes variational domain adaptation, a unified, scalable, simple framework for learning multiple distributions through variational inference. Unlike the existing methods on domain transfer through deep generative models, such as StarGAN (Choi et al., 2017) and UFDN (Liu et al., 2018), the variational domain adaptation has three advantages. Firstly, the samples from the target are not required. Instead, the framework requires one known source as a prior $p(x)$ and binary discriminators, $p(\\mathcal{D}_i|x)$, discriminating the target domain $\\mathcal{D}_i$ from others. Consequently, the framework regards a target as a posterior that can be explicitly formulated through the Bayesian inference, $p(x|\\mathcal{D}_i) \\propto p(\\mathcal{D}_i|x)p(x)$, as exhibited by a further proposed model of dual variational autoencoder (DualVAE). Secondly, the framework is scablable to large-scale domains. As well as VAE encodes a sample $x$ as a mode on a latent space: $\\mu(x) \\in \\mathcal{Z}$, DualVAE encodes a domain $\\mathcal{D}_i$ as a mode on the dual latent space $\\mu^*(\\mathcal{D}_i) \\in \\mathcal{Z}^*$, named domain embedding. It reformulates the posterior with a natural paring $\\langle, \\rangle: \\mathcal{Z} \\times \\mathcal{Z}^* \\rightarrow \\Real$, which can be expanded to uncountable infinite domains such as continuous domains as well as interpolation. 
Thirdly, DualVAE fastly converges without sophisticated automatic/manual hyperparameter search in comparison to GANs as it requires only one additional parameter to VAE. Through the numerical experiment, we demonstrate the three benefits with multi-domain image generation task on CelebA with up to 60 domains, and exhibits that DualVAE records the state-of-the-art performance outperforming StarGAN and UFDN.", "keywords": "domain adaptation;variational inference;multi-domain", "primary_area": "", "supplementary_material": "", "author": "Hirono Okamoto;Shohei Ohsawa;Itto Higuchi;Haruka Murakami;Mizuki Sango;Zhenghang Cui;Masahiro Suzuki;Hiroshi Kajino;Yutaka Matsuo", "authorids": "ohsawa@weblab.t.u-tokyo.ac.jp;ohsawa@weblab.t.u-tokyo.ac.jp;ohsawa@weblab.t.u-tokyo.ac.jp;ohsawa@weblab.t.u-tokyo.ac.jp;ohsawa@weblab.t.u-tokyo.ac.jp;ohsawa@weblab.t.u-tokyo.ac.jp;ohsawa@weblab.t.u-tokyo.ac.jp;ohsawa@weblab.t.u-tokyo.ac.jp;ohsawa@weblab.t.u-tokyo.ac.jp", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@misc{\nokamoto2019variational,\ntitle={Variational Domain Adaptation},\nauthor={Hirono Okamoto and Shohei Ohsawa and Itto Higuchi and Haruka Murakami and Mizuki Sango and Zhenghang Cui and Masahiro Suzuki and Hiroshi Kajino and Yutaka Matsuo},\nyear={2019},\nurl={https://openreview.net/forum?id=ByeLmn0qtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ByeLmn0qtX", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;3;3", "wc_review": "435;133;411", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "598;251;532", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 326.3333333333333, 137.05797637820604 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 460.3333333333333, 150.45338886918506 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Learning to Screen for Fast Softmax Inference on Large Vocabulary Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1014", "id": "ByeMB3Act7", "author_site": "Patrick CHen, Si Si, Sanjiv Kumar, Yang Li, Cho-Jui Hsieh", "tldr": "", "abstract": "Neural language models have been widely used in various NLP tasks, including machine translation, next word prediction and conversational agents. However, it is challenging to deploy these models on mobile devices due to their slow prediction speed, where the bottleneck is to compute top candidates in the softmax layer. In this paper, we introduce a novel softmax layer approximation algorithm by exploiting the clustering structure of context vectors. Our algorithm uses a light-weight screening model to predict a much smaller set of candidate words based on the given context, and then conducts an exact softmax only within that subset. Training such a procedure end-to-end is challenging as traditional clustering methods are discrete and non-differentiable, and thus unable to be used with back-propagation in the training process. 
Using the Gumbel softmax, we are able to train the screening model end-to-end on the training set to exploit data distribution. The algorithm achieves an order of magnitude faster inference than the original softmax layer for predicting top-k words in various tasks such as beam search in machine translation or next words prediction. For example, for machine translation task on German to English dataset with around 25K vocabulary, we can achieve 20.4 times speed up with 98.9% precision@1 and 99.3% precision@5 with the original softmax layer prediction, while state-of-the-art (Zhang et al., 2018) only achieves 6.7x speedup with 98.7% precision@1 and 98.1% precision@5 for the same task.", "keywords": "fast inference;softmax computation;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Patrick Chen;Si Si;Sanjiv Kumar;Yang Li;Cho-Jui Hsieh", "authorids": "patrickchen@g.ucla.edu;sisidaisy@google.com;sanjivk@google.com;liyang@google.com;chohsieh@cs.ucla.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nchen2018learning,\ntitle={Learning to Screen for Fast Softmax Inference on Large Vocabulary Neural Networks},\nauthor={Patrick Chen and Si Si and Sanjiv Kumar and Yang Li and Cho-Jui Hsieh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByeMB3Act7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;4", "wc_review": "172;337;163", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "187;336;219", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 224.0, 79.98749902328488 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 247.33333333333334, 64.04338807055382 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17886875272425018344&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ByeMB3Act7", "pdf": "https://openreview.net/pdf?id=ByeMB3Act7", "email": ";;;;", "author_num": 5 }, { "id": "ByeNFoRcK7", "title": "PA-GAN: Improving GAN Training by Progressive Augmentation", "track": "main", "status": "Reject", "tldr": "We introduce a new technique - progressive augmentation of GANs (PA-GAN) - that helps to improve the overall stability of GAN training.", "abstract": "Despite recent progress, Generative Adversarial Networks (GANs) still suffer from training instability, requiring careful consideration of architecture design choices and hyper-parameter tuning. The reason for this fragile training behaviour is partially due to the discriminator performing well very quickly; its loss converges to zero, providing no reliable backpropagation signal to the generator. In this work we introduce a new technique - progressive augmentation of GANs (PA-GAN) - that helps to overcome this fundamental limitation and improve the overall stability of GAN training. 
The key idea is to gradually increase the task difficulty of the discriminator by progressively augmenting its input space, thus enabling continuous learning of the generator. We show that the proposed progressive augmentation preserves the original GAN objective, does not bias the optimality of the discriminator and encourages the healthy competition between the generator and discriminator, leading to a better-performing generator. We experimentally demonstrate the effectiveness of the proposed approach on multiple benchmarks (MNIST, Fashion-MNIST, CIFAR10, CELEBA) for the image generation task.", "keywords": "Deep Learning;GANs;Augmentation;Generative Modelling", "primary_area": "", "supplementary_material": "", "author": "Dan Zhang;Anna Khoreva", "authorids": "dan.zhang2@de.bosch.com;anna.khoreva@de.bosch.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhang2019pagan,\ntitle={{PA}-{GAN}: Improving {GAN} Training by Progressive Augmentation},\nauthor={Dan Zhang and Anna Khoreva},\nyear={2019},\nurl={https://openreview.net/forum?id=ByeNFoRcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ByeNFoRcK7", "pdf_size": 0, "rating": "4;5;5", "confidence": "2;4;5", "wc_review": "216;365;648", "wc_reply_reviewers": "0;0;329", "wc_reply_authors": "29;721;967", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 409.6666666666667, 179.16906975132608 ], "wc_reply_reviewers_avg": [ 109.66666666666667, 155.0920873402494 ], "wc_reply_authors_avg": [ 572.3333333333334, 397.10396063947223 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9449111825230683, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16266963327849989346&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ByePUo05K7", "title": "What a difference a pixel makes: An empirical examination of features used by CNNs for categorisation", "track": "main", "status": "Reject", "tldr": "This study highlights a key difference between human vision and CNNs: while object recognition in humans relies on analysing shape, CNNs do not have such a shape-bias.", "abstract": "Convolutional neural networks (CNNs) were inspired by human vision and, in some settings, achieve a performance comparable to human object recognition. This has lead to the speculation that both systems use similar mechanisms to perform recognition. In this study, we conducted a series of simulations that indicate that there is a fundamental difference between human vision and CNNs: while object recognition in humans relies on analysing shape, CNNs do not have such a shape-bias. We teased apart the type of features selected by the model by modifying the CIFAR-10 dataset so that, in addition to containing objects with shape, the images concurrently contained non-shape features, such as a noise-like mask. When trained on these modified set of images, the model did not show any bias towards selecting shapes as features. 
Instead it relied on whichever feature allowed it to perform the best prediction -- even when this feature was a noise-like mask or a single predictive pixel amongst 50176 pixels. We also found that regularisation methods, such as batch normalisation or Dropout, did not change this behaviour and neither did past or concurrent experience with images from other datasets.", "keywords": "deep learning;shape bias;vision;feature selection", "primary_area": "", "supplementary_material": "", "author": "Gaurav Malhotra;Jeffrey Bowers", "authorids": "gaurav.malhotra@bristol.ac.uk;j.bowers@bristol.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmalhotra2019what,\ntitle={What a difference a pixel makes: An empirical examination of features used by {CNN}s for categorisation},\nauthor={Gaurav Malhotra and Jeffrey Bowers},\nyear={2019},\nurl={https://openreview.net/forum?id=ByePUo05K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ByePUo05K7", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;4;5", "wc_review": "662;393;254", "wc_reply_reviewers": "707;0;0", "wc_reply_authors": "2581;469;322", "reply_reviewers": "2;0;0", "reply_authors": "6;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 436.3333333333333, 169.36023408370954 ], "wc_reply_reviewers_avg": [ 235.66666666666666, 333.2829961992594 ], "wc_reply_authors_avg": [ 1124.0, 1032.0009689917931 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.6666666666666665, 2.3570226039551585 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8526816239675730539&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Adaptive Posterior Learning: few-shot learning with a surprise-based memory module", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/988", "id": "ByeSdsC9Km", "author_site": "Tiago Ramalho, Marta Garnelo", "tldr": "We introduce a model which generalizes quickly from few observations by storing surprising information and attending over the most relevant data at each time point.", "abstract": "The ability to generalize quickly from few observations is crucial for intelligent systems. In this paper we introduce APL, an algorithm that approximates probability distributions by remembering the most surprising observations it has encountered. These past observations are recalled from an external memory module and processed by a decoder network that can combine information from different memory slots to generalize beyond direct recall. We show this algorithm can perform as well as state of the art baselines on few-shot classification benchmarks with a smaller memory footprint. In addition, its memory compression allows it to scale to thousands of unknown labels. Finally, we introduce a meta-learning reasoning task which is more challenging than direct classification. 
In this setting, APL is able to generalize with fewer than one example per class via deductive reasoning.", "keywords": "metalearning;memory;few-shot;relational;self-attention;classification;sequential;reasoning;working memory;episodic memory", "primary_area": "", "supplementary_material": "", "author": "Tiago Ramalho;Marta Garnelo", "authorids": "tiago.mpramalho@gmail.com;garnelo@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nramalho2018adaptive,\ntitle={Adaptive Posterior Learning: few-shot learning with a surprise-based memory module},\nauthor={Tiago Ramalho and Marta Garnelo},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByeSdsC9Km},\n}", "github": "[![github](/images/github_icon.svg) cogentlabs/apl](https://github.com/cogentlabs/apl)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;3", "wc_review": "213;655;401", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "185;984;306", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 423.0, 181.11506471485652 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 491.6666666666667, 351.61942810689834 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3877086335539241291&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ByeSdsC9Km", "pdf": "https://openreview.net/pdf?id=ByeSdsC9Km", "email": ";", "author_num": 2 }, { "id": "ByeTHsAqtX", "title": "Gradient Descent Happens in a Tiny Subspace", "track": "main", "status": "Reject", "tldr": "For classification problems with k classes, we show that the gradient tends to live in a tiny, slowly-evolving subspace spanned by the eigenvectors corresponding to the k-largest eigenvalues of the Hessian.", "abstract": "We show that in a variety of large-scale deep learning scenarios the gradient dynamically converges to a very small subspace after a short period of training. The subspace is spanned by a few top eigenvectors of the Hessian (equal to the number of classes in the dataset), and is mostly preserved over long periods of training. A simple argument then suggests that gradient descent may happen mostly in this subspace. We give an example of this effect in a solvable model of classification, and we comment on possible implications for optimization and learning.", "keywords": "Gradient Descent;Hessian;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Guy Gur-Ari;Daniel A. Roberts;Ethan Dyer", "authorids": "guyg@ias.edu;danr@fb.com;edyer@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngur-ari2019gradient,\ntitle={Gradient Descent Happens in a Tiny Subspace},\nauthor={Guy Gur-Ari and Daniel A. 
Roberts and Ethan Dyer},\nyear={2019},\nurl={https://openreview.net/forum?id=ByeTHsAqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByeTHsAqtX", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;3;4", "wc_review": "290;152;400", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "669;296;665", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 280.6666666666667, 101.46044658990134 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 543.3333333333334, 174.89870084010218 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 237, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=603104952013109434&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "ByeWdiR5Ym", "title": "Adaptive Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "An adaptve convolutional kernel, that includes non-linear transformations obtaining similar results as the state of the art algorithms, while yielding a reduction in required memory up to 16x in the CIFAR10", "abstract": "The quest for increased visual recognition performance has led to the development of highly complex neural networks with very deep topologies. To avoid high computing resource requirements of such complex networks and to enable operation on devices with limited resources, this paper introduces adaptive kernels for convolutional layers. Motivated by the non-linear perception response in human visual cells, the input image is used to define the weights of a dynamic kernel called Adaptive kernel. This new adaptive kernel is used to perform a second convolution of the input image generating the output pixel. Adaptive kernels enable accurate recognition with lower memory requirements; This is accomplished through reducing the number of kernels and the number of layers needed in the typical CNN configuration, in addition to reducing the memory used, increasing 2X the training speed and the number of activation function evaluations. 
Our experiments show a reduction of 70X in the memory used for MNIST, maintaining 99% accuracy and 16X memory reduction for CIFAR10 with 92.5% accuracy.", "keywords": "Adaptive kernels;Dynamic kernels;Pattern recognition;low memory CNNs", "primary_area": "", "supplementary_material": "", "author": "Julio Cesar Zamora;Jesus Adan Cruz Vargas;Omesh Tickoo", "authorids": "julio.c.zamora.esquivel@intel.com;jesus.a.cruz.vargas@intel.com;omesh.tickoo@intel.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzamora2019adaptive,\ntitle={Adaptive Convolutional Neural Networks},\nauthor={Julio Cesar Zamora and Jesus Adan Cruz Vargas and Omesh Tickoo},\nyear={2019},\nurl={https://openreview.net/forum?id=ByeWdiR5Ym},\n}", "github": "[![github](/images/github_icon.svg) adapconv/adaptive-cnn](https://github.com/adapconv/adaptive-cnn)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ByeWdiR5Ym", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;3", "wc_review": "449;173;417", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "599;362;657", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 346.3333333333333, 123.25943732180872 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 539.3333333333334, 127.60964784146309 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 383, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16269301846334705331&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "title": "KnockoffGAN: Generating Knockoffs for Feature Selection using Generative Adversarial Networks", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1012", "id": "ByeZ5jC5YQ", "author_site": "James Jordon, Jinsung Yoon, Mihaela Schaar", "tldr": "", "abstract": "Feature selection is a pervasive problem. The discovery of relevant features can be as important for performing a particular task (such as to avoid overfitting in prediction) as it can be for understanding the underlying processes governing the true label (such as discovering relevant genetic factors for a disease). Machine learning driven feature selection can enable discovery from large, high-dimensional, non-linear observational datasets by creating a subset of features for experts to focus on. In order to use expert time most efficiently, we need a principled methodology capable of controlling the False Discovery Rate. In this work, we build on the promising Knockoff framework by developing a flexible knockoff generation model. We adapt the Generative Adversarial Networks framework to allow us to generate knockoffs with no assumptions on the feature distribution. Our model consists of 4 networks, a generator, a discriminator, a stability network and a power network. 
We demonstrate the capability of our model to perform feature selection, showing that it performs as well as the originally proposed knockoff generation model in the Gaussian setting and that it outperforms the original model in non-Gaussian settings, including on a real-world dataset.", "keywords": "Knockoff model;Feature selection;False discovery rate control;Generative Adversarial networks", "primary_area": "", "supplementary_material": "", "author": "James Jordon;Jinsung Yoon;Mihaela van der Schaar", "authorids": "james.jordon@wolfson.ox.ac.uk;jsyoon0823@gmail.com;mihaela.vanderschaar@eng.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\njordon2018knockoffgan,\ntitle={Knockoff{GAN}: Generating Knockoffs for Feature Selection using Generative Adversarial Networks},\nauthor={James Jordon and Jinsung Yoon and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByeZ5jC5YQ},\n}", "github": "[![github](/images/github_icon.svg) vanderschaarlab/mlforhealthlabpub](https://github.com/vanderschaarlab/mlforhealthlabpub/tree/main/alg/knockoffgan)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;10", "confidence": "4;4;4", "wc_review": "463;224;935", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "376;628;772", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 7.666666666666667, 1.699673171197595 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 540.6666666666666, 295.41421014493454 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 592.0, 163.65818036383027 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13308269050823437896&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ByeZ5jC5YQ", "pdf": "https://openreview.net/pdf?id=ByeZ5jC5YQ", "email": ";;", "author_num": 3 }, { "id": "ByecAoAqK7", "title": "Zero-shot Dual Machine Translation", "track": "main", "status": "Reject", "tldr": "A multilingual NMT model with reinforcement learning (dual learning) aiming to improve zero-shot translation directions.", "abstract": "Neural Machine Translation (NMT) systems rely on large amounts of parallel data. This is a major challenge for low-resource languages. Building on recent work on unsupervised and semi-supervised methods, we present an approach that combines zero-shot and dual learning. The latter relies on reinforcement learning, to exploit the duality of the machine translation task, and requires only monolingual data for the target language pair. Experiments on the UN corpus show that a zero-shot dual system, trained on English-French and English-Spanish, outperforms by large margins a standard NMT system in zero-shot translation performance on Spanish-French (both directions). We also evaluate on newstest2014.
These experiments show that the zero-shot dual method outperforms the LSTM-based unsupervised NMT system proposed in (Lample et al., 2018b), on the en\u2192fr task, while on the fr\u2192en task it outperforms both the LSTM-based and the Transformers-based unsupervised NMT systems.", "keywords": "unsupervised;machine translation;dual learning;zero-shot", "primary_area": "", "supplementary_material": "", "author": "Lierni Sestorain;Massimiliano Ciaramita;Christian Buck;Thomas Hofmann", "authorids": "lierni@google.com;massi@google.com;cbuck@google.com;thomas.hofmann@inf.ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsestorain2019zeroshot,\ntitle={Zero-shot Dual Machine Translation},\nauthor={Lierni Sestorain and Massimiliano Ciaramita and Christian Buck and Thomas Hofmann},\nyear={2019},\nurl={https://openreview.net/forum?id=ByecAoAqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ByecAoAqK7", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;3", "wc_review": "191;315;430", "wc_reply_reviewers": "0;190;0", "wc_reply_authors": "187;508;398", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 312.0, 97.59439874637614 ], "wc_reply_reviewers_avg": [ 63.333333333333336, 89.56685895029602 ], "wc_reply_authors_avg": [ 364.3333333333333, 133.19242554373062 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=990438134286439330&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Probabilistic Planning with Sequential Monte Carlo methods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/783", "id": "ByetGn0cYX", "author_site": "Alexandre Piche, Valentin Thomas, Cyril Ibrahim, Yoshua Bengio, Christopher Pal", "tldr": "Leveraging control as inference and Sequential Monte Carlo methods, we proposed a probabilistic planning algorithm.", "abstract": "In this work, we propose a novel formulation of planning which views it as a probabilistic inference problem over future optimal trajectories. This enables us to use sampling methods, and thus, tackle planning in continuous domains using a fixed computational budget. We design a new algorithm, Sequential Monte Carlo Planning, by leveraging classical methods in Sequential Monte Carlo and Bayesian smoothing in the context of control as inference.
Furthermore, we show that Sequential Monte Carlo Planning can capture multimodal policies and can quickly learn continuous control tasks.", "keywords": "control as inference;probabilistic planning;sequential monte carlo;model based reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Alexandre Piche;Valentin Thomas;Cyril Ibrahim;Yoshua Bengio;Chris Pal", "authorids": "alexandrelpiche@gmail.com;vltn.thomas@gmail.com;cyril.ibrahim@elementai.com;yoshua.umontreal@gmail.com;christopher.pal@polymtl.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\npiche2018probabilistic,\ntitle={Probabilistic Planning with Sequential Monte Carlo methods},\nauthor={Alexandre Pich\\'{e} and Valentin Thomas and Cyril Ibrahim and Yoshua Bengio and Chris Pal},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByetGn0cYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;4;4", "wc_review": "914;194;793", "wc_reply_reviewers": "273;0;16", "wc_reply_authors": "3412;377;406", "reply_reviewers": "4;0;1", "reply_authors": "8;2;1", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 633.6666666666666, 314.79128888130873 ], "wc_reply_reviewers_avg": [ 96.33333333333333, 125.09285440112966 ], "wc_reply_authors_avg": [ 1398.3333333333333, 1423.9265742617802 ], "reply_reviewers_avg": [ 1.6666666666666667, 1.699673171197595 ], "reply_authors_avg": [ 3.6666666666666665, 3.091206165165235 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1860428806595804827&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=ByetGn0cYX", "pdf": "https://openreview.net/pdf?id=ByetGn0cYX", "email": ";;;;", "author_num": 5 }, { "title": "Plan Online, Learn Offline: Efficient Learning and Exploration via Model-Based Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/907", "id": "Byey7n05FQ", "author_site": "Kendall Lowrey, Aravind Rajeswaran, Sham M Kakade, Emanuel Todorov, Igor Mordatch", "tldr": "We propose a framework that incorporates planning for efficient exploration and learning in complex environments.", "abstract": "We propose a \"plan online and learn offline\" framework for the setting where an agent, with an internal model, needs to continually act and learn in the world. Our work builds on the synergistic relationship between local model-based control, global value function learning, and exploration. We study how local trajectory optimization can cope with approximation errors in the value function, and can stabilize and accelerate value function learning. Conversely, we also study how approximate value functions can help reduce the planning horizon and allow for better policies beyond local solutions. Finally, we also demonstrate how trajectory optimization can be used to perform temporally coordinated exploration in conjunction with estimating uncertainty in value function approximation. This exploration is critical for fast and stable learning of the value function. 
Combining these components enable solutions to complex control tasks, like humanoid locomotion and dexterous in-hand manipulation, in the equivalent of a few minutes of experience in the real world.", "keywords": "deep reinforcement learning;exploration;model-based", "primary_area": "", "supplementary_material": "", "author": "Kendall Lowrey;Aravind Rajeswaran;Sham Kakade;Emanuel Todorov;Igor Mordatch", "authorids": "kendall.lowrey@gmail.com;rajeswaran.aravind@gmail.com;sham@cs.washington.edu;etodorov@gmail.com;mordatch@openai.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nlowrey2018plan,\ntitle={Plan Online, Learn Offline: Efficient Learning and Exploration via Model-Based Control},\nauthor={Kendall Lowrey and Aravind Rajeswaran and Sham Kakade and Emanuel Todorov and Igor Mordatch},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Byey7n05FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;5;4", "wc_review": "271;378;399", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "610;979;685", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 349.3333333333333, 56.04958122551458 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 758.0, 159.24195427085164 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 284, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12747766892860310052&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=Byey7n05FQ", "pdf": "https://openreview.net/pdf?id=Byey7n05FQ", "email": ";;;;", "author_num": 5 }, { "id": "ByezgnA5tm", "title": "Constraining Action Sequences with Formal Languages for Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We constrain an agent's actions during reinforcement learning, for safety or to enhance exploration.", "abstract": "We study the problem of deep reinforcement learning where the agent's action sequences are constrained, e.g., prohibition of dithering or overactuating action sequences that might damage a robot, drone, or other physical device. Our model focuses on constraints that can be described by automata such as DFAs or PDAs. We then propose multiple approaches to augment the state descriptions of the Markov decision process (MDP) with summaries of recent action histories. We empirically evaluate these methods applying DQN to three Atari games, training with reward shaping. We found that our approaches are effective in significantly reducing, and even eliminating, constraint violations while maintaining high reward. 
We also observed that the total reward achieved by an agent can be highly sensitive to how much the constraints encourage or discourage exploration of potentially effective actions during training, and, in addition to helping ensure safe policies, the use of constraints can enhance exploration during training.", "keywords": "reinforcement learning;constraints;finite state machines", "primary_area": "", "supplementary_material": "", "author": "Dong Xu;Eleanor Quint;Zeynep Hakguder;Haluk Dogan;Stephen Scott;Matthew Dwyer", "authorids": "dx@virginia.edu;pquint@cse.unl.edu;zeynep.hakguder@huskers.unl.edu;haluk.dogan@huskers.unl.edu;sscott@cse.unl.edu;matthewbdwyer@virginia.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nxu2019constraining,\ntitle={Constraining Action Sequences with Formal Languages for Deep Reinforcement Learning},\nauthor={Dong Xu and Eleanor Quint and Zeynep Hakguder and Haluk Dogan and Stephen Scott and Matthew Dwyer},\nyear={2019},\nurl={https://openreview.net/forum?id=ByezgnA5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ByezgnA5tm", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;4", "wc_review": "470;657;337", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 488.0, 131.25801562825285 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:22hmMiW-JdcJ:scholar.google.com/&scioq=Constraining+Action+Sequences+with+Formal+Languages+for+Deep+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "DHER: Hindsight Experience Replay for Dynamic Goals", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/775", "id": "Byf5-30qFX", "author_site": "Meng Fang, Cheng Zhou, Bei Shi, Boqing Gong, Jia Xu, Tong Zhang", "tldr": "", "abstract": "Dealing with sparse rewards is one of the most important challenges in reinforcement learning (RL), especially when a goal is dynamic (e.g., to grasp a moving object). Hindsight experience replay (HER) has been shown an effective solution to handling sparse rewards with fixed goals. However, it does not account for dynamic goals in its vanilla form and, as a result, even degrades the performance of existing off-policy RL algorithms when the goal is changing over time. \n\nIn this paper, we present Dynamic Hindsight Experience Replay (DHER), a novel approach for tasks with dynamic goals in the presence of sparse rewards. DHER automatically assembles successful experiences from two relevant failures and can be used to enhance an arbitrary off-policy RL algorithm when the tasks' goals are dynamic. We evaluate DHER on tasks of robotic manipulation and moving object tracking, and transfer the polices from simulation to physical robots. 
Extensive comparison and ablation studies demonstrate the superiority of our approach, showing that DHER is a crucial ingredient to enable RL to solve tasks with dynamic goals in manipulation and grid world domains.", "keywords": "Sparse rewards;Dynamic goals;Experience replay", "primary_area": "", "supplementary_material": "", "author": "Meng Fang;Cheng Zhou;Bei Shi;Boqing Gong;Jia Xu;Tong Zhang", "authorids": "moefang@gmail.com;chengzhmike@gmail.com;shibei00@gmail.com;boqinggo@outlook.com;jiaxu@cs.wisc.edu;tongzhang@tongzhang-ml.org", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nfang2018dher,\ntitle={{DHER}: Hindsight Experience Replay for Dynamic Goals},\nauthor={Meng Fang and Cheng Zhou and Bei Shi and Boqing Gong and Weitao Xi and Tianzhou Wang and Jia Xu and Tong Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Byf5-30qFX},\n}", "github": "[![github](/images/github_icon.svg) mengf1/DHER](https://github.com/mengf1/DHER)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;4", "wc_review": "705;403;323", "wc_reply_reviewers": "179;432;0", "wc_reply_authors": "1436;750;243", "reply_reviewers": "2;1;0", "reply_authors": "4;2;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 477.0, 164.49518736627726 ], "wc_reply_reviewers_avg": [ 203.66666666666666, 177.22365029031036 ], "wc_reply_authors_avg": [ 809.6666666666666, 488.8642165491582 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 114, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7304888076080019198&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Byf5-30qFX", "pdf": "https://openreview.net/pdf?id=Byf5-30qFX", "email": ";;;;;", "author_num": 6 }, { "id": "ByfXe2C5tm", "title": "NLProlog: Reasoning with Weak Unification for Natural Language Question Answering", "track": "main", "status": "Reject", "tldr": "We introduce NLProlog, a system that performs rule-based reasoning on natural language by leveraging pretrained sentence embeddings and fine-tuning with Evolution Strategies, and apply it to two multi-hop Question Answering tasks.", "abstract": "Symbolic logic allows practitioners to build systems that perform rule-based reasoning which is interpretable and which can easily be augmented with prior knowledge. However, such systems are traditionally difficult to apply to problems involving natural language due to the large linguistic variability of language. Currently, most work in natural language processing focuses on neural networks which learn distributed representations of words and their composition, thereby performing well in the presence of large linguistic variability. We propose to reap the benefits of both approaches by applying a combination of neural networks and logic programming to natural language question answering. 
We propose to employ an external, non-differentiable Prolog prover which utilizes a similarity function over pretrained sentence encoders. We fine-tune these representations via Evolution Strategies with the goal of multi-hop reasoning on natural language. This allows us to create a system that can apply rule-based reasoning to natural language and induce domain-specific natural language rules from training data. We evaluate the proposed system on two different question answering tasks, showing that it complements two very strong baselines \u2013 BIDAF (Seo et al., 2016a) and FASTQA (Weissenborn et al.,2017) \u2013 and outperforms both when used in an ensemble.", "keywords": "symbolic reasoning;neural networks;natural language processing;question answering;sentence embeddings;evolution strategies", "primary_area": "", "supplementary_material": "", "author": "Leon Weber;Pasquale Minervini;Ulf Leser;Tim Rockt\u00e4schel", "authorids": "leonweber@posteo.de;p.minervini@gmail.com;leser@informatik.hu-berlin.de;tim.rocktaeschel@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nweber2019nlprolog,\ntitle={{NLP}rolog: Reasoning with Weak Unification for Natural Language Question Answering},\nauthor={Leon Weber and Pasquale Minervini and Ulf Leser and Tim Rockt\u00e4schel},\nyear={2019},\nurl={https://openreview.net/forum?id=ByfXe2C5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByfXe2C5tm", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;4;3", "wc_review": "348;734;328", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "548;801;375", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 470.0, 186.85466723276318 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 574.6666666666666, 174.93300304846866 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UnqJvL_icYwJ:scholar.google.com/&scioq=NLProlog:+Reasoning+with+Weak+Unification+for+Natural+Language+Question+Answering&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "ByfbnsA9Km", "title": "Cross-Entropy Loss Leads To Poor Margins", "track": "main", "status": "Reject", "tldr": "We show that minimizing the cross-entropy loss by using a gradient method could lead to a very poor margin if the features of the dataset lie on a low-dimensional subspace.", "abstract": "Neural networks could misclassify inputs that are slightly different from their training data, which indicates a small margin between their decision boundaries and the training dataset. In this work, we study the binary classification of linearly separable datasets and show that linear classifiers could also have decision boundaries that lie close to their training dataset if cross-entropy loss is used for training. 
In particular, we show that if the features of the training dataset lie in a low-dimensional affine subspace and the cross-entropy loss is minimized by using a gradient method, the margin between the training points and the decision boundary could be much smaller than the optimal value. This result is contrary to the conclusions of recent related works such as (Soudry et al., 2018), and we identify the reason for this contradiction. In order to improve the margin, we introduce differential training, which is a training paradigm that uses a loss function defined on pairs of points from each class. We show that the decision boundary of a linear classifier trained with differential training indeed achieves the maximum margin. The results reveal the use of cross-entropy loss as one of the hidden culprits of adversarial examples and introduces a new direction to make neural networks robust against them.", "keywords": "Cross-entropy loss;Binary classification;Low-rank features;Adversarial examples;Differential training", "primary_area": "", "supplementary_material": "", "author": "Kamil Nar;Orhan Ocal;S. Shankar Sastry;Kannan Ramchandran", "authorids": "nar@berkeley.edu;ocal@eecs.berkeley.edu;sastry@eecs.berkeley.edu;kannanr@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nnar2019crossentropy,\ntitle={Cross-Entropy Loss Leads To Poor Margins},\nauthor={Kamil Nar and Orhan Ocal and S. Shankar Sastry and Kannan Ramchandran},\nyear={2019},\nurl={https://openreview.net/forum?id=ByfbnsA9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer5;AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByfbnsA9Km", "pdf_size": 0, "rating": "3;4;5;5;8", "confidence": "4;5;4;4;3", "wc_review": "131;531;202;295;272", "wc_reply_reviewers": "0;407;0;316;0", "wc_reply_authors": "0;293;181;623;104", "reply_reviewers": "0;1;0;1;0", "reply_authors": "0;1;2;1;1", "rating_avg": [ 5.0, 1.6733200530681511 ], "confidence_avg": [ 4.0, 0.6324555320336759 ], "wc_review_avg": [ 286.2, 135.19082809125774 ], "wc_reply_reviewers_avg": [ 144.6, 179.42084605753035 ], "wc_reply_authors_avg": [ 240.2, 214.0442944813059 ], "reply_reviewers_avg": [ 0.4, 0.48989794855663565 ], "reply_authors_avg": [ 1.0, 0.6324555320336759 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1332505833615928188&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "FlowQA: Grasping Flow in History for Conversational Machine Comprehension", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/761", "id": "ByftGnR9KX", "author_site": "Hsin-Yuan Huang, Eunsol Choi, Wen-tau Yih", "tldr": "We propose the Flow mechanism and an end-to-end architecture, FlowQA, that achieves SotA on two conversational QA datasets and a sequential instruction understanding task.", "abstract": "Conversational machine comprehension requires a deep understanding of the conversation history. To enable traditional, single-turn models to encode the history comprehensively, we introduce Flow, a mechanism that can incorporate intermediate representations generated during the process of answering previous questions, through an alternating parallel processing structure. 
Compared to shallow approaches that concatenate previous questions/answers as input, Flow integrates the latent semantics of the conversation history more deeply. Our model, FlowQA, shows superior performance on two recently proposed conversational challenges (+7.2% F1 on CoQA and +4.0% on QuAC). The effectiveness of Flow also shows in other tasks. By reducing sequential instruction understanding to conversational machine comprehension, FlowQA outperforms the best models on all three domains in SCONE, with +1.8% to +4.4% improvement in accuracy.", "keywords": "Machine Comprehension;Conversational Agent;Natural Language Processing;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Hsin-Yuan Huang;Eunsol Choi;Wen-tau Yih", "authorids": "hsinyuan@caltech.edu;eunsol@cs.washington.edu;scottyih@allenai.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhuang2018flowqa,\ntitle={Flow{QA}: Grasping Flow in History for Conversational Machine Comprehension},\nauthor={Hsin-Yuan Huang and Eunsol Choi and Wen-tau Yih},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByftGnR9KX},\n}", "github": "[![github](/images/github_icon.svg) momohuang/FlowQA](https://github.com/momohuang/FlowQA)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;5", "wc_review": "759;169;195", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "964;257;76", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 374.3333333333333, 272.20743724022105 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 432.3333333333333, 383.1382112095959 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13021094548556076955&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=ByftGnR9KX", "pdf": "https://openreview.net/pdf?id=ByftGnR9KX", "email": ";;", "author_num": 3 }, { "title": "Learning to Design RNA", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/921", "id": "ByfyHh05tQ", "author_site": "Frederic Runge, Danny Stoll, Stefan Falkner, Frank Hutter", "tldr": "We learn to solve the RNA Design problem with reinforcement learning using meta learning and autoML approaches.", "abstract": "Designing RNA molecules has garnered recent interest in medicine, synthetic biology, biotechnology and bioinformatics since many functional RNA molecules were shown to be involved in regulatory processes for transcription, epigenetics and translation. Since an RNA's function depends on its structural properties, the RNA Design problem is to find an RNA sequence which satisfies given structural constraints. Here, we propose a new algorithm for the RNA Design problem, dubbed LEARNA. LEARNA uses deep reinforcement learning to train a policy network to sequentially design an entire RNA sequence given a specified target structure. 
By meta-learning across 65000 different RNA Design tasks for one hour on 20 CPU cores, our extension Meta-LEARNA constructs an RNA Design policy that can be applied out of the box to solve novel RNA Design tasks. Methodologically, for what we believe to be the first time, we jointly optimize over a rich space of architectures for the policy network, the hyperparameters of the training procedure and the formulation of the decision process. Comprehensive empirical results on two widely-used RNA Design benchmarks, as well as a third one that we introduce, show that our approach achieves new state-of-the-art performance on the former while also being orders of magnitudes faster in reaching the previous state-of-the-art performance. In an ablation study, we analyze the importance of our method's different components.\n", "keywords": "matter engineering;bioinformatics;rna design;reinforcement learning;meta learning;neural architecture search;hyperparameter optimization", "primary_area": "", "supplementary_material": "", "author": "Frederic Runge;Danny Stoll;Stefan Falkner;Frank Hutter", "authorids": "runget@cs.uni-freiburg.de;d.stoll@tutanota.com;sfalkner@cs.uni-freiburg.de;fh@cs.uni-freiburg.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nrunge2018learning,\ntitle={Learning to Design {RNA}},\nauthor={Frederic Runge and Danny Stoll and Stefan Falkner and Frank Hutter},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByfyHh05tQ},\n}", "github": "[![github](/images/github_icon.svg) automl/learna](https://github.com/automl/learna) + [![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=ByfyHh05tQ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;8", "confidence": "1;4;4", "wc_review": "303;322;653", "wc_reply_reviewers": "0;57;451", "wc_reply_authors": "806;1108;3321", "reply_reviewers": "0;1;3", "reply_authors": "1;3;6", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "wc_review_avg": [ 426.0, 160.70054967754984 ], "wc_reply_reviewers_avg": [ 169.33333333333334, 200.5232045314346 ], "wc_reply_authors_avg": [ 1745.0, 1121.199655131354 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 3.3333333333333335, 2.0548046676563256 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.49999999999999994, "gs_citation": 103, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17240520904353756155&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=ByfyHh05tQ", "pdf": "https://openreview.net/pdf?id=ByfyHh05tQ", "email": ";;;", "author_num": 4 }, { "title": "Robust Conditional Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1105", "id": "Byg0DsCqYQ", "author_site": "Grigorios Chrysos, Jean Kossaifi, Stefanos Zafeiriou", "tldr": "We introduce a new type of conditional GAN, which aims to leverage structure in the target space of the generator. We augment the generator with a new, unsupervised pathway to learn the target structure. 
", "abstract": "Conditional generative adversarial networks (cGAN) have led to large improvements in the task of conditional image generation, which lies at the heart of computer vision. The major focus so far has been on performance improvement, while there has been little effort in making cGAN more robust to noise. The regression (of the generator) might lead to arbitrarily large errors in the output, which makes cGAN unreliable for real-world applications. In this work, we introduce a novel conditional GAN model, called RoCGAN, which leverages structure in the target space of the model to address the issue. Our model augments the generator with an unsupervised pathway, which promotes the outputs of the generator to span the target manifold even in the presence of intense noise. We prove that RoCGAN share similar theoretical properties as GAN and experimentally verify that our model outperforms existing state-of-the-art cGAN architectures by a large margin in a variety of domains including images from natural scenes and faces.", "keywords": "conditional GAN;unsupervised pathway;autoencoder;robustness", "primary_area": "", "supplementary_material": "", "author": "Grigorios G. Chrysos;Jean Kossaifi;Stefanos Zafeiriou", "authorids": "greggchrysos@gmail.com;jean.kossaifi@gmail.com;s.zafeiriou@imperial.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchrysos2018rocgan,\ntitle={RoC-{GAN}: Robust Conditional {GAN}},\nauthor={Grigorios G. Chrysos and Jean Kossaifi and Stefanos Zafeiriou},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Byg0DsCqYQ},\n}", "github": "[![github](/images/github_icon.svg) grigorisg9gr/rocgan](https://github.com/grigorisg9gr/rocgan)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "wc_review": "203;390;276", "wc_reply_reviewers": "0;0;25", "wc_reply_authors": "1103;775;795", "reply_reviewers": "0;0;1", "reply_authors": "3;2;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 289.6666666666667, 76.95164426804378 ], "wc_reply_reviewers_avg": [ 8.333333333333334, 11.785113019775793 ], "wc_reply_authors_avg": [ 891.0, 150.1288335619333 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 282, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15862016331433813666&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Byg0DsCqYQ", "pdf": "https://openreview.net/pdf?id=Byg0DsCqYQ", "email": ";;", "author_num": 3 }, { "title": "Learning Protein Structure with a Differentiable Simulator", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/959", "id": "Byg3y3C9Km", "author_site": "John Ingraham, Adam J Riesselman, Chris Sander, Debora Marks", "tldr": "We use an unrolled simulator as an end-to-end differentiable model of protein structure and show it can (sometimes) hierarchically generalize to unseen fold topologies.", "abstract": "The Boltzmann distribution is a natural model for many systems, from brains to materials and biomolecules, but is often of limited utility for 
fitting data because Monte Carlo algorithms are unable to simulate it in available time. This gap between the expressive capabilities and sampling practicalities of energy-based models is exemplified by the protein folding problem, since energy landscapes underlie contemporary knowledge of protein biophysics but computer simulations are challenged to fold all but the smallest proteins from first principles. In this work we aim to bridge the gap between the expressive capacity of energy functions and the practical capabilities of their simulators by using an unrolled Monte Carlo simulation as a model for data. We compose a neural energy function with a novel and efficient simulator based on Langevin dynamics to build an end-to-end-differentiable model of atomic protein structure given amino acid sequence information. We introduce techniques for stabilizing backpropagation under long roll-outs and demonstrate the model's capacity to make multimodal predictions and to, in some cases, generalize to unobserved protein fold types when trained on a large corpus of protein structures.", "keywords": "generative models;simulators;molecular modeling;proteins;structured prediction", "primary_area": "", "supplementary_material": "", "author": "John Ingraham;Adam Riesselman;Chris Sander;Debora Marks", "authorids": "john.ingraham@gmail.com;adam.riesselman@gmail.com;cccsander@gmail.com;deboramarks@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ningraham2018learning,\ntitle={Learning Protein Structure with a Differentiable Simulator},\nauthor={John Ingraham and Adam Riesselman and Chris Sander and Debora Marks},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Byg3y3C9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "6;6;7;7", "confidence": "3;5;3;5", "wc_review": "468;589;255;543", "wc_reply_reviewers": "351;0;0;0", "wc_reply_authors": "1024;549;322;416", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;2;1", "rating_avg": [ 6.5, 0.5 ], "confidence_avg": [ 4.0, 1.0 ], "wc_review_avg": [ 463.75, 128.02612038174084 ], "wc_reply_reviewers_avg": [ 87.75, 151.987458364169 ], "wc_reply_authors_avg": [ 577.75, 269.9707160045326 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 174, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12210832124993097695&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Byg3y3C9Km", "pdf": "https://openreview.net/pdf?id=Byg3y3C9Km", "email": ";;;", "author_num": 4 }, { "id": "Byg54oC5tQ", "title": "Generative Model For Material Irradiation Experiments Based On Prior Knowledge And Attention Mechanism", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Material irradiation experiments are dangerous and complex, requiring a large amount of high-level expertise in the manual processing of experimental images and data. 
In this paper, we propose a generative adversarial model based on prior knowledge and an attention mechanism to achieve the generation of irradiated material images (data-to-image model), and a prediction model for the corresponding industrial performance (image-to-data model). With the proposed models, researchers can skip the dangerous and complex irradiation experiments and obtain the irradiation images and industrial performance parameters directly by inputting only some experimental parameters. We also introduce a new dataset, ISMD, which contains 22,000 irradiated images with 22,143 sets of corresponding parameters. Our model achieved high-quality results compared with several baseline models. The evaluation and detailed analysis are also performed.", "keywords": "Generative Model;Images of Irradiation Experiments;Prior Knowledge;Attention Mechanism", "primary_area": "", "supplementary_material": "", "author": "MinCong Luo;Li Liu", "authorids": "luomincong@foxmail.com;1920148271@qq.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Byg54oC5tQ", "pdf_size": 0, "rating": "3;3", "confidence": "4;4", "wc_review": "264;194", "wc_reply_reviewers": "0;0", "wc_reply_authors": "0;0", "reply_reviewers": "0;0", "reply_authors": "0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 229.0, 35.0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 3, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XLsRSdMPC6wJ:scholar.google.com/&scioq=Generative+Model+For+Material+Irradiation+Experiments+Based+On+Prior+Knowledge+And+Attention+Mechanism&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Top-Down Neural Model For Formulae", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1026", "id": "Byg5QhR5FQ", "tldr": "A top-down approach to recursively representing propositional formulae by neural networks is presented.", "abstract": "We present a simple neural model that, given a formula and a property, tries to answer the question of whether the formula has the given property, for example whether a propositional formula is always true. The structure of the formula is captured by a feedforward neural network recursively built for the given formula in a top-down manner. The results of this network are then processed by two recurrent neural networks. One of the interesting aspects of our model is how propositional atoms are treated. 
For example, the model is insensitive to their names, it only matters whether they are the same or distinct.", "keywords": "logic;formula;recursive neural networks;recurrent neural networks", "primary_area": "", "supplementary_material": "", "author": "Karel Chvalovsk\u00fd", "authorids": "karel@chvalovsky.cz", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nchvalovsky2018topdown,\ntitle={Top-Down Neural Model For Formulae},\nauthor={Karel Chvalovsky},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Byg5QhR5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "2;4;3", "wc_review": "223;500;362", "wc_reply_reviewers": "21;0;0", "wc_reply_authors": "136;196;148", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 361.6666666666667, 113.08502209498047 ], "wc_reply_reviewers_avg": [ 7.0, 9.899494936611665 ], "wc_reply_authors_avg": [ 160.0, 25.92296279363144 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6459149611018563492&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=Byg5QhR5FQ", "pdf": "https://openreview.net/pdf?id=Byg5QhR5FQ", "email": "", "author_num": 1 }, { "title": "Cost-Sensitive Robustness against Adversarial Examples", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1128", "id": "BygANhA9tQ", "author_site": "Xiao Zhang, David Evans", "tldr": "A general method for training certified cost-sensitive robust classifier against adversarial perturbations", "abstract": "Several recent works have developed methods for training classifiers that are certifiably robust against norm-bounded adversarial perturbations. These methods assume that all the adversarial transformations are equally important, which is seldom the case in real-world applications. We advocate for cost-sensitive robustness as the criteria for measuring the classifier's performance for tasks where some adversarial transformation are more important than others. We encode the potential harm of each adversarial transformation in a cost matrix, and propose a general objective function to adapt the robust training method of Wong & Kolter (2018) to optimize for cost-sensitive robustness. 
Our experiments on simple MNIST and CIFAR10 models with a variety of cost matrices show that the proposed approach can produce models with substantially reduced cost-sensitive robust error, while maintaining classification accuracy.", "keywords": "Certified robustness;Adversarial examples;Cost-sensitive learning", "primary_area": "", "supplementary_material": "", "author": "Xiao Zhang;David Evans", "authorids": "xz7bc@virginia.edu;evans@virginia.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nzhang2018costsensitive,\ntitle={Cost-Sensitive Robustness against Adversarial Examples},\nauthor={Xiao Zhang and David Evans},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BygANhA9tQ},\n}", "github": "[![github](/images/github_icon.svg) xiaozhanguva/Cost-Sensitive-Robustness](https://github.com/xiaozhanguva/Cost-Sensitive-Robustness)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;5;8", "confidence": "3;4;3", "wc_review": "235;325;449", "wc_reply_reviewers": "139;374;163", "wc_reply_authors": "447;1007;505", "reply_reviewers": "1;1;2", "reply_authors": "2;3;3", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 336.3333333333333, 87.73191488215043 ], "wc_reply_reviewers_avg": [ 225.33333333333334, 105.57882784388586 ], "wc_reply_authors_avg": [ 653.0, 251.43322506515852 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16169861265468560490&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=BygANhA9tQ", "pdf": "https://openreview.net/pdf?id=BygANhA9tQ", "email": ";", "author_num": 2 }, { "id": "BygANjA5FX", "title": "IEA: Inner Ensemble Average within a convolutional neural network", "track": "main", "status": "Reject", "tldr": "We inner-ensemble the features of a convolutional neural layer; this increases the network accuracy and generates distinct features.", "abstract": "Ensemble learning is a method of combining multiple trained models to improve model accuracy. We propose the usage of such methods, specifically ensemble average, inside Convolutional Neural Network (CNN) architectures by replacing the single convolutional layers with Inner Average Ensembles (IEA) of multiple convolutional layers. Empirical results on different benchmarking datasets show that CNN models using IEA outperform those with regular convolutional layers and advance the state of the art. 
A visual and a similarity score analysis of the features generated from IEA explains why it boosts the model performance.", "keywords": "Ensemble Convolutional Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Abduallah Mohamed;Xinrui Hua;Xianda Zhou;Christian Claudel", "authorids": "abduallah.mohamed@utexas.edu;xinruihua@utexas.edu;xianda@utexas.edu;christian.claudel@utexas.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmohamed2019iea,\ntitle={{IEA}: Inner Ensemble Average within a convolutional neural network},\nauthor={Abduallah Mohamed and Xinrui Hua and Xianda Zhou and Christian Claudel},\nyear={2019},\nurl={https://openreview.net/forum?id=BygANjA5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=BygANjA5FX", "pdf_size": 0, "rating": "2;4;4", "confidence": "4;5;3", "wc_review": "497;139;572", "wc_reply_reviewers": "66;0;373", "wc_reply_authors": "383;335;654", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 402.6666666666667, 188.93796748021705 ], "wc_reply_reviewers_avg": [ 146.33333333333334, 162.52657902290593 ], "wc_reply_authors_avg": [ 457.3333333333333, 140.43820309619775 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "BygGNnCqKQ", "title": "Architecture Compression", "track": "main", "status": "Reject", "tldr": "Novel gradient descent approach to perform model compression in architecture space", "abstract": "In this paper we propose a novel approach to model compression termed Architecture Compression. Instead of operating on the weight or filter space of the network like classical model compression methods, our approach operates on the architecture space. A 1-D CNN encoder/decoder is trained to learn a mapping from discrete architecture space to a continuous embedding and back. Additionally, this embedding is jointly trained to regress accuracy and parameter count in order to incorporate information about the architecture's effectiveness on the dataset. During the compression phase, we first encode the network and then perform gradient descent in continuous space to optimize a compression objective function that maximizes accuracy and minimizes parameter count. The final continuous feature is then mapped to a discrete architecture using the decoder. 
We demonstrate the merits of this approach on visual recognition tasks such as CIFAR-10/100, FMNIST and SVHN and achieve a greater than 20x compression on CIFAR-10.", "keywords": "compression;architecture search", "primary_area": "", "supplementary_material": "", "author": "Anubhav Ashok", "authorids": "anubhava@alumni.cmu.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nashok2019architecture,\ntitle={Architecture Compression},\nauthor={Anubhav Ashok},\nyear={2019},\nurl={https://openreview.net/forum?id=BygGNnCqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BygGNnCqKQ", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;3;4", "wc_review": "1246;308;455", "wc_reply_reviewers": "0;0;67", "wc_reply_authors": "1242;360;416", "reply_reviewers": "0;0;1", "reply_authors": "2;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 669.6666666666666, 411.92421093637546 ], "wc_reply_reviewers_avg": [ 22.333333333333332, 31.584102892999123 ], "wc_reply_authors_avg": [ 672.6666666666666, 403.2280854415884 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "BygIV2CcKm", "title": "Learning to Augment Influential Data", "track": "main", "status": "Reject", "tldr": "", "abstract": "Data augmentation is a technique to reduce overfitting and to improve generalization by increasing the number of labeled data samples by performing label preserving transformations; however, it is currently conducted in a trial and error manner. A composition of predefined transformations, such as rotation, scaling and cropping, is performed on training samples, and its effect on performance over test samples can only be empirically evaluated and cannot be predicted. This paper considers an influence function which predicts how generalization is affected by a particular augmented training sample in terms of validation loss. The influence function provides an approximation of the change in validation loss without comparing the performance which includes and excludes the sample in the training process. A differentiable augmentation model that generalizes the conventional composition of predefined transformations is also proposed. The differentiable augmentation model and reformulation of the influence function allow the parameters of the augmented model to be directly updated by backpropagation to minimize the validation loss. The experimental results show that the proposed method provides better generalization over conventional data augmentation methods.", "keywords": "data augmentation;influence function;generative adversarial network", "primary_area": "", "supplementary_material": "", "author": "Donghoon Lee;Chang D. Yoo", "authorids": "iamdh@kaist.ac.kr;cd_yoo@kaist.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlee2019learning,\ntitle={Learning to Augment Influential Data},\nauthor={Donghoon Lee and Chang D. 
Yoo},\nyear={2019},\nurl={https://openreview.net/forum?id=BygIV2CcKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BygIV2CcKm", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "wc_review": "157;307;259", "wc_reply_reviewers": "0;36;0", "wc_reply_authors": "495;552;269", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 241.0, 62.545983084447556 ], "wc_reply_reviewers_avg": [ 12.0, 16.97056274847714 ], "wc_reply_authors_avg": [ 438.6666666666667, 122.20838305488249 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14361301963830347100&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BygMAiRqK7", "title": "Entropic GANs meet VAEs: A Statistical Approach to Compute Sample Likelihoods in GANs", "track": "main", "status": "Reject", "tldr": "A statistical approach to compute sample likelihoods in Generative Adversarial Networks", "abstract": "Building on the success of deep learning, two modern approaches to learn a probability model of the observed data are Generative Adversarial Networks (GANs) and Variational AutoEncoders (VAEs). VAEs consider an explicit probability model for the data and compute a generative distribution by maximizing a variational lower-bound on the log-likelihood function. GANs, however, compute a generative model by minimizing a distance between observed and generated probability distributions without considering an explicit model for the observed data. The lack of having explicit probability models in GANs prohibits computation of sample likelihoods in their frameworks and limits their use in statistical inference problems. In this work, we show that an optimal transport GAN with the entropy regularization can be viewed as a generative model that maximizes a lower-bound on average sample likelihoods, an approach that VAEs are based on. In particular, our proof constructs an explicit probability model for GANs that can be used to compute likelihood statistics within GAN's framework. Our numerical results on several datasets demonstrate consistent trends with the proposed theory. 
", "keywords": "GAN;VAE;likelihood estimation;statistical inference", "primary_area": "", "supplementary_material": "", "author": "Yogesh Balaji;Hamed Hasani;Rama Chellappa;Soheil Feizi", "authorids": "yogesh@cs.umd.edu;hassani@seas.upenn.edu;rama@umiacs.umd.edu;sfeizi@cs.umd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbalaji2019entropic,\ntitle={Entropic {GAN}s meet {VAE}s: A Statistical Approach to Compute Sample Likelihoods in {GAN}s},\nauthor={Yogesh Balaji and Hamed Hasani and Rama Chellappa and Soheil Feizi},\nyear={2019},\nurl={https://openreview.net/forum?id=BygMAiRqK7},\n}", "github": "[![github](/images/github_icon.svg) yogeshbalaji/EntropicGANs_meet_VAEs](https://github.com/yogeshbalaji/EntropicGANs_meet_VAEs)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BygMAiRqK7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;5", "wc_review": "139;215;353", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "215;626;603", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 235.66666666666666, 88.57890393441444 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 481.3333333333333, 188.56004054117324 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4502964466526434508&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "BygNqoR9tm", "title": "Sinkhorn AutoEncoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Optimal Transport offers an alternative to maximum likelihood for learning generative autoencoding models. We show how this principle dictates the minimization of the Wasserstein distance between the encoder aggregated posterior and the prior, plus a reconstruction error. We prove that in the non-parametric limit the autoencoder generates the data distribution if and only if the two distributions match exactly, and that the optimum can be obtained by deterministic autoencoders.\nWe then introduce the Sinkhorn AutoEncoder (SAE), which casts the problem into Optimal Transport on the latent space. The resulting Wasserstein distance is minimized by backpropagating through the Sinkhorn algorithm. \nSAE models the aggregated posterior as an implicit distribution and therefore does not need a reparameterization trick for gradients estimation. Moreover, it requires virtually no adaptation to different prior distributions. We demonstrate its flexibility by considering models with hyperspherical and Dirichlet priors, as well as a simple case of probabilistic programming. SAE matches or outperforms other autoencoding models in visual quality and FID scores. 
", "keywords": "generative models;autoencoders;optimal transport;sinkhorn algorithm", "primary_area": "", "supplementary_material": "", "author": "Giorgio Patrini;Marcello Carioni;Patrick Forr\u00e9;Samarth Bhargav;Max Welling;Rianne van den Berg;Tim Genewein;Frank Nielsen", "authorids": "patrinig@hotmail.com;marcello.carioni@uni-graz.at;patrickforre@gmail.com;samarth.bhargav@student.uva.nl;welling.max@gmail.com;riannevdberg@gmail.com;tim.genewein@de.bosch.com;nielsen@lix.polytechnique.fr", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\npatrini2019sinkhorn,\ntitle={Sinkhorn AutoEncoders},\nauthor={Giorgio Patrini and Marcello Carioni and Patrick Forr\u00e9 and Samarth Bhargav and Max Welling and Rianne van den Berg and Tim Genewein and Frank Nielsen},\nyear={2019},\nurl={https://openreview.net/forum?id=BygNqoR9tm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=BygNqoR9tm)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=BygNqoR9tm", "pdf_size": 0, "rating": "5;6;7;7", "confidence": "4;3;3;3", "wc_review": "470;92;183;361", "wc_reply_reviewers": "216;0;0;0", "wc_reply_authors": "644;127;0;0", "reply_reviewers": "1;0;0;0", "reply_authors": "1;1;0;0", "rating_avg": [ 6.25, 0.82915619758885 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "wc_review_avg": [ 276.5, 147.78785471073056 ], "wc_reply_reviewers_avg": [ 54.0, 93.53074360871938 ], "wc_reply_authors_avg": [ 192.75, 265.63826437469436 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.8703882797784891, "gs_citation": 126, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11512121849599908997&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14 }, { "id": "BygREjC9YQ", "title": "A unified theory of adaptive stochastic gradient descent as Bayesian filtering", "track": "main", "status": "Reject", "tldr": "We formulated SGD as a Bayesian filtering problem, and show that this gives rise to RMSprop, Adam, AdamW, NAG and other features of state-of-the-art adaptive methods", "abstract": "We formulate stochastic gradient descent (SGD) as a novel factorised Bayesian filtering problem, in which each parameter is inferred separately, conditioned on the corresopnding backpropagated gradient. Inference in this setting naturally gives rise to BRMSprop and BAdam: Bayesian variants of RMSprop and Adam. Remarkably, the Bayesian approach recovers many features of state-of-the-art adaptive SGD methods, including amongst others root-mean-square normalization, Nesterov acceleration and AdamW. As such, the Bayesian approach provides one explanation for the empirical effectiveness of state-of-the-art adaptive SGD algorithms. 
Empirically comparing BRMSprop and BAdam with naive RMSprop and Adam on MNIST, we find that Bayesian methods have the potential to considerably reduce test loss and classification error.", "keywords": "SGD;Bayesian;RMSprop;Adam", "primary_area": "", "supplementary_material": "", "author": "Laurence Aitchison", "authorids": "laurence.aitchison@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\naitchison2019a,\ntitle={A unified theory of adaptive stochastic gradient descent as Bayesian filtering},\nauthor={Laurence Aitchison},\nyear={2019},\nurl={https://openreview.net/forum?id=BygREjC9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BygREjC9YQ", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;4", "wc_review": "954;749;778", "wc_reply_reviewers": "1769;756;1117", "wc_reply_authors": "2626;1276;2004", "reply_reviewers": "5;2;3", "reply_authors": "6;3;5", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 827.0, 90.57961507241387 ], "wc_reply_reviewers_avg": [ 1214.0, 419.20480277146953 ], "wc_reply_authors_avg": [ 1968.6666666666667, 551.7012073778906 ], "reply_reviewers_avg": [ 3.3333333333333335, 1.247219128924647 ], "reply_authors_avg": [ 4.666666666666667, 1.247219128924647 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8556429872852614800&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BygRNn0qYX", "title": "P^2IR: Universal Deep Node Representation via Partial Permutation Invariant Set Functions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph node representation learning is a central problem in social network analysis, aiming to learn the vector representation for each node in a graph. The key problem is how to model the dependence of each node to its neighbor nodes since the neighborhood can uniquely characterize a graph. Most existing approaches rely on defining the specific neighborhood dependence as the computation mechanism of representations, which may exclude important subtle structures within the graph and dependence among neighbors. Instead, we propose a novel graph node embedding method (namely P^2IR) via developing a novel notion, namely partial permutation invariant set function} to learn those subtle structures. Our method can 1) learn an arbitrary form of the representation function from the neighborhood, without losing any potential dependence structures, 2) automatically decide the significance of neighbors at different distances, and 3) be applicable to both homogeneous and heterogeneous graph embedding, which may contain multiple types of nodes. Theoretical guarantee for the representation capability of our method has been proved for general homogeneous and heterogeneous graphs. 
Evaluation results on benchmark data sets show that the proposed P^2IR outperforms the state-of-the-art approaches in producing node vectors for classification tasks.", "keywords": "graph embedding;set function;representation learning", "primary_area": "", "supplementary_material": "", "author": "Shupeng Gui;Xiangliang Zhang;Shuang Qiu;Mingrui Wu;Jieping Ye;Ji Liu", "authorids": "sgui2@ur.rochester.edu;xiangliang.zhang@kaust.edu.sa;qiush@umich.edu;mingrui.wu@alibaba-inc.com;jieping@gmail.com;ji.liu.uwisc@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ngui2019pir,\ntitle={P^2{IR}: Universal Deep Node Representation via Partial Permutation Invariant Set Functions},\nauthor={Shupeng Gui and Xiangliang Zhang and Shuang Qiu and Mingrui Wu and Jieping Ye and Ji Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=BygRNn0qYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BygRNn0qYX", "pdf_size": 0, "rating": "4;5;5;7", "confidence": "4;5;3;4", "wc_review": "366;833;334;265", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 5.25, 1.0897247358851685 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "wc_review_avg": [ 449.5, 224.40198305719136 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9zKvZHrNlEwJ:scholar.google.com/&scioq=P%5E2IR:+Universal+Deep+Node+Representation+via+Partial+Permutation+Invariant+Set+Functions&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "The role of over-parametrization in generalization of neural networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/886", "id": "BygfghAcYX", "author_site": "Behnam Neyshabur, Zhiyuan Li, Srinadh Bhojanapalli, Yann LeCun, Nathan Srebro", "tldr": "We suggest a generalization bound that could partly explain the improvement in generalization with over-parametrization.", "abstract": "Despite existing work on ensuring generalization of neural networks in terms of scale sensitive complexity measures, such as norms, margin and sharpness, these complexity measures do not offer an explanation of why neural networks generalize better with over-parametrization. In this work we suggest a novel complexity measure based on unit-wise capacities resulting in a tighter generalization bound for two layer ReLU networks. Our capacity bound correlates with the behavior of test error with increasing network sizes (within the range reported in the experiments), and could partly explain the improvement in generalization with over-parametrization. We further present a matching lower bound for the Rademacher complexity that improves over previous capacity lower bounds for neural networks. 
", "keywords": "Generalization;Over-Parametrization;Neural Networks;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Behnam Neyshabur;Zhiyuan Li;Srinadh Bhojanapalli;Yann LeCun;Nathan Srebro", "authorids": "bneyshabur@gmail.com;zhiyuanli@cs.princeton.edu;srinadh@ttic.edu;yann@cs.nyu.edu;nati@ttic.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nneyshabur2018the,\ntitle={The role of over-parametrization in generalization of neural networks},\nauthor={Behnam Neyshabur and Zhiyuan Li and Srinadh Bhojanapalli and Yann LeCun and Nathan Srebro},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BygfghAcYX},\n}", "github": "[![github](/images/github_icon.svg) bneyshabur/over-parametrization](https://github.com/bneyshabur/over-parametrization)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;5;3", "wc_review": "518;819;488", "wc_reply_reviewers": "0;102;0", "wc_reply_authors": "119;234;46", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 608.3333333333334, 149.46645851903438 ], "wc_reply_reviewers_avg": [ 34.0, 48.08326112068523 ], "wc_reply_authors_avg": [ 133.0, 77.38647599333275 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=BygfghAcYX", "pdf": "https://openreview.net/pdf?id=BygfghAcYX", "email": ";;;;", "author_num": 5 }, { "title": "ImageNet-trained CNNs are biased towards texture; increasing shape bias improves accuracy and robustness", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/697", "id": "Bygh9j09KX", "author_site": "Robert Geirhos, Patricia Rubisch, Claudio Michaelis, Matthias Bethge, Felix Wichmann, Wieland Brendel", "tldr": "ImageNet-trained CNNs are biased towards object texture (instead of shape like humans). Overcoming this major difference between human and machine vision yields improved detection performance and previously unseen robustness to image distortions.", "abstract": "Convolutional Neural Networks (CNNs) are commonly thought to recognise objects by learning increasingly complex representations of object shapes. Some recent studies suggest a more important role of image textures. We here put these conflicting hypotheses to a quantitative test by evaluating CNNs and human observers on images with a texture-shape cue conflict. We show that ImageNet-trained CNNs are strongly biased towards recognising textures rather than shapes, which is in stark contrast to human behavioural evidence and reveals fundamentally different classification strategies. We then demonstrate that the same standard architecture (ResNet-50) that learns a texture-based representation on ImageNet is able to learn a shape-based representation instead when trained on 'Stylized-ImageNet', a stylized version of ImageNet. 
This provides a much better fit for human behavioural performance in our well-controlled psychophysical lab setting (nine experiments totalling 48,560 psychophysical trials across 97 observers) and comes with a number of unexpected emergent benefits such as improved object detection performance and previously unseen robustness towards a wide range of image distortions, highlighting advantages of a shape-based representation.", "keywords": "deep learning;psychophysics;representation learning;object recognition;robustness;neural networks;data augmentation", "primary_area": "", "supplementary_material": "", "author": "Robert Geirhos;Patricia Rubisch;Claudio Michaelis;Matthias Bethge;Felix A. Wichmann;Wieland Brendel", "authorids": "robert@geirhos.de;patricia@rubisch.net;claudio.michaelis@bethgelab.org;matthias.bethge@uni-tuebingen.de;felix.wichmann@uni-tuebingen.de;wieland.brendel@bethgelab.org", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ngeirhos2018imagenettrained,\ntitle={ImageNet-trained {CNN}s are biased towards texture; increasing shape bias improves accuracy and robustness.},\nauthor={Robert Geirhos and Patricia Rubisch and Claudio Michaelis and Matthias Bethge and Felix A. Wichmann and Wieland Brendel},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bygh9j09KX},\n}", "github": "[![github](/images/github_icon.svg) rgeirhos/Stylized-ImageNet](https://github.com/rgeirhos/Stylized-ImageNet) + [![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=Bygh9j09KX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;4", "wc_review": "456;760;329", "wc_reply_reviewers": "0;103;0", "wc_reply_authors": "1293;754;537", "reply_reviewers": "0;1;0", "reply_authors": "2;1;1", "rating_avg": [ 7.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 515.0, 180.8332565284015 ], "wc_reply_reviewers_avg": [ 34.333333333333336, 48.554665641476255 ], "wc_reply_authors_avg": [ 861.3333333333334, 317.8304929081258 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3448, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14190455085351957023&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=Bygh9j09KX", "pdf": "https://openreview.net/pdf?id=Bygh9j09KX", "email": ";;;;;", "author_num": 6 }, { "id": "ByghKiC5YX", "title": "Greedy Attack and Gumbel Attack: Generating Adversarial Examples for Discrete Data", "track": "main", "status": "Reject", "tldr": "We develop two methods for generating adversarial examples on discrete data under a probabilistic framework.", "abstract": "We present a probabilistic framework for studying adversarial attacks on discrete data. Based on this framework, we derive a perturbation-based method, Greedy Attack, and a scalable learning-based method, Gumbel Attack, that illustrate various tradeoffs in the design of attacks. 
We demonstrate the effectiveness of these methods using both quantitative metrics and human evaluation on various state-of-the-art models for text classification, including a word-based CNN, a character-based CNN and an LSTM. As an example of our results, we show that the accuracy of character-based convolutional networks drops to the level of random selection by modifying only five characters through Greedy Attack.", "keywords": "Adversarial Examples", "primary_area": "", "supplementary_material": "", "author": "Puyudi Yang;Jianbo Chen;Cho-Jui Hsieh;Jane-Ling Wang;Michael I. Jordan", "authorids": "pydyang@ucdavis.edu;jianbochen@berkeley.edu;chohsieh@ucdavis.edu;janelwang@ucdavis.edu;jordan@cs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyang2019greedy,\ntitle={Greedy Attack and Gumbel Attack: Generating Adversarial Examples for Discrete Data},\nauthor={Puyudi Yang and Jianbo Chen and Cho-Jui Hsieh and Jane-Ling Wang and Michael I. Jordan},\nyear={2019},\nurl={https://openreview.net/forum?id=ByghKiC5YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ByghKiC5YX", "pdf_size": 0, "rating": "3;6;7;8", "confidence": "4;4;4;2", "wc_review": "309;339;224;280", "wc_reply_reviewers": "192;222;0;0", "wc_reply_authors": "472;1487;65;267", "reply_reviewers": "1;2;0;0", "reply_authors": "2;3;1;1", "rating_avg": [ 6.0, 1.8708286933869707 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "wc_review_avg": [ 288.0, 42.432299018554254 ], "wc_reply_reviewers_avg": [ 103.5, 104.04205880315902 ], "wc_reply_authors_avg": [ 572.75, 547.1052800878456 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.75, 0.82915619758885 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.6172133998483676, "gs_citation": 135, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8196641774334393385&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "BygmRoA9YQ", "title": "Mixture of Pre-processing Experts Model for Noise Robust Deep Learning on Resource Constrained Platforms", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning on an edge device requires energy efficient operation due to ever diminishing power budget. Intentional low quality data during the data acquisition for longer battery life, and natural noise from the low cost sensor degrade the quality of target output which hinders adoption of deep learning on an edge device. To overcome these problems, we propose simple yet efficient mixture of pre-processing experts (MoPE) model to handle various image distortions including low resolution and noisy images. We also propose to use adversarially trained auto encoder as a pre-processing expert for the noisy images. We evaluate our proposed method for various machine learning tasks including object detection on MS-COCO 2014 dataset, multiple object tracking problem on MOT-Challenge dataset, and human activity recognition on UCF 101 dataset. Experimental results show that the proposed method achieves better detection, tracking and activity recognition accuracies under noise without sacrificing accuracies for the clean images. 
The overheads of our proposed MoPE are 0.67% and 0.17% in terms of memory and computation compared to the baseline object detection network.", "keywords": "noise robust;object detection", "primary_area": "", "supplementary_material": "", "author": "Taesik Na;Minah Lee;Burhan A. Mudassar;Priyabrata Saha;Jong Hwan Ko;Saibal Mukhopadhyay", "authorids": "taesik.na@gatech.edu;minah.lee@gatech.edu;burhan.mudassar@gatech.edu;priyabratasaha@gatech.edu;jonghwan.ko@gatech.edu;smukhopadhyay6@gatech.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nna2019mixture,\ntitle={Mixture of Pre-processing Experts Model for Noise Robust Deep Learning on Resource Constrained Platforms},\nauthor={Taesik Na and Minah Lee and Burhan A. Mudassar and Priyabrata Saha and Jong Hwan Ko and Saibal Mukhopadhyay},\nyear={2019},\nurl={https://openreview.net/forum?id=BygmRoA9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BygmRoA9YQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;5", "wc_review": "307;128;105", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "311;239;248", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 180.0, 90.29211851909704 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 266.0, 32.03123475609393 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8834070193087360180&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Diffusion Scattering Transforms on Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1130", "id": "BygqBiRcFQ", "author_site": "Fernando Gama, Alejandro Ribeiro, Joan Bruna", "tldr": "Stability of scattering transform representations of graph data to deformations of the underlying graph support.", "abstract": "Stability is a key aspect of data analysis. In many applications, the natural notion of stability is geometric, as illustrated for example in computer vision. Scattering transforms construct deep convolutional representations which are certified stable to input deformations. This stability to deformations can be interpreted as stability with respect to changes in the metric structure of the domain. \n\nIn this work, we show that scattering transforms can be generalized to non-Euclidean domains using diffusion wavelets, while preserving a notion of stability with respect to metric changes in the domain, measured with diffusion maps. The resulting representation is stable to metric perturbations of the domain while being able to capture ''high-frequency'' information, akin to the Euclidean Scattering. 
", "keywords": "graph neural networks;deep learning;stability;scattering transforms;convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Fernando Gama;Alejandro Ribeiro;Joan Bruna", "authorids": "fgama@seas.upenn.edu;aribeiro@seas.upenn.edu;bruna@cims.nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngama2018diffusion,\ntitle={Diffusion Scattering Transforms on Graphs},\nauthor={Fernando Gama and Alejandro Ribeiro and Joan Bruna},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BygqBiRcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;9", "confidence": "4;3;5", "wc_review": "441;593;178", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1333;1439;729", "reply_reviewers": "0;0;0", "reply_authors": "3;2;1", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 404.0, 171.43123013811302 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1167.0, 312.72138824625773 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.6546536707079772, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7856126226724225104&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=BygqBiRcFQ", "pdf": "https://openreview.net/pdf?id=BygqBiRcFQ", "email": ";;", "author_num": 3 }, { "id": "Bygre3R9Fm", "title": "DEFactor: Differentiable Edge Factorization-based Probabilistic Graph Generation", "track": "main", "status": "Reject", "tldr": "New scalable graph decoding scheme that allows to perform direct molecular graph conditional generation.", "abstract": "Generating novel molecules with optimal properties is a crucial step in many industries such as drug discovery. \nRecently, deep generative models have shown a promising way of performing de-novo molecular design. \nAlthough graph generative models are currently available they either have a graph size dependency in their number of parameters, limiting their use to only very small graphs or are formulated as a sequence of discrete actions needed to construct a graph, making the output graph non-differentiable w.r.t the model parameters, therefore preventing them to be used in scenarios such as conditional graph generation. In this work we propose a model for conditional graph generation that is computationally efficient and enables direct optimisation of the graph. 
We demonstrate favourable performance of our model on prototype-based molecular graph conditional generation tasks.", "keywords": "molecular graphs;conditional autoencoder;graph autoencoder", "primary_area": "", "supplementary_material": "", "author": "Rim Assouel;Mohamed Ahmed;Marwin Segler;Amir Saffari;Yoshua Bengio", "authorids": "rim.assouel@hotmail.fr;mohamed.ahmed@benevolent.ai;marwin.segler@benevolent.ai;amir.saffari@benevolent.ai;yoshua.bengio@mila.quebec", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nassouel2019defactor,\ntitle={{DEF}actor: Differentiable Edge Factorization-based Probabilistic Graph Generation},\nauthor={Rim Assouel and Mohamed Ahmed and Marwin Segler and Amir Saffari and Yoshua Bengio},\nyear={2019},\nurl={https://openreview.net/forum?id=Bygre3R9Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Bygre3R9Fm", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;3", "wc_review": "146;642;276", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "555;1110;822", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 354.6666666666667, 209.99259246194586 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 829.0, 226.63186007267382 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1824226348029557479&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "BygrtoC9Km", "title": "Meta-Learning with Individualized Feature Space for Few-Shot Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Meta-learning provides a promising learning framework to address few-shot classification tasks. In existing meta-learning methods, the meta-learner is designed to learn about model optimization, parameter initialization, or similarity metric. Differently, in this paper, we propose to learn how to create an individualized feature embedding specific to a given query image for better classifying, i.e., given a query image, a specific feature embedding tailored for its characteristics is created accordingly, leading to an individualized feature space in which the query image can be more accurately classified.\u00a0 Specifically, we introduce a kernel generator as meta-learner to learn to construct feature embedding for query images. The kernel generator acquires meta-knowledge of generating adequate convolutional kernels for different query images during training, which can generalize to unseen categories without fine-tuning. In two standard few-shot classification data sets, i.e. Omniglot, and \\emph{mini}ImageNet, our method shows highly competitive performance. 
", "keywords": "few-shot classification;meta-learning;individualized feature space", "primary_area": "", "supplementary_material": "", "author": "Chunrui Han;Shiguang Shan;Meina Kan;Shuzhe Wu;Xilin Chen", "authorids": "chunrui.han@vipl.ict.ac.cn;sgshan@ict.ac.cn;kanmeina@ict.ac.cn;shuzhe.wu@vipl.ict.ac.cn;xlchen@ict.ac.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhan2019metalearning,\ntitle={Meta-Learning with Individualized Feature Space for Few-Shot Classification},\nauthor={Chunrui Han and Shiguang Shan and Meina Kan and Shuzhe Wu and Xilin Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=BygrtoC9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BygrtoC9Km", "pdf_size": 0, "rating": "3;5;5", "confidence": "3;4;4", "wc_review": "355;489;622", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 488.6666666666667, 109.00254839018929 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4663138618806190017&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Capsule Graph Neural Network", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/932", "id": "Byl8BnRcYm", "author_site": "xinyi zhang, Lihui Chen", "tldr": "Inspired by CapsNet, we propose a novel architecture for graph embeddings on the basis of node features extracted from GNN.", "abstract": "The high-quality node embeddings learned from the Graph Neural Networks (GNNs) have been applied to a wide range of node-based applications and some of them have achieved state-of-the-art (SOTA) performance. However, when applying node embeddings learned from GNNs to generate graph embeddings, the scalar node representation may not suffice to preserve the node/graph properties efficiently, resulting in sub-optimal graph embeddings.\n\nInspired by the Capsule Neural Network (CapsNet), we propose the Capsule Graph Neural Network (CapsGNN), which adopts the concept of capsules to address the weakness in existing GNN-based graph embeddings algorithms. By extracting node features in the form of capsules, routing mechanism can be utilized to capture important information at the graph level. As a result, our model generates multiple embeddings for each graph to capture graph properties from different aspects. The attention module incorporated in CapsGNN is used to tackle graphs with various sizes which also enables the model to focus on critical parts of the graphs.\n\nOur extensive evaluations with 10 graph-structured datasets demonstrate that CapsGNN has a powerful mechanism that operates to capture macroscopic properties of the whole graph by data-driven. 
It outperforms other SOTA techniques on several graph classification tasks, by virtue of the new instrument.", "keywords": "CapsNet;Graph embedding;GNN", "primary_area": "", "supplementary_material": "", "author": "Zhang Xinyi;Lihui Chen", "authorids": "xinyi001@e.ntu.edu.sg;elhchen@ntu.edu.sg", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nxinyi2018capsule,\ntitle={Capsule Graph Neural Network},\nauthor={Zhang Xinyi and Lihui Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Byl8BnRcYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "wc_review": "461;528;204", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 397.6666666666667, 139.64797010896922 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 254, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4030715970262857024&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Byl8BnRcYm", "pdf": "https://openreview.net/pdf?id=Byl8BnRcYm", "email": ";", "author_num": 2 }, { "id": "Byl9bhA5F7", "title": "Found by NEMO: Unsupervised Object Detection from Negative Examples and Motion", "track": "main", "status": "Withdraw", "tldr": "Learning to detect objects without image labels from 3 minutes of video", "abstract": "This paper introduces NEMO, an approach to unsupervised object detection that uses motion---instead of image labels---as a cue to learn object detection. To discriminate between motion of the target object and other changes in the image, it relies on negative examples that show the scene without the object. The required data can be collected very easily by recording two short videos, a positive one showing the object in motion and a negative one showing the scene without the object. Without any additional form of pretraining or supervision and despite occlusions, distractions, camera motion, and adverse lighting, those videos are sufficient to learn object detectors that can be applied to new videos and even generalize to unseen scenes and camera angles. In a baseline comparison, unsupervised object detection outperforms off-the-shelf template matching and tracking approaches that are given an initial bounding box of the object. The learned object representations are also shown to be accurate enough to capture the relevant information from manipulation task demonstrations, which makes them applicable to learning from demonstration in robotics.
An example of object detection that was learned from 3 minutes of video can be found here: http://y2u.be/u_jyz9_ETz4", "keywords": "unsupervised learning;computer vision;object detection", "primary_area": "", "supplementary_material": "", "author": "Rico Jonschkowski", "authorids": "rjon@google.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Byl9bhA5F7", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "wc_review": "1196;366;795", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 785.6666666666666, 338.91034540453643 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11274628496748604640&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "BylBfnRqFm", "title": "CAML: Fast Context Adaptation via Meta-Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose CAML, a meta-learning method for fast adaptation that partitions the model parameters into two parts: context parameters that serve as additional input to the model and are adapted on individual tasks, and shared parameters that are meta-trained and shared across tasks. At test time, the context parameters are updated with one or several gradient steps on a task-specific loss that is backpropagated through the shared part of the network. Compared to approaches that adjust all parameters on a new task (e.g., MAML), our method can be scaled up to larger networks without overfitting on a single task, is easier to implement, and saves memory writes during training and network communication at test time for distributed machine learning systems. 
We show empirically that this approach outperforms MAML, is less sensitive to the task-specific learning rate, can capture meaningful task embeddings with the context parameters, and outperforms alternative partitionings of the parameter vectors.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Luisa M Zintgraf;Kyriacos Shiarlis;Vitaly Kurin;Katja Hofmann;Shimon Whiteson", "authorids": "lmzintgraf@gmail.com;kyriacos@latentlogic.com;vitaly.kurin@eng.ox.ac.uk;katja.hofmann@microsoft.com;shimon.whiteson@cs.ox.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzintgraf2019caml,\ntitle={{CAML}: Fast Context Adaptation via Meta-Learning},\nauthor={Luisa M Zintgraf and Kyriacos Shiarlis and Vitaly Kurin and Katja Hofmann and Shimon Whiteson},\nyear={2019},\nurl={https://openreview.net/forum?id=BylBfnRqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=BylBfnRqFm", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "5;2;2;4", "wc_review": "608;112;142;281", "wc_reply_reviewers": "89;0;0;0", "wc_reply_authors": "937;147;11;10", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "rating_avg": [ 5.5, 0.8660254037844386 ], "confidence_avg": [ 3.25, 1.299038105676658 ], "wc_review_avg": [ 285.75, 196.67279298367632 ], "wc_reply_reviewers_avg": [ 22.25, 38.53813046840752 ], "wc_reply_authors_avg": [ 276.25, 385.5329914546873 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.7777777777777777, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10412613781945813097&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "BylBns0qtX", "title": "On Learning Heteroscedastic Noise Models within Differentiable Bayes Filters", "track": "main", "status": "Reject", "tldr": "We evaluate learning heteroscedastic noise models within different Differentiable Bayes Filters", "abstract": "In many robotic applications, it is crucial to maintain a belief about the state of \na system, like the location of a robot or the pose of an object.\nThese state estimates serve as input for planning and decision making and \nprovide feedback during task execution. \nRecursive Bayesian Filtering algorithms address the state estimation problem,\nbut they require a model of the process dynamics and the sensory observations as well as \nnoise estimates that quantify the accuracy of these models. \nRecently, multiple works have demonstrated that the process and sensor models can be \nlearned by end-to-end training through differentiable versions of Recursive Filtering methods.\nHowever, even if the predictive models are known, finding suitable noise models \nremains challenging. Therefore, many practical applications rely on very simplistic noise \nmodels. \nOur hypothesis is that end-to-end training through differentiable Bayesian \nFilters enables us to learn more complex heteroscedastic noise models for\nthe system dynamics. We evaluate learning such models with different types of \nfiltering algorithms and on two different robotic tasks. 
Our experiments show that especially \nfor sampling-based filters like the Particle Filter, learning heteroscedastic noise \nmodels can drastically improve the tracking performance in comparison to using \nconstant noise models.", "keywords": "bayesian filtering;heteroscedastic noise;deep learning", "primary_area": "", "supplementary_material": "", "author": "Alina Kloss;Jeannette Bohg", "authorids": "alina.kloss@tuebingen.mpg.de;bohg@stanford.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkloss2019on,\ntitle={On Learning Heteroscedastic Noise Models within Differentiable Bayes Filters},\nauthor={Alina Kloss and Jeannette Bohg},\nyear={2019},\nurl={https://openreview.net/forum?id=BylBns0qtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=BylBns0qtX", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;5", "wc_review": "387;1143;326", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "296;1901;71", "reply_reviewers": "0;0;0", "reply_authors": "1;3;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 618.6666666666666, 371.59506036664277 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 756.0, 814.831270877597 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16618414963578231518&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Energy-Constrained Compression for Deep Neural Networks via Weighted Sparse Projection and Layer Input Masking", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/965", "id": "BylBr3C9K7", "author_site": "Haichuan Yang, Yuhao Zhu, Ji Liu", "tldr": "", "abstract": "Deep Neural Networks (DNNs) are increasingly deployed in highly energy-constrained environments such as autonomous drones and wearable devices while at the same time must operate in real-time. Therefore, reducing the energy consumption has become a major design consideration in DNN training. This paper proposes the first end-to-end DNN training framework that provides quantitative energy consumption guarantees via weighted sparse projection and input masking. The key idea is to formulate the DNN training as an optimization problem in which the energy budget imposes a previously unconsidered optimization constraint. We integrate the quantitative DNN energy estimation into the DNN training process to assist the constrained optimization. We prove that an approximate algorithm can be used to efficiently solve the optimization problem. 
Compared to the best prior energy-saving techniques, our framework trains DNNs that provide higher accuracies under same or lower energy budgets.", "keywords": "model compression;inference energy saving;deep neural network pruning", "primary_area": "", "supplementary_material": "", "author": "Haichuan Yang;Yuhao Zhu;Ji Liu", "authorids": "h.yang@rochester.edu;yzhu@rochester.edu;ji.liu.uwisc@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyang2018energyconstrained,\ntitle={Energy-Constrained Compression for Deep Neural Networks via Weighted Sparse Projection and Layer Input Masking},\nauthor={Haichuan Yang and Yuhao Zhu and Ji Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BylBr3C9K7},\n}", "github": "[![github](/images/github_icon.svg) hyang1990/model_based_energy_constrained_compression](https://github.com/hyang1990/model_based_energy_constrained_compression)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;4", "wc_review": "465;295;869", "wc_reply_reviewers": "19;88;12", "wc_reply_authors": "705;353;1884", "reply_reviewers": "1;1;1", "reply_authors": "2;1;3", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 543.0, 240.73775496723954 ], "wc_reply_reviewers_avg": [ 39.666666666666664, 34.29609631171195 ], "wc_reply_authors_avg": [ 980.6666666666666, 654.7184297254983 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6237094978821638350&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=BylBr3C9K7", "pdf": "https://openreview.net/pdf?id=BylBr3C9K7", "email": ";;", "author_num": 3 }, { "title": "Emerging Disentanglement in Auto-Encoder Based Unsupervised Image Content Transfer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/687", "id": "BylE1205Fm", "author_site": "Ori Press, Tomer Galanti, Sagie Benaim, Lior Wolf", "tldr": "An image to image translation method which adds to one image the content of another thereby creating a new image.", "abstract": "We study the problem of learning to map, in an unsupervised way, between domains $A$ and $B$, such that the samples $\\vb \\in B$ contain all the information that exists in samples $\\va\\in A$ and some additional information. For example, ignoring occlusions, $B$ can be people with glasses, $A$ people without, and the glasses, would be the added information. When mapping a sample $\\va$ from the first domain to the other domain, the missing information is replicated from an independent reference sample $\\vb\\in B$. Thus, in the above example, we can create, for every person without glasses a version with the glasses observed in any face image. \n\nOur solution employs a single two-pathway encoder and a single decoder for both domains. The common part of the two domains and the separate part are encoded as two vectors, and the separate part is fixed at zero for domain $A$. The loss terms are minimal and involve reconstruction losses for the two domains and a domain confusion term. 
Our analysis shows that under mild assumptions, this architecture, which is much simpler than the literature guided-translation methods, is enough to ensure disentanglement between the two domains. We present convincing results in a few visual domains, such as no-glasses to glasses, adding facial hair based on a reference image, etc.", "keywords": "Image-to-image Translation;Disentanglement;Autoencoders;Faces", "primary_area": "", "supplementary_material": "", "author": "Ori Press;Tomer Galanti;Sagie Benaim;Lior Wolf", "authorids": "theoripress@gmail.com;tomer22g@gmail.com;sagiebenaim@gmail.com;wolf@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\npress2018emerging,\ntitle={Emerging Disentanglement in Auto-Encoder Based Unsupervised Image Content Transfer},\nauthor={Ori Press and Tomer Galanti and Sagie Benaim and Lior Wolf},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BylE1205Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "pdf_size": 0, "rating": "6;6;6", "confidence": "2;1;3", "wc_review": "248;94;636", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "812;0;1066", "reply_reviewers": "0;0;0", "reply_authors": "1;0;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 2.0, 0.816496580927726 ], "wc_review_avg": [ 326.0, 228.04093199832934 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 626.0, 454.63245228059407 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1130493139270124070&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=BylE1205Fm", "pdf": "https://openreview.net/pdf?id=BylE1205Fm", "email": ";;;", "author_num": 4 }, { "title": "SGD Converges to Global Minimum in Deep Learning via Star-convex Path", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/882", "id": "BylIciRcYQ", "author_site": "Yi Zhou, Junjie Yang, Huishuai Zhang, Yingbin Liang, VAHID TAROKH", "tldr": "", "abstract": "Stochastic gradient descent (SGD) has been found to be surprisingly effective in training a variety of deep neural networks. However, there is still a lack of understanding on how and why SGD can train these complex networks towards a global minimum. In this study, we establish the convergence of SGD to a global minimum for nonconvex optimization problems that are commonly encountered in neural network training. Our argument exploits the following two important properties: 1) the training loss can achieve zero value (approximately), which has been widely observed in deep learning; 2) SGD follows a star-convex path, which is verified by various experiments in this paper. In such a context, our analysis shows that SGD, although has long been considered as a randomized algorithm, converges in an intrinsically deterministic manner to a global minimum. 
", "keywords": "SGD;deep learning;global minimum;convergence", "primary_area": "", "supplementary_material": "", "author": "Yi Zhou;Junjie Yang;Huishuai Zhang;Yingbin Liang;Vahid Tarokh", "authorids": "yi.zhou610@duke.edu;baymax@mail.ustc.edu.cn;huishuai.zhang@microsoft.com;liang.889@osu.edu;vahid.tarokh@duke.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzhou2018sgd,\ntitle={{SGD} Converges to Global Minimum in Deep Learning via Star-convex Path},\nauthor={Yi Zhou and Junjie Yang and Huishuai Zhang and Yingbin Liang and Vahid Tarokh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BylIciRcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;8", "confidence": "5;4;4", "wc_review": "744;417;149", "wc_reply_reviewers": "0;124;0", "wc_reply_authors": "829;631;356", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 436.6666666666667, 243.30547785768317 ], "wc_reply_reviewers_avg": [ 41.333333333333336, 58.45416057808793 ], "wc_reply_authors_avg": [ 605.3333333333334, 193.95245694642682 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12174745207770753941&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=BylIciRcYQ", "pdf": "https://openreview.net/pdf?id=BylIciRcYQ", "email": ";;;;", "author_num": 5 }, { "title": "Toward Understanding the Impact of Staleness in Distributed Machine Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/961", "id": "BylQV305YQ", "author_site": "Wei Dai, Yi Zhou, Nanqing Dong, Hao Zhang, Eric Xing", "tldr": "Empirical and theoretical study of the effects of staleness in non-synchronous execution on machine learning algorithms.", "abstract": "Most distributed machine learning (ML) systems store a copy of the model parameters locally on each machine to minimize network communication. In practice, in order to reduce synchronization waiting time, these copies of the model are not necessarily updated in lock-step, and can become stale. Despite much development in large-scale ML, the effect of staleness on the learning efficiency is inconclusive, mainly because it is challenging to control or monitor the staleness in complex distributed environments. In this work, we study the convergence behaviors of a wide array of ML models and algorithms under delayed updates. Our extensive experiments reveal the rich diversity of the effects of staleness on the convergence of ML algorithms and offer insights into seemingly contradictory reports in the literature. 
The empirical findings also inspire a new convergence analysis of SGD in non-convex optimization under staleness, matching the best-known convergence rate of O(1/\\sqrt{T}).", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wei Dai;Yi Zhou;Nanqing Dong;Hao Zhang;Eric Xing", "authorids": "daviddai@apple.com;zhou.1172@osu.edu;nanqing.dong@petuum.com;hao.zhang@petuum.com;eric.xing@petuum.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ndai2018toward,\ntitle={Toward Understanding the Impact of Staleness in Distributed Machine Learning},\nauthor={Wei Dai and Yi Zhou and Nanqing Dong and Hao Zhang and Eric Xing},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=BylQV305YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;7;9", "confidence": "5;5;4", "wc_review": "100;344;326", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "696;779;560", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 6.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 256.6666666666667, 111.02352103145631 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 678.3333333333334, 90.27488884266518 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8029550685469661, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14283113129888033216&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=BylQV305YQ", "pdf": "https://openreview.net/pdf?id=BylQV305YQ", "email": ";;;;", "author_num": 5 }, { "id": "BylRVjC9K7", "title": "Explaining Adversarial Examples with Knowledge Representation", "track": "main", "status": "Reject", "tldr": "Hybird storage and representation of learned knowledge may be a reason for adversarial examples.", "abstract": "Adversarial examples are modified samples that preserve original image structures but deviate classifiers. Researchers have put efforts into developing methods for generating adversarial examples and finding out origins. Past research put much attention on decision boundary changes caused by these methods. This paper, in contrast, discusses the origin of adversarial examples from a more underlying knowledge representation point of view. Human beings can learn and classify prototypes as well as transformations of objects. While neural networks store learned knowledge in a more hybrid way of combining all prototypes and transformations as a whole distribution. Hybrid storage may lead to lower distances between different classes so that small modifications can mislead the classifier. A one-step distribution imitation method is designed to imitate distribution of the nearest different class neighbor. Experiments show that simply by imitating distributions from a training set without any knowledge of the classifier can still lead to obvious impacts on classification results from deep networks. It also implies that adversarial examples can be in more forms than small perturbations. 
Potential ways of alleviating adversarial examples are discussed from the representation point of view. The first path is to change the encoding of data sent to the training step. Training data that are more prototypical can help seize more robust and accurate structural knowledge. The second path requires constructing learning frameworks with improved representations.", "keywords": "adversarial example;knowledge representation;distribution imitation", "primary_area": "", "supplementary_material": "", "author": "Xingyu Zhou;Tengyu Ma;Huahong Zhang", "authorids": "xingyu.zhou@vanderbilt.edu;tengyu.ma@vanderbilt.edu;huahong.zhang@vanderbilt.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhou2019explaining,\ntitle={Explaining Adversarial Examples with Knowledge Representation},\nauthor={Xingyu Zhou and Tengyu Ma and Huahong Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=BylRVjC9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BylRVjC9K7", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;2;4", "wc_review": "881;48;450", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 459.6666666666667, 340.13951385995847 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6S2sf5GnA5oJ:scholar.google.com/&scioq=Explaining+Adversarial+Examples+with+Knowledge+Representation&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BylTHoR5Km", "title": "Isolating effects of age with fair representation learning when assessing dementia", "track": "main", "status": "Withdraw", "tldr": "Show that age confounds cognitive impairment detection + solve with fair representation learning + propose metrics and models.", "abstract": "One of the most prevalent symptoms among the elderly population, dementia, can be detected by classifiers trained on linguistic features extracted from narrative transcripts. However, these linguistic features are impacted in a similar but different fashion by the normal aging process. Aging is therefore a confounding factor, whose effects have been hard for machine learning classifiers to isolate. \n\nIn this paper, we show that deep neural network (DNN) classifiers can infer ages from linguistic features, which is an entanglement that could lead to unfairness across age groups. We show this problem is caused by undesired activations of v-structures in causality diagrams, and it could be addressed with fair representation learning. We build neural network classifiers that learn low-dimensional representations reflecting the impacts of dementia yet discarding the effects of age. To evaluate these classifiers, we specify a model-agnostic score $\\Delta_{eo}^{(N)}$ measuring how classifier results are disentangled from age. 
Our best models outperform baseline neural network classifiers in disentanglement, while compromising accuracy by as little as 2.56\\% and 2.25\\% on DementiaBank and the Famous People dataset respectively. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zining Zhu;Jekaterina Novikova;Frank Rudzicz", "authorids": "zining.zhu@mail.utoronto.ca;jekaterina@winterlightlabs.com;frank@spoclab.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BylTHoR5Km", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;3;4", "wc_review": "252;252;504", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "205;302;208", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 336.0, 118.79393923933998 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 238.33333333333334, 45.035788238047104 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11011559812979734408&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Byl_ciRcY7", "title": "ON BREIMAN\u2019S DILEMMA IN NEURAL NETWORKS: SUCCESS AND FAILURE OF NORMALIZED MARGINS", "track": "main", "status": "Reject", "tldr": "Bregman's dilemma is shown in deep learning that improvement of margins of over-parameterized models may result in overfitting, and dynamics of normalized margin distributions are proposed to predict generalization error and identify such a dilemma. ", "abstract": "A belief persists long in machine learning that enlargement of margins over training data accounts for the resistance of models to overfitting by increasing the robustness. Yet Breiman shows a dilemma (Breiman, 1999) that a uniform improvement on margin distribution \\emph{does not} necessarily reduces generalization error. In this paper, we revisit Breiman's dilemma in deep neural networks with recently proposed normalized margins using Lipschitz constant bound by spectral norm products. With both simplified theory and extensive experiments, Breiman's dilemma is shown to rely on dynamics of normalized margin distributions, that reflects the trade-off between model expression power and data complexity. When the complexity of data is comparable to the model expression power in the sense that training and test data share similar phase transitions in normalized margin dynamics, two efficient ways are derived via classic margin-based generalization bounds to successfully predict the trend of generalization error. On the other hand, over-expressed models that exhibit uniform improvements on training normalized margins may lose such a prediction power and fail to prevent the overfitting. 
\n", "keywords": "Bregman's Dilemma;Generalization Error;Margin;Spectral normalization", "primary_area": "", "supplementary_material": "", "author": "Yifei HUANG;Yuan YAO;Weizhi ZHU", "authorids": "yhuangcc@ust.hk;yuany@ust.hk;wzhuai@connect.ust.hk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhuang2019on,\ntitle={{ON} {BREIMAN}\u2019S {DILEMMA} {IN} {NEURAL} {NETWORKS}: {SUCCESS} {AND} {FAILURE} {OF} {NORMALIZED} {MARGINS}},\nauthor={Yifei HUANG and Yuan YAO and Weizhi ZHU},\nyear={2019},\nurl={https://openreview.net/forum?id=Byl_ciRcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Byl_ciRcY7", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "wc_review": "228;213;325", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "518;183;264", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 255.33333333333334, 49.64093293061909 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 321.6666666666667, 142.712609424987 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9ggfdRsd2jkJ:scholar.google.com/&scioq=ON+BREIMAN%E2%80%99S+DILEMMA+IN+NEURAL+NETWORKS:+SUCCESS+AND+FAILURE+OF+NORMALIZED+MARGINS&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BylahsR9tX", "title": "Low-Rank Matrix Factorization of LSTM as Effective Model Compression", "track": "main", "status": "Withdraw", "tldr": "We propose simple, but effective, low-rank matrix factorization (MF) algorithms to speed up in running time, save memory, and improve the performance of LSTMs.", "abstract": "Large-scale Long Short-Term Memory (LSTM) cells are often the building blocks of many state-of-the-art algorithms for tasks in Natural Language Processing (NLP). However, LSTMs are known to be computationally inefficient because the memory capacity of the models depends on the number of parameters, and the inherent recurrence that models the temporal dependency is not parallelizable. In this paper, we propose simple, but effective, low-rank matrix factorization (MF) algorithms to compress network parameters and significantly speed up LSTMs with almost no loss of performance (and sometimes even gain). To show the effectiveness of our method across different tasks, we examine two settings: 1) compressing core LSTM layers in Language Models, 2) compressing biLSTM layers of ELMo~\\citep{ELMo} and evaluate in three downstream NLP tasks (Sentiment Analysis, Textual Entailment, and Question Answering). The latter is particularly interesting as embeddings from large pre-trained biLSTM Language Models are often used as contextual word representations. 
Finally, we discover that matrix factorization performs better in general, additive recurrence is often more important than multiplicative recurrence, and we identify an interesting correlation between matrix norms and compression performance.\n\n", "keywords": "NLP;LSTM;Compression;Low Rank;Norm Analysis", "primary_area": "", "supplementary_material": "", "author": "Genta Indra Winata;Andrea Madotto;Jamin Shin;Elham J. Barezi", "authorids": "giwinata@connect.ust.hk;amadotto@connect.ust.hk;jay.shin@connect.ust.hk;ejs@connect.ust.hk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=BylahsR9tX", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;2;4", "wc_review": "364;186;145", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "262;193;266", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 231.66666666666666, 95.05904597786694 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 240.33333333333334, 33.50953429829918 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17042763952228949125&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "BylctiCctX", "title": "Guiding Physical Intuition with Neural Stethoscopes", "track": "main", "status": "Reject", "tldr": "Combining auxiliary and adversarial training to interrogate and help physical understanding.", "abstract": "Model interpretability and systematic, targeted model adaptation present central challenges in deep learning. In the domain of intuitive physics, we study the task of visually predicting stability of block towers with the goal of understanding and influencing the model's reasoning. Our contributions are two-fold. Firstly, we introduce neural stethoscopes as a framework for quantifying the degree of importance of specific factors of influence in deep networks as well as for actively promoting and suppressing information as appropriate. In doing so, we unify concepts from multitask learning as well as training with auxiliary and adversarial losses. Secondly, we deploy the stethoscope framework to provide an in-depth analysis of a state-of-the-art deep neural network for stability prediction, specifically examining its physical reasoning. We show that the baseline model is susceptible to being misled by incorrect visual cues. This leads to a performance breakdown to the level of random guessing when training on scenarios where visual cues are inversely correlated with stability. Using stethoscopes to promote meaningful feature extraction increases performance from 51% to 90% prediction accuracy. Conversely, training on an easy dataset where visual cues are positively correlated with stability, the baseline model learns a bias leading to poor performance on a harder dataset. 
Using an adversarial stethoscope, the network is successfully de-biased, leading to a performance increase from 66% to 88%.", "keywords": "Deep Learning;Intuitive Physics;Stability Prediction;Adversarial Training;Auxiliary Training;Multi-Task Learning", "primary_area": "", "supplementary_material": "", "author": "Fabian Fuchs;Oliver Groth;Adam Kosiorek;Alex Bewley;Markus Wulfmeier;Andrea Vedaldi;Ingmar Posner", "authorids": "fabian@robots.ox.ac.uk;ogroth@robots.ox.ac.uk;adamk@robots.ox.ac.uk;alex.bewley@gmail.com;m.wulfmeier@gmail.com;vedaldi@robots.ox.ac.uk;ingmar@robots.ox.ac.uk", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nfuchs2019guiding,\ntitle={Guiding Physical Intuition with Neural Stethoscopes},\nauthor={Fabian Fuchs and Oliver Groth and Adam Kosiorek and Alex Bewley and Markus Wulfmeier and Andrea Vedaldi and Ingmar Posner},\nyear={2019},\nurl={https://openreview.net/forum?id=BylctiCctX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=BylctiCctX", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;4;3", "wc_review": "161;193;238", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "235;282;129", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 197.33333333333334, 31.584102892999123 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 215.33333333333334, 63.99131885567674 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.18898223650461363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mcsMQL5NkIkJ:scholar.google.com/&scioq=Guiding+Physical+Intuition+with+Neural+Stethoscopes&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Transfer Learning for Sequences via Learning to Collocate", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/735", "id": "ByldlhAqYQ", "author_site": "Wanyun Cui, Guangyu Zheng, Zhiqiang Shen, Sihang Jiang, Wei Wang", "tldr": "Transfer learning for sequence via learning to align cell-level information across domains.", "abstract": "Transfer learning aims to solve the data sparsity for a specific domain by applying information of another domain. Given a sequence (e.g. a natural language sentence), the transfer learning, usually enabled by recurrent neural network (RNN), represent the sequential information transfer. RNN uses a chain of repeating cells to model the sequence data. However, previous studies of neural network based transfer learning simply transfer the information across the whole layers, which are unfeasible for seq2seq and sequence labeling. Meanwhile, such layer-wise transfer learning mechanisms also lose the fine-grained cell-level information from the source domain.\n\nIn this paper, we proposed the aligned recurrent transfer, ART, to achieve cell-level information transfer. ART is in a recurrent manner that different cells share the same parameters. Besides transferring the corresponding information at the same position, ART transfers information from all collocated words in the source domain. 
This strategy enables ART to capture word collocations across domains in a more flexible way. We conducted extensive experiments on both sequence labeling tasks (POS tagging, NER) and sentence classification (sentiment analysis). ART outperforms the state of the art in all experiments.\n", "keywords": "transfer learning;recurrent neural network;attention;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Wanyun Cui;Guangyu Zheng;Zhiqiang Shen;Sihang Jiang;Wei Wang", "authorids": "cui.wanyun@sufe.edu.cn;simonzgy@outlook.com;shen54@illinois.edu;tedjiangfdu@gmail.com;weiwang1@fudan.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ncui2018transfer,\ntitle={Transfer Learning for Sequences via Learning to Collocate},\nauthor={Wanyun Cui and Guangyu Zheng and Zhiqiang Shen and Sihang Jiang and Wei Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByldlhAqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;3", "wc_review": "337;607;505", "wc_reply_reviewers": "0;138;0", "wc_reply_authors": "682;1102;592", "reply_reviewers": "0;1;0", "reply_authors": "1;3;3", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 483.0, 111.31936040060597 ], "wc_reply_reviewers_avg": [ 46.0, 65.05382386916237 ], "wc_reply_authors_avg": [ 792.0, 222.26110770892868 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9924239763188031121&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=ByldlhAqYQ", "pdf": "https://openreview.net/pdf?id=ByldlhAqYQ", "email": ";;;;", "author_num": 5 }, { "id": "Byldr3RqKX", "title": "Tinkering with black boxes: counterfactuals uncover modularity in generative models", "track": "main", "status": "Reject", "tldr": "We investigate the modularity of deep generative models.", "abstract": "Deep generative models such as Generative Adversarial Networks (GANs) and\nVariational Auto-Encoders (VAEs) are important tools to capture and investigate\nthe properties of complex empirical data. However, the complexity of their inner\nelements makes their functioning challenging to assess and modify. In this\nrespect, these architectures behave as black box models. In order to better\nunderstand the function of such networks, we analyze their modularity based on\nthe counterfactual manipulation of their internal variables.
Our experiments on the\ngeneration of human faces with VAEs and GANs support that modularity between\nactivation maps distributed over channels of generator architectures is achieved\nto some degree, can be used to better understand how these systems operate, and allows meaningful transformations and edits of the content of generated images without further training.", "keywords": "generative models;causality;disentangled representations", "primary_area": "", "supplementary_material": "", "author": "Michel Besserve;Remy Sun;Bernhard Schoelkopf", "authorids": "michel.besserve@tuebingen.mpg.de;remy.sun@ens-rennes.fr;bs@tuebingen.mpg.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbesserve2019tinkering,\ntitle={Tinkering with black boxes: counterfactuals uncover modularity in generative models},\nauthor={Michel Besserve and Remy Sun and Bernhard Schoelkopf},\nyear={2019},\nurl={https://openreview.net/forum?id=Byldr3RqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Byldr3RqKX", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;3", "wc_review": "260;434;106", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "153;491;223", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 266.6666666666667, 133.98839087357118 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 289.0, 145.66628527791414 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RMIHRkL5eSMJ:scholar.google.com/&scioq=Tinkering+with+black+boxes:+counterfactuals+uncover+modularity+in+generative+models&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Learning Procedural Abstractions and Evaluating Discrete Latent Temporal Structure", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1057", "id": "ByleB2CcKm", "author_site": "Karan Goel, Emma Brunskill", "tldr": "", "abstract": "Clustering methods and latent variable models are often used as tools for pattern mining and discovery of latent structure in time-series data. In this work, we consider the problem of learning procedural abstractions from possibly high-dimensional observational sequences, such as video demonstrations. Given a dataset of time-series, the goal is to identify the latent sequence of steps common to them and label each time-series with the temporal extent of these procedural steps. We introduce a hierarchical Bayesian model called Prism that models the realization of a common procedure across multiple time-series, and can recover procedural abstractions with supervision. We also bring to light two characteristics ignored by traditional evaluation criteria when evaluating latent temporal labelings (temporal clusterings) -- segment structure, and repeated structure -- and develop new metrics tailored to their evaluation. We demonstrate that our metrics improve interpretability and ease of analysis for evaluation on benchmark time-series datasets.
Results on benchmark and video datasets indicate that Prism outperforms standard sequence models as well as state-of-the-art techniques in identifying procedural abstractions.", "keywords": "learning procedural abstractions;latent variable modeling;evaluation criteria", "primary_area": "", "supplementary_material": "", "author": "Karan Goel;Emma Brunskill", "authorids": "kgoel93@gmail.com;ebrun@cs.stanford.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ngoel2018learning,\ntitle={Learning Procedural Abstractions and Evaluating Discrete Latent Temporal Structure},\nauthor={Karan Goel and Emma Brunskill},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByleB2CcKm},\n}", "github": "[![github](/images/github_icon.svg) StanfordAI4HI/ICLR2019_evaluating_discrete_temporal_structure](https://github.com/StanfordAI4HI/ICLR2019_evaluating_discrete_temporal_structure)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "2;3;3", "wc_review": "950;234;495", "wc_reply_reviewers": "196;0;0", "wc_reply_authors": "1446;478;827", "reply_reviewers": "1;0;0", "reply_authors": "3;1;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 559.6666666666666, 295.8607029592737 ], "wc_reply_reviewers_avg": [ 65.33333333333333, 92.39528607504222 ], "wc_reply_authors_avg": [ 917.0, 400.275738293825 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11760620653931209024&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=ByleB2CcKm", "pdf": "https://openreview.net/pdf?id=ByleB2CcKm", "email": ";", "author_num": 2 }, { "id": "Bylj6oC5K7", "title": "Logit Regularization Methods for Adversarial Robustness", "track": "main", "status": "Withdraw", "tldr": "Logit regularization methods help explain and improve state of the art adversarial defenses", "abstract": "While great progress has been made at making neural networks effective across a wide range of tasks, many are surprisingly vulnerable to small, carefully chosen perturbations of their input, known as adversarial examples. In this paper, we advocate for and experimentally investigate the use of logit regularization techniques as an adversarial defense, which can be used in conjunction with other methods for creating adversarial robustness at little to no cost. We demonstrate that much of the effectiveness of one recent adversarial defense mechanism can be attributed to logit regularization and show how to improve its defense against both white-box and black-box attacks, in the process creating stronger black-box attacks against PGD-based models.\n", "keywords": "adversarial", "primary_area": "", "supplementary_material": "", "author": "Cecilia Summers;Michael J. 
Dinneen", "authorids": "ceciliasummers07@gmail.com;mjd@cs.auckland.ac.nz", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Bylj6oC5K7", "pdf_size": 0, "rating": "2;3;5", "confidence": "5;5;5", "wc_review": "359;766;386", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 1.247219128924647 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 503.6666666666667, 185.82488770046086 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CF2xw3bR_U4J:scholar.google.com/&scioq=Logit+Regularization+Methods+for+Adversarial+Robustness&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "BylkG20qYm", "title": "On Meaning-Preserving Adversarial Perturbations for Sequence-to-Sequence Models", "track": "main", "status": "Reject", "tldr": "How you should evaluate adversarial attacks on seq2seq", "abstract": "Adversarial examples have been shown to be an effective way of assessing the robustness of neural sequence-to-sequence (seq2seq) models, by applying perturbations to the input of a model leading to large degradation in performance. However, these perturbations are only indicative of a weakness in the model if they do not change the semantics of the input in a way that would change the expected output. Using the example of machine translation (MT), we propose a new evaluation framework for adversarial attacks on seq2seq models taking meaning preservation into account and demonstrate that existing methods may not preserve meaning in general. Based on these findings, we propose new constraints for attacks on word-based MT systems and show, via human and automatic evaluation, that they produce more semantically similar adversarial inputs. 
Furthermore, we show that performing adversarial training with meaning-preserving attacks is beneficial to the model in terms of adversarial robustness without hurting test performance.", "keywords": "Sequence-to-sequence;adversarial attacks;evaluation;meaning preservation;machine translation", "primary_area": "", "supplementary_material": "", "author": "Paul Michel;Graham Neubig;Xian Li;Juan Miguel Pino", "authorids": "pmichel1@cs.cmu.edu;gneubig@cs.cmu.edu;xianl@fb.com;juancarabina@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmichel2019on,\ntitle={On Meaning-Preserving Adversarial Perturbations for Sequence-to-Sequence Models},\nauthor={Paul Michel and Graham Neubig and Xian Li and Juan Miguel Pino},\nyear={2019},\nurl={https://openreview.net/forum?id=BylkG20qYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=BylkG20qYm", "pdf_size": 0, "rating": "3;4;4;6", "confidence": "4;4;4;3", "wc_review": "855;320;351;342", "wc_reply_reviewers": "710;116;0;86", "wc_reply_authors": "1698;774;0;231", "reply_reviewers": "2;1;0;1", "reply_authors": "3;2;0;2", "rating_avg": [ 4.25, 1.0897247358851685 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "wc_review_avg": [ 467.0, 224.29556393294988 ], "wc_reply_reviewers_avg": [ 228.0, 281.52086956387444 ], "wc_reply_authors_avg": [ 675.75, 653.6606057427662 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.75, 1.0897247358851685 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9271726499455306, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GFIckgJ-m60J:scholar.google.com/&scioq=On+Meaning-Preserving+Adversarial+Perturbations+for+Sequence-to-Sequence+Models&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Unsupervised Speech Recognition via Segmental Empirical Output Distribution Matching", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/641", "id": "Bylmkh05KX", "author_site": "Chih-Kuan Yeh, Jianshu Chen, Chengzhu Yu, Dong Yu", "tldr": "", "abstract": "We consider the problem of training speech recognition systems without using any labeled data, under the assumption that the learner can only access the input utterances and a phoneme language model estimated from a non-overlapping corpus. We propose a fully unsupervised learning algorithm that alternates between solving two sub-problems: (i) learning a phoneme classifier for a given set of phoneme segmentation boundaries, and (ii) refining the phoneme boundaries based on a given classifier. To solve the first sub-problem, we introduce a novel unsupervised cost function named Segmental Empirical Output Distribution Matching, which generalizes the work in (Liu et al., 2017) to segmental structures. For the second sub-problem, we develop an approximate MAP approach to refining the boundaries obtained from Wang et al. (2017). Experimental results on the TIMIT dataset demonstrate the success of this fully unsupervised phoneme recognition system, which achieves a phone error rate (PER) of 41.6%. Although it is still far away from the state-of-the-art supervised systems, we show that with oracle boundaries and matching language model, the PER could be improved to 32.5%. 
This performance approaches the supervised system of the same model architecture, demonstrating the great potential of the proposed method.", "keywords": "Unsupervised speech recognition;unsupervised learning;phoneme classification", "primary_area": "", "supplementary_material": "", "author": "Chih-Kuan Yeh;Jianshu Chen;Chengzhu Yu;Dong Yu", "authorids": "cjyeh@cs.cmu.edu;chenjianshu@gmail.com;czyu@tencent.com;dyu@tencent.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nyeh2018unsupervised,\ntitle={Unsupervised Speech Recognition via Segmental Empirical Output Distribution Matching},\nauthor={Chih-Kuan Yeh and Jianshu Chen and Chengzhu Yu and Dong Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bylmkh05KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;4", "wc_review": "590;482;545", "wc_reply_reviewers": "72;0;0", "wc_reply_authors": "480;351;680", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 539.0, 44.294469180700204 ], "wc_reply_reviewers_avg": [ 24.0, 33.94112549695428 ], "wc_reply_authors_avg": [ 503.6666666666667, 135.352215431526 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4232653024484173716&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Bylmkh05KX", "pdf": "https://openreview.net/pdf?id=Bylmkh05KX", "email": ";;;", "author_num": 4 }, { "title": "Adversarial Attacks on Graph Neural Networks via Meta Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/826", "id": "Bylnx209YX", "author_site": "Daniel Z\u00fcgner, Stephan G\u00fcnnemann", "tldr": "We use meta-gradients to attack the training procedure of deep neural networks for graphs.", "abstract": "Deep learning models for graphs have advanced the state of the art on many tasks. Despite their recent success, little is known about their robustness. We investigate training time attacks on graph neural networks for node classification that perturb the discrete graph structure. Our core principle is to use meta-gradients to solve the bilevel problem underlying training-time attacks, essentially treating the graph as a hyperparameter to optimize. Our experiments show that small graph perturbations consistently lead to a strong decrease in performance for graph convolutional networks, and even transfer to unsupervised embeddings. Remarkably, the perturbations created by our algorithm can misguide the graph neural networks such that they perform worse than a simple baseline that ignores all relational information. 
Our attacks do not assume any knowledge about or access to the target classifiers.", "keywords": "graph mining;adversarial attacks;meta learning;graph neural networks;node classification", "primary_area": "", "supplementary_material": "", "author": "Daniel Z\u00fcgner;Stephan G\u00fcnnemann", "authorids": "zuegnerd@in.tum.de;guennemann@in.tum.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nz\u00fcgner2018adversarial,\ntitle={Adversarial Attacks on Graph Neural Networks via Meta Learning},\nauthor={Daniel Z\u00fcgner and Stephan G\u00fcnnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Bylnx209YX},\n}", "github": "[![github](/images/github_icon.svg) danielzuegner/gnn-meta-attack](https://github.com/danielzuegner/gnn-meta-attack)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "208;398;197", "wc_reply_reviewers": "0;0;14", "wc_reply_authors": "108;620;226", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 267.6666666666667, 92.26893060806306 ], "wc_reply_reviewers_avg": [ 4.666666666666667, 6.599663291074443 ], "wc_reply_authors_avg": [ 318.0, 218.9124634795074 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=Bylnx209YX", "pdf": "https://openreview.net/pdf?id=Bylnx209YX", "email": ";", "author_num": 2 }, { "title": "Maximal Divergence Sequential Autoencoder for Binary Software Vulnerability Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1135", "id": "ByloIiCqYQ", "author_site": "Tue Le, Tuan Nguyen, Trung Le, Dinh Phung, Paul Montague, Olivier Vel, Lizhen Qu", "tldr": "We propose a novel method named Maximal Divergence Sequential Auto-Encoder that leverages Variational AutoEncoder representation for binary code vulnerability detection.", "abstract": "Due to the sharp increase in the severity of the threat imposed by software vulnerabilities, the detection of vulnerabilities in binary code has become an important concern in the software industry, such as the embedded systems industry, and in the field of computer security. However, most of the work in binary code vulnerability detection has relied on handcrafted features which are manually chosen by a select few, knowledgeable domain experts. In this paper, we attempt to alleviate this severe binary vulnerability detection bottleneck by leveraging recent advances in deep learning representations and propose the Maximal Divergence Sequential Auto-Encoder. In particular, latent codes representing vulnerable and non-vulnerable binaries are encouraged to be maximally divergent, while still being able to maintain crucial information from the original binaries. 
We conducted extensive experiments to compare and contrast our proposed methods with the baselines, and the results show that our proposed methods outperform the baselines in all performance measures of interest.", "keywords": "Vulnerabilities Detection;Sequential Auto-Encoder;Separable Representation", "primary_area": "", "supplementary_material": "", "author": "Tue Le;Tuan Nguyen;Trung Le;Dinh Phung;Paul Montague;Olivier De Vel;Lizhen Qu", "authorids": "tue.le.ict@jvn.edu.vn;nguyenvutuan1995@gmail.com;trunglm@monash.edu;dinh.phung@monash.edu;paul.montague@dst.defence.gov.au;olivier.devel@dst.defence.gov.au;lizhen.qu@data61.csiro.au", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nle2018maximal,\ntitle={Maximal Divergence Sequential Autoencoder for Binary Software Vulnerability Detection},\nauthor={Tue Le and Tuan Nguyen and Trung Le and Dinh Phung and Paul Montague and Olivier De Vel and Lizhen Qu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByloIiCqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6;7", "confidence": "2;3;4;2", "wc_review": "568;198;280;562", "wc_reply_reviewers": "0;0;16;63", "wc_reply_authors": "789;706;768;435", "reply_reviewers": "0;0;1;1", "reply_authors": "1;1;2;2", "rating_avg": [ 6.25, 0.4330127018922193 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "wc_review_avg": [ 402.0, 165.57173671855955 ], "wc_reply_reviewers_avg": [ 19.75, 25.810608284191986 ], "wc_reply_authors_avg": [ 674.5, 141.602436419717 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.5222329678670935, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14813024667125945030&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=ByloIiCqYQ", "pdf": "https://openreview.net/pdf?id=ByloIiCqYQ", "email": ";;;;;;", "author_num": 7 }, { "title": "Neural Program Repair by Jointly Learning to Localize and Repair", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/869", "id": "ByloJ20qtm", "author_site": "Marko Vasic, Aditya Kanade, Petros Maniatis, David Bieber, Rishabh Singh", "tldr": "Multi-headed Pointer Networks for jointly learning to localize and repair Variable Misuse bugs", "abstract": "Due to its potential to improve programmer productivity and software quality, automated program repair has been an active topic of research. Newer techniques harness neural networks to learn directly from examples of buggy programs and their fixes. In this work, we consider a recently identified class of bugs called variable-misuse bugs. The state-of-the-art solution for variable misuse enumerates potential fixes for all possible bug locations in a program, before selecting the best prediction. We show that it is beneficial to train a model that jointly and directly localizes and repairs variable-misuse bugs. We present multi-headed pointer networks for this purpose, with one head each for localization and repair. 
The experimental results show that the joint model significantly outperforms an enumerative solution that uses a pointer based model for repair alone.", "keywords": "neural program repair;neural program embeddings;pointer networks", "primary_area": "", "supplementary_material": "", "author": "Marko Vasic;Aditya Kanade;Petros Maniatis;David Bieber;Rishabh Singh", "authorids": "vasic@utexas.edu;akanade@google.com;maniatis@google.com;dbieber@google.com;rising@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nvasic2018neural,\ntitle={Neural Program Repair by Jointly Learning to Localize and Repair},\nauthor={Marko Vasic and Aditya Kanade and Petros Maniatis and David Bieber and Rishabh singh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByloJ20qtm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=ByloJ20qtm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;5;4", "wc_review": "275;582;410", "wc_reply_reviewers": "316;193;0", "wc_reply_authors": "1567;627;227", "reply_reviewers": "2;1;0", "reply_authors": "4;2;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 422.3333333333333, 125.63527459365153 ], "wc_reply_reviewers_avg": [ 169.66666666666666, 130.05725235022032 ], "wc_reply_authors_avg": [ 807.0, 561.6641938620146 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 167, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2606729175407927305&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=ByloJ20qtm", "pdf": "https://openreview.net/pdf?id=ByloJ20qtm", "email": ";;;;", "author_num": 5 }, { "id": "Byx1VnR9K7", "title": "Trajectory VAE for multi-modal imitation", "track": "main", "status": "Reject", "tldr": "A trajectory-VAE method for imitating multi-modal expert demonstrations in sequential decision making problems.", "abstract": "We address the problem of imitating multi-modal expert demonstrations in sequential decision making problems. In many practical applications, for example video games, behavioural demonstrations are readily available that contain multi-modal structure not captured by typical existing imitation learning approaches. For example, differences in the observed players' behaviours may be representative of different underlying playstyles.\n\n In this paper, we use a generative model to capture different emergent playstyles in an unsupervised manner, enabling the imitation of a diverse range of distinct behaviours. We utilise a variational autoencoder to learn an embedding of the different types of expert demonstrations on the trajectory level, and jointly learn a latent representation with a policy. In experiments on a range of 2D continuous control problems representative of Minecraft environments, we empirically demonstrate that our model can capture a multi-modal structured latent space from the demonstrated behavioural trajectories. 
", "keywords": "imitation learning;latent variable model;variational autoencoder;diverse behaviour", "primary_area": "", "supplementary_material": "", "author": "Xiaoyu Lu;Jan Stuehmer;Katja Hofmann", "authorids": "xiaoyu.lu@stats.ox.ac.uk;t-jastuh@microsoft.com;katja.hofmann@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlu2019trajectory,\ntitle={Trajectory {VAE} for multi-modal imitation},\nauthor={Xiaoyu Lu and Jan Stuehmer and Katja Hofmann},\nyear={2019},\nurl={https://openreview.net/forum?id=Byx1VnR9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Byx1VnR9K7", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "wc_review": "557;206;660", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "131;59;120", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 474.3333333333333, 194.34391051832716 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 103.33333333333333, 31.668421004036322 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15500503267796843939&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Byx7LjRcYm", "title": "Human Action Recognition Based on Spatial-Temporal Attention", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many state-of-the-art methods of recognizing human action are based on attention mechanism, which shows the importance of attention mechanism in action recognition. With the rapid development of neural networks, human action recognition has been achieved great improvement by using convolutional neural networks (CNN) or recurrent neural networks (RNN). In this paper, we propose a model based on spatial-temporal attention weighted LSTM. This model pays attention to the key part in each video frame, and also focuses on the important frames in each video sequence, thus the most important theme for our model is how to find out the key point spatially and the key frames temporally. We show a feasible architecture which can solve those two problems effectively and achieve a satisfactory result. Our model is trained and tested on three datasets including UCF-11, UCF-101, and HMDB51. 
Those results demonstrate a high performance of our model in human action recognition.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Wensong Chan;Zhiqiang Tian;Xuguang Lan", "authorids": "2489925838@qq.com;zhiqiangtian@xjtu.edu.cn;xglan@xjtu.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchan2019human,\ntitle={Human Action Recognition Based on Spatial-Temporal Attention},\nauthor={Wensong Chan and Zhiqiang Tian and Xuguang Lan},\nyear={2019},\nurl={https://openreview.net/forum?id=Byx7LjRcYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Byx7LjRcYm", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;4", "wc_review": "291;353;766", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 470.0, 210.82852431933082 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9191647376072044682&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Information-Directed Exploration for Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/983", "id": "Byx83s09Km", "author_site": "Nikolay Nikolov, Johannes Kirschner, Felix Berkenkamp, Andreas Krause", "tldr": "We develop a practical extension of Information-Directed Sampling for Reinforcement Learning, which accounts for parametric uncertainty and heteroscedasticity in the return distribution for exploration.", "abstract": "Efficient exploration remains a major challenge for reinforcement learning. One reason is that the variability of the returns often depends on the current state and action, and is therefore heteroscedastic. Classical exploration strategies such as upper confidence bound algorithms and Thompson sampling fail to appropriately account for heteroscedasticity, even in the bandit setting. Motivated by recent findings that address this issue in bandits, we propose to use Information-Directed Sampling (IDS) for exploration in reinforcement learning. As our main contribution, we build on recent advances in distributional reinforcement learning and propose a novel, tractable approximation of IDS for deep Q-learning. The resulting exploration strategy explicitly accounts for both parametric uncertainty and heteroscedastic observation noise. 
We evaluate our method on Atari games and demonstrate a significant improvement over alternative approaches.", "keywords": "reinforcement learning;exploration;information directed sampling", "primary_area": "", "supplementary_material": "", "author": "Nikolay Nikolov;Johannes Kirschner;Felix Berkenkamp;Andreas Krause", "authorids": "nikolay.nikolov14@imperial.ac.uk;jkirschner@inf.ethz.ch;befelix@inf.ethz.ch;krausea@ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nnikolov2018informationdirected,\ntitle={Information-Directed Exploration for Deep Reinforcement Learning},\nauthor={Nikolay Nikolov and Johannes Kirschner and Felix Berkenkamp and Andreas Krause},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Byx83s09Km},\n}", "github": "[![github](/images/github_icon.svg) nikonikolov/rltf](https://github.com/nikonikolov/rltf)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;4", "wc_review": "357;492;317", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "407;409;294", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 388.6666666666667, 74.87025815072067 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 370.0, 53.74631770332426 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12419979613667846761&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=Byx83s09Km", "pdf": "https://openreview.net/pdf?id=Byx83s09Km", "email": ";;;", "author_num": 4 }, { "id": "Byx93sC9tm", "title": "Deep Ensemble Bayesian Active Learning : Adressing the Mode Collapse issue in Monte Carlo dropout via Ensembles", "track": "main", "status": "Reject", "tldr": "We present a method for Deep Bayesian Active Learning combining MC-Dropout with Ensemble Models", "abstract": "In image classification tasks, the ability of deep convolutional neural networks (CNNs) to deal with complex image data has proved to be unrivalled. Deep CNNs, however, require large amounts of labeled training data to reach their full potential. In specialised domains such as healthcare, labeled data can be difficult and expensive to obtain. One way to alleviate this problem is to rely on active learning, a learning technique that aims to reduce the amount of labelled data needed for a specific task while still delivering satisfactory performance.\nWe propose a new active learning strategy designed\nfor deep neural networks. This method improves upon the current state-of-the-art deep Bayesian active learning method, which suffers from the mode collapse problem. We correct for this deficiency by making use of the expressive power and statistical properties of model ensembles. Our proposed method manages to capture superior data uncertainty, which translates into improved classification performance. 
We demonstrate empirically that our ensemble method yields faster convergence of CNNs trained on the MNIST and CIFAR-10 \ndatasets.", "keywords": "Active Learning;Deep Learning;Bayesian Neural Networks;Bayesian Deep Learning;Ensembles", "primary_area": "", "supplementary_material": "", "author": "Remus Pop;Patric Fulop", "authorids": "remus.p.pop@gmail.com;patric.fulop@ed.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npop2019deep,\ntitle={Deep Ensemble Bayesian Active Learning : Adressing the Mode Collapse issue in Monte Carlo dropout via Ensembles},\nauthor={Remus Pop and Patric Fulop},\nyear={2019},\nurl={https://openreview.net/forum?id=Byx93sC9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Byx93sC9tm", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "wc_review": "514;282;214", "wc_reply_reviewers": "504;81;114", "wc_reply_authors": "1122;301;178", "reply_reviewers": "2;1;1", "reply_authors": "3;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 336.6666666666667, 128.42983384798964 ], "wc_reply_reviewers_avg": [ 233.0, 192.09893284451113 ], "wc_reply_authors_avg": [ 533.6666666666666, 419.0340744564284 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14065352179848966728&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "ByxAOoR5K7", "title": "Policy Generalization In Capacity-Limited Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "This paper describes the application of rate-distortion theory to the learning of efficient (capacity limited) policy representations in the reinforcement learning setting.", "abstract": "Motivated by the study of generalization in biological intelligence, we examine reinforcement learning (RL) in settings where there are information-theoretic constraints placed on the learner's ability to represent a behavioral policy. We first show that the problem of optimizing expected utility within capacity-limited learning agents maps naturally to the mathematical field of rate-distortion (RD) theory. Applying the RD framework to the RL setting, we develop a new online RL algorithm, Capacity-Limited Actor-Critic, that learns a policy that optimizes a tradeoff between utility maximization and information processing costs. Using this algorithm in a 2D gridworld environment, we demonstrate two novel empirical results. First, at high information rates (high channel capacity), the algorithm achieves faster learning and discovers better policies compared to the standard tabular actor-critic algorithm. Second, we demonstrate that agents with capacity-limited policy representations avoid 'overfitting' and exhibit superior transfer to modified environments, compared to policies learned by agents with unlimited information processing resources. 
Our work provides a principled framework for the development of computationally rational RL agents.", "keywords": "reinforcement learning;generalization;capacity constraints;information theory", "primary_area": "", "supplementary_material": "", "author": "Rachel A. Lerch;Chris R. Sims", "authorids": "lerchr2@rpi.edu;simsc3@rpi.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlerch2019policy,\ntitle={Policy Generalization In Capacity-Limited Reinforcement Learning},\nauthor={Rachel A. Lerch and Chris R. Sims},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxAOoR5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByxAOoR5K7", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;3", "wc_review": "644;1147;147", "wc_reply_reviewers": "0;576;0", "wc_reply_authors": "328;634;147", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 646.0, 408.25073994625734 ], "wc_reply_reviewers_avg": [ 192.0, 271.5290039756342 ], "wc_reply_authors_avg": [ 369.6666666666667, 200.98811462925417 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4754362676417236248&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ByxAcjCqt7", "title": "Point Cloud GAN", "track": "main", "status": "Reject", "tldr": "We propose a GAN variant which learns to generate point clouds. Different studies have been explored, including tighter Wasserstein distance estimate, conditional generation, generalization to unseen point clouds and image to point cloud.", "abstract": "Generative Adversarial Networks (GAN) can achieve promising performance on learning complex data distributions on different types of data. In this paper, we first show that a straightforward extension of an existing GAN algorithm is not applicable to point clouds, because the constraint required for discriminators is undefined for set data. We propose a twofold modification to a GAN algorithm to be able to generate point clouds (PC-GAN). First, we combine ideas from hierarchical Bayesian modeling and implicit generative models by learning a hierarchical and interpretable sampling process. A key component of our method is that we train a posterior inference network for the hidden variables. Second, PC-GAN defines a generic framework that can incorporate many existing GAN algorithms. We further propose a sandwiching objective, which results in a tighter Wasserstein distance estimate than the commonly used dual form in WGAN. We validate our claims on the ModelNet40 benchmark dataset and observe that PC-GAN trained by the sandwiching objective achieves better results on test data than existing methods. 
We also conduct studies on several tasks, including generalization on unseen point clouds, latent space interpolation, classification, and image to point clouds transformation, to demonstrate the versatility of the proposed PC-GAN algorithm.", "keywords": "Point Cloud;GAN", "primary_area": "", "supplementary_material": "", "author": "Chun-Liang Li;Manzil Zaheer;Yang Zhang;Barnab\u00e1s P\u00f3czos;Ruslan Salakhutdinov", "authorids": "chunlial@cs.cmu.edu;manzilz@cs.cmu.edu;yz6@andrew.cmu.edu;bapoczos@cs.cmu.edu;rsalakhu@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019point,\ntitle={Point Cloud {GAN}},\nauthor={Chun-Liang Li and Manzil Zaheer and Yang Zhang and Barnab\u00e1s P\u00f3czos and Ruslan Salakhutdinov},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxAcjCqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByxAcjCqt7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "wc_review": "994;209;374", "wc_reply_reviewers": "99;49;0", "wc_reply_authors": "641;348;307", "reply_reviewers": "1;1;0", "reply_authors": "2;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 525.6666666666666, 337.9431247348517 ], "wc_reply_reviewers_avg": [ 49.333333333333336, 40.41726803676314 ], "wc_reply_authors_avg": [ 432.0, 148.73018075248436 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 277, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11465399646323395805&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "title": "Attention, Learn to Solve Routing Problems!", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1049", "id": "ByxBFsRqYm", "author_site": "Wouter Kool, Herke van Hoof, Max Welling", "tldr": "Attention based model trained with REINFORCE with greedy rollout baseline to learn heuristics with competitive results on TSP and other routing problems", "abstract": "The recently presented idea to learn heuristics for combinatorial optimization problems is promising as it can save costly development. However, to push this idea towards practical implementation, we need better models and better ways of training. We contribute in both directions: we propose a model based on attention layers with benefits over the Pointer Network and we show how to train this model using REINFORCE with a simple baseline based on a deterministic greedy rollout, which we find is more efficient than using a value function. We significantly improve over recent learned heuristics for the Travelling Salesman Problem (TSP), getting close to optimal results for problems up to 100 nodes. 
With the same hyperparameters, we learn strong heuristics for two variants of the Vehicle Routing Problem (VRP), the Orienteering Problem (OP) and (a stochastic variant of) the Prize Collecting TSP (PCTSP), outperforming a wide range of baselines and getting results close to highly optimized and specialized algorithms.", "keywords": "learning;routing problems;heuristics;attention;reinforce;travelling salesman problem;vehicle routing problem;orienteering problem;prize collecting travelling salesman problem", "primary_area": "", "supplementary_material": "", "author": "Wouter Kool;Herke van Hoof;Max Welling", "authorids": "w.w.m.kool@uva.nl;h.c.vanhoof@uva.nl;m.welling@uva.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkool2018attention,\ntitle={Attention, Learn to Solve Routing Problems!},\nauthor={Wouter Kool and Herke van Hoof and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxBFsRqYm},\n}", "github": "[![github](/images/github_icon.svg) wouterkool/attention-tsp](https://github.com/wouterkool/attention-tsp) + [![Papers with Code](/images/pwc_icon.svg) 13 community implementations](https://paperswithcode.com/paper/?openreview=ByxBFsRqYm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;5;5", "wc_review": "559;411;606", "wc_reply_reviewers": "0;11;8", "wc_reply_authors": "452;117;222", "reply_reviewers": "0;1;1", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 525.3333333333334, 83.0916495336458 ], "wc_reply_reviewers_avg": [ 6.333333333333333, 4.642796092394706 ], "wc_reply_authors_avg": [ 263.6666666666667, 139.90075847622208 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1846, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14639976201161443491&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=ByxBFsRqYm", "pdf": "https://openreview.net/pdf?id=ByxBFsRqYm", "email": ";;", "author_num": 3 }, { "id": "ByxF-nAqYX", "title": "Locally Linear Unsupervised Feature Selection", "track": "main", "status": "Reject", "tldr": "Unsupervised feature selection through capturing the local linear structure of the data", "abstract": "The paper, interested in unsupervised feature selection, aims to retain the features best accounting for the local patterns in the data. The proposed approach, called Locally Linear Unsupervised Feature Selection, relies on a dimensionality reduction method to characterize such patterns; each feature is thereafter assessed according to its compliance w.r.t. the local patterns, taking inspiration from Locally Linear Embedding (Roweis and Saul, 2000). 
The experimental validation of the approach on the scikit-feature benchmark suite demonstrates its effectiveness compared to the state of the art.", "keywords": "Unsupervised Learning;Feature Selection;Dimension Reduction", "primary_area": "", "supplementary_material": "", "author": "Guillaume DOQUET;Mich\u00e8le SEBAG", "authorids": "doquet@lri.fr;sebag@lri.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndoquet2019locally,\ntitle={Locally Linear Unsupervised Feature Selection},\nauthor={Guillaume DOQUET and Mich\u00e8le SEBAG},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxF-nAqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ByxF-nAqYX", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;5;2", "wc_review": "360;174;240", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "660;638;447", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "wc_review_avg": [ 258.0, 76.99350621968063 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 581.6666666666666, 95.64633930382398 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9449111825230683, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:KbxAEM1svRMJ:scholar.google.com/&scioq=Locally+Linear+Unsupervised+Feature+Selection&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "L2-Nonexpansive Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1134", "id": "ByxGSsR9FQ", "author_site": "Haifeng Qian, Mark N Wegman", "tldr": "", "abstract": "This paper proposes a class of well-conditioned neural networks in which a unit amount of change in the inputs causes at most a unit amount of change in the outputs or any of the internal layers. We develop the known methodology of controlling Lipschitz constants to realize its full potential in maximizing robustness, with a new regularization scheme for linear layers, new ways to adapt nonlinearities and a new loss function. With MNIST and CIFAR-10 classifiers, we demonstrate a number of advantages. Without needing any adversarial training, the proposed classifiers exceed the state of the art in robustness against white-box L2-bounded adversarial attacks. They generalize better than ordinary networks from noisy data with partially random labels. Their outputs are quantitatively meaningful and indicate levels of confidence and generalization, among other desirable properties.", "keywords": "adversarial defense;regularization;robustness;generalization", "primary_area": "", "supplementary_material": "", "author": "Haifeng Qian;Mark N. Wegman", "authorids": "qianhaifeng@us.ibm.com;wegman@us.ibm.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nqian2018lnonexpansive,\ntitle={L2-Nonexpansive Neural Networks},\nauthor={Haifeng Qian and Mark N. 
Wegman},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxGSsR9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;8", "confidence": "3;4;4", "wc_review": "946;908;516", "wc_reply_reviewers": "325;24;34", "wc_reply_authors": "2199;884;1116", "reply_reviewers": "1;1;1", "reply_authors": "6;2;3", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 790.0, 194.3673497958612 ], "wc_reply_reviewers_avg": [ 127.66666666666667, 139.59544723553446 ], "wc_reply_authors_avg": [ 1399.6666666666667, 573.0947177871695 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.6666666666666665, 1.699673171197595 ], "replies_avg": [ 66, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6491454663849798567&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ByxGSsR9FQ", "pdf": "https://openreview.net/pdf?id=ByxGSsR9FQ", "email": ";", "author_num": 2 }, { "id": "ByxHb3R5tX", "title": "Universal Successor Features for Transfer Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transfer in Reinforcement Learning (RL) refers to the idea of applying knowledge gained from previous tasks to solving related tasks. Learning a universal value function (Schaul et al., 2015), which generalizes over goals and states, has previously been shown to be useful for transfer. However, successor features are believed to be more suitable than values for transfer (Dayan, 1993; Barreto et al.,2017), even though they cannot directly generalize to new goals. In this paper, we propose (1) Universal Successor Features (USFs) to capture the underlying dynamics of the environment while allowing generalization to unseen goals and (2) a flexible end-to-end model of USFs that can be trained by interacting with the environment. We show that learning USFs is compatible with any RL algorithm that learns state values using a temporal difference method. Our experiments in a simple gridworld and with two MuJoCo environments show that USFs can greatly accelerate training when learning multiple tasks and can effectively transfer knowledge to new tasks.", "keywords": "Reinforcement Learning;Successor Features;Successor Representations;Transfer Learning;Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Chen Ma;Dylan R. Ashley;Junfeng Wen;Yoshua Bengio", "authorids": "chenchloem@gmail.com;dashley@ualberta.ca;junfengwen@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nma2019universal,\ntitle={Universal Successor Features for Transfer Reinforcement Learning},\nauthor={Chen Ma and Dylan R. 
Ashley and Junfeng Wen and Yoshua Bengio},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxHb3R5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByxHb3R5tX", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;5;5", "wc_review": "896;324;233", "wc_reply_reviewers": "61;288;0", "wc_reply_authors": "1025;869;294", "reply_reviewers": "1;3;0", "reply_authors": "3;5;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 484.3333333333333, 293.45338452450824 ], "wc_reply_reviewers_avg": [ 116.33333333333333, 123.91484531277473 ], "wc_reply_authors_avg": [ 729.3333333333334, 314.3462351540981 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12772764211059252047&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "ByxLl309Ym", "title": "Conditional Inference in Pre-trained Variational Autoencoders via Cross-coding", "track": "main", "status": "Reject", "tldr": "", "abstract": "Variational Autoencoders (VAEs) are a popular generative model, but one in which conditional inference can be challenging. If the decomposition into query and evidence variables is fixed, conditional VAEs provide an attractive solution. To support arbitrary queries, one is generally reduced to Markov Chain Monte Carlo sampling methods that can suffer from long mixing times. In this paper, we propose an idea we term cross-coding to approximate the distribution over the latent variables after conditioning on an evidence assignment to some subset of the variables. This allows generating query samples without retraining the full VAE. 
We experimentally evaluate three variations of cross-coding showing that (i) they can be quickly optimized for different decompositions of evidence and query and (ii) they quantitatively and qualitatively outperform Hamiltonian Monte Carlo.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ga Wu;Justin Domke;Scott Sanner", "authorids": "wuga@mie.utoronto.ca;domke@cs.umass.edu;ssanner@mie.utoronto.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwu2019conditional,\ntitle={Conditional Inference in Pre-trained Variational Autoencoders via Cross-coding},\nauthor={Ga Wu and Justin Domke and Scott Sanner},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxLl309Ym},\n}", "github": "[![github](/images/github_icon.svg) wuga214/XCoder_VAE_Conditional_Inference](https://github.com/wuga214/XCoder_VAE_Conditional_Inference)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ByxLl309Ym", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;4;4", "wc_review": "763;141;846", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 583.3333333333334, 314.60698586578707 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10927540199704545549&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Improving Generalization and Stability of Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/896", "id": "ByxPYjC5KQ", "author_site": "Hoang Thanh-Tung, Truyen Tran, Svetha Venkatesh", "tldr": "We propose a zero-centered gradient penalty for improving generalization and stability of GANs", "abstract": "Generative Adversarial Networks (GANs) are one of the most popular tools for learning complex high dimensional distributions. However, generalization properties of GANs have not been well understood. In this paper, we analyze the generalization of GANs in practical settings. We show that discriminators trained on discrete datasets with the original GAN loss have poor generalization capability and do not approximate the theoretically optimal discriminator. We propose a zero-centered gradient penalty for improving the generalization of the discriminator by pushing it toward the optimal discriminator. The penalty guarantees the generalization and convergence of GANs. 
Experiments on synthetic and large scale datasets verify our theoretical analysis.\n", "keywords": "GAN;generalization;gradient penalty;zero centered;convergence", "primary_area": "", "supplementary_material": "", "author": "Hoang Thanh-Tung;Truyen Tran;Svetha Venkatesh", "authorids": "hoangtha@deakin.edu.au;truyen.tran@deakin.edu.au;svetha.venkatesh@deakin.edu.au", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nthanh-tung2018improving,\ntitle={Improving Generalization and Stability of Generative Adversarial Networks},\nauthor={Hoang Thanh-Tung and Truyen Tran and Svetha Venkatesh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxPYjC5KQ},\n}", "github": "[![github](/images/github_icon.svg) htt210/GeneralizationAndStabilityInGANs](https://github.com/htt210/GeneralizationAndStabilityInGANs)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;3", "wc_review": "233;166;227", "wc_reply_reviewers": "272;0;0", "wc_reply_authors": "1784;673;303", "reply_reviewers": "2;0;0", "reply_authors": "4;2;2", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 208.66666666666666, 30.26916289265731 ], "wc_reply_reviewers_avg": [ 90.66666666666667, 128.2220296551606 ], "wc_reply_authors_avg": [ 920.0, 629.3366878441672 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 173, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13499019185526283919&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=ByxPYjC5KQ", "pdf": "https://openreview.net/pdf?id=ByxPYjC5KQ", "email": ";;", "author_num": 3 }, { "title": "Adaptive Input Representations for Neural Language Modeling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/950", "id": "ByxZX20qFQ", "author_site": "Alexei Baevski, Michael Auli", "tldr": "Variable capacity input word embeddings and SOTA on WikiText-103, Billion Word benchmarks.", "abstract": "We introduce adaptive input representations for neural language modeling which extend the adaptive softmax of Grave et al. (2017) to input representations of variable capacity. There are several choices on how to factorize the input and output layers, and whether to model words, characters or sub-word units. We perform a systematic comparison of popular choices for a self-attentional architecture. Our experiments show that models equipped with adaptive embeddings are more than twice as fast to train as the popular character input CNN while having fewer parameters. 
On the WikiText-103 benchmark we achieve 18.7 perplexity, an improvement of 10.5 perplexity compared to the previously best published result and on the Billion Word benchmark, we achieve 23.02 perplexity.", "keywords": "Neural language modeling", "primary_area": "", "supplementary_material": "", "author": "Alexei Baevski;Michael Auli", "authorids": "alexei.b@gmail.com;michael.auli@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nbaevski2018adaptive,\ntitle={Adaptive Input Representations for Neural Language Modeling},\nauthor={Alexei Baevski and Michael Auli},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxZX20qFQ},\n}", "github": "[![github](/images/github_icon.svg) pytorch/fairseq](https://github.com/pytorch/fairseq) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=ByxZX20qFQ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;4", "wc_review": "292;155;302", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "219;63;274", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 249.66666666666666, 67.06381703687582 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 185.33333333333334, 89.36939570618618 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 489, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9932684582274973195&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ByxZX20qFQ", "pdf": "https://openreview.net/pdf?id=ByxZX20qFQ", "email": ";", "author_num": 2 }, { "id": "ByxZdj09tX", "title": "FROM DEEP LEARNING TO DEEP DEDUCING: AUTOMATICALLY TRACKING DOWN NASH EQUILIBRIUM THROUGH AUTONOMOUS NEURAL AGENT, A POSSIBLE MISSING STEP TOWARD GENERAL A.I.", "track": "main", "status": "Reject", "tldr": "FROM DEEP LEARNING TO DEEP DEDUCING", "abstract": "Contrary to most reinforcement learning studies, which emphasize on training a deep neural network to approximate its output layer to certain strategies, this paper proposes a reversed method for reinforcement learning. We call this \u201cDeep Deducing\u201d. In short, after adequately training a deep neural network according to a strategy-environment-to-payoff table, then we initialize randomized strategy\ninput and propagate the error between the actual output and the desired output back to the initially-randomized strategy input in the \u201cinput layer\u201d of the trained deep neural network gradually to perform a task similar to \u201chuman deduction\u201d. 
And we view the final strategy input in the \u201cinput layer\u201d as the fittest strategy for a neural network when confronting the observed environment input from the world outside.", "keywords": "Reinforcement Learning;Deep Feed-forward Neural Network;Recurrent Neural Network;Game Theory;Control Theory;Nash Equilibrium;Optimization", "primary_area": "", "supplementary_material": "", "author": "Brown Wang", "authorids": "brownwang0426@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nwang2019from,\ntitle={{FROM} {DEEP} {LEARNING} {TO} {DEEP} {DEDUCING}: {AUTOMATICALLY} {TRACKING} {DOWN} {NASH} {EQUILIBRIUM} {THROUGH} {AUTONOMOUS} {NEURAL} {AGENT}, A {POSSIBLE} {MISSING} {STEP} {TOWARD} {GENERAL} A.I.},\nauthor={Brown Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxZdj09tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=ByxZdj09tX", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;5", "wc_review": "245;571;97", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "123;0;0", "reply_reviewers": "0;0;0", "reply_authors": "1;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 304.3333333333333, 198.00561159275819 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 41.0, 57.982756057296896 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.5, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "ByxkCj09Fm", "title": "DEEP HIERARCHICAL MODEL FOR HIERARCHICAL SELECTIVE CLASSIFICATION AND ZERO SHOT LEARNING", "track": "main", "status": "Reject", "tldr": "We propose a new hierarchical probability based loss function which yields a significantly better semantic classifier for large scale classification scenario. Moreover, we show the importance of such a model in two applications.", "abstract": "Object recognition in real-world image scenes is still an open problem. With the growing number of classes, the similarity structures between them become complex and the distinction between classes blurs, which makes the classification problem particularly challenging. Standard N-way discrete classifiers treat all classes as disconnected and unrelated, and therefore unable to learn from their semantic relationships. In this work, we present a hierarchical inter-class relationship model and train it using a newly proposed probability-based loss function. Our hierarchical model provides significantly better semantic generalization ability compared to a regular N-way classifier. We further proposed an algorithm where given a probabilistic classification model it can return the input corresponding super-group based on classes hierarchy without any further learning. We deploy it in two scenarios in which super-group retrieval can be useful. The first one, selective classification, deals with the problem of low-confidence classification, wherein a model is unable to make a successful exact classification. \nThe second, zero-shot learning problem deals with making reasonable inferences on novel classes. 
Extensive experiments with the two scenarios show that our proposed hierarchical model yields more accurate and meaningful super-class predictions compared to a regular N-way classifier because of its significantly better semantic generalization ability.", "keywords": "deep learning;large-scale classificaion;heirarchical classification;zero-shot learning", "primary_area": "", "supplementary_material": "", "author": "Eliyahu Sason;Koby Crammer", "authorids": "sasonil@gmail.com;koby@ee.technion.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsason2019deep,\ntitle={{DEEP} {HIERARCHICAL} {MODEL} {FOR} {HIERARCHICAL} {SELECTIVE} {CLASSIFICATION} {AND} {ZERO} {SHOT} {LEARNING}},\nauthor={Eliyahu Sason and Koby Crammer},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxkCj09Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ByxkCj09Fm", "pdf_size": 0, "rating": "2;4;5", "confidence": "4;4;3", "wc_review": "196;679;219", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "315;592;326", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 364.6666666666667, 222.46547797105262 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 411.0, 128.0650876182368 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YRT2vAOG7SkJ:scholar.google.com/&scioq=DEEP+HIERARCHICAL+MODEL+FOR+HIERARCHICAL+SELECTIVE+CLASSIFICATION+AND+ZERO+SHOT+LEARNING&hl=en&as_sdt=0,14", "gs_version_total": 0 }, { "title": "Neural Persistence: A Complexity Measure for Deep Neural Networks Using Algebraic Topology", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/909", "id": "ByxkijC5FQ", "author_site": "Bastian Rieck, Matteo Togninalli, Christian Bock, Michael Moor, Max Horn, Thomas Gumbsch, Karsten Borgwardt", "tldr": "We develop a new topological complexity measure for deep neural networks and demonstrate that it captures their salient properties.", "abstract": "While many approaches to make neural networks more fathomable have been proposed, they are restricted to interrogating the network with input data. Measures for characterizing and monitoring structural properties, however, have not been developed. In this work, we propose neural persistence, a complexity measure for neural network architectures based on topological data analysis on weighted stratified graphs. To demonstrate the usefulness of our approach, we show that neural persistence reflects best practices developed in the deep learning community such as dropout and batch normalization. 
Moreover, we derive a neural persistence-based stopping criterion that shortens the training process while achieving comparable accuracies as early stopping based on validation loss.", "keywords": "Algebraic topology;persistent homology;network complexity;neural network", "primary_area": "", "supplementary_material": "", "author": "Bastian Rieck;Matteo Togninalli;Christian Bock;Michael Moor;Max Horn;Thomas Gumbsch;Karsten Borgwardt", "authorids": "bastian.rieck@bsse.ethz.ch;matteo.togninalli@bsse.ethz.ch;christian.bock@bsse.ethz.ch;michael.moor@bsse.ethz.ch;max.horn@bsse.ethz.ch;thomas.gumbsch@bsse.ethz.ch;karsten.borgwardt@bsse.ethz.ch", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nrieck2018neural,\ntitle={Neural Persistence: A Complexity Measure for Deep Neural Networks Using Algebraic Topology},\nauthor={Bastian Rieck and Matteo Togninalli and Christian Bock and Michael Moor and Max Horn and Thomas Gumbsch and Karsten Borgwardt},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxkijC5FQ},\n}", "github": "[![github](/images/github_icon.svg) BorgwardtLab/Neural-Persistence](https://github.com/BorgwardtLab/Neural-Persistence) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=ByxkijC5FQ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;5;4", "wc_review": "334;444;553", "wc_reply_reviewers": "0;0;65", "wc_reply_authors": "770;786;777", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 443.6666666666667, 89.40668630228701 ], "wc_reply_reviewers_avg": [ 21.666666666666668, 30.641293851417057 ], "wc_reply_authors_avg": [ 777.6666666666666, 6.548960901462833 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.18898223650461357, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12286997751595249495&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ByxkijC5FQ", "pdf": "https://openreview.net/pdf?id=ByxkijC5FQ", "email": ";;;;;;", "author_num": 7 }, { "id": "ByxmXnA9FQ", "title": "A Variational Dirichlet Framework for Out-of-Distribution Detection", "track": "main", "status": "Reject", "tldr": "A new framework based variational inference for out-of-distribution detection", "abstract": "With the recently rapid development in deep learning, deep neural networks have been widely adopted in many real-life applications. However, deep neural networks are also known to have very little control over its uncertainty for test examples, which potentially causes very harmful and annoying consequences in practical scenarios. In this paper, we are particularly interested in designing a higher-order uncertainty metric for deep neural networks and investigate its performance on the out-of-distribution detection task proposed by~\\cite{hendrycks2016baseline}. 
Our method first assumes there exists an underlying higher-order distribution $\\mathcal{P}(z)$, which generates a label-wise distribution $\\mathcal{P}(y)$ over classes on the K-dimensional simplex, and then approximates this higher-order distribution via a parameterized posterior function $p_{\\theta}(z|x)$ under the variational inference framework; finally, we use the entropy of the learned posterior distribution $p_{\\theta}(z|x)$ as an uncertainty measure to detect out-of-distribution examples. However, we identify an overwhelming over-concentration issue in such a framework, which greatly hinders the detection performance. Therefore, we further design a log-smoothing function to alleviate this issue and greatly increase the robustness of the proposed entropy-based uncertainty measure. Through comprehensive experiments on various datasets and architectures, our proposed variational Dirichlet framework with the entropy-based uncertainty measure is consistently observed to yield significant improvements over many baseline systems.", "keywords": "out-of-distribution detection;variational inference;Dirichlet distribution;deep learning;uncertainty measure", "primary_area": "", "supplementary_material": "", "author": "Wenhu Chen;Yilin Shen;William Wang;Hongxia Jin", "authorids": "wenhuchen@ucsb.edu;yilin.shen@samsung.com;william@cs.ucsb.edu;hongxia.jin@samsung.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2019a,\ntitle={A Variational Dirichlet Framework for Out-of-Distribution Detection},\nauthor={Wenhu Chen and Yilin Shen and William Wang and Hongxia Jin},\nyear={2019},\nurl={https://openreview.net/forum?id=ByxmXnA9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ByxmXnA9FQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;3;4", "wc_review": "680;308;534", "wc_reply_reviewers": "0;147;0", "wc_reply_authors": "889;620;296", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 507.3333333333333, 153.03449139182825 ], "wc_reply_reviewers_avg": [ 49.0, 69.29646455628166 ], "wc_reply_authors_avg": [ 601.6666666666666, 242.43807915057863 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17333413802296884636&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Efficient Augmentation via Data Subsampling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1118", "id": "Byxpfh0cFm", "author_site": "Michael Kuchnik, Virginia Smith", "tldr": "Selectively augmenting difficult to classify points results in efficient training.", "abstract": "Data augmentation is commonly used to encode invariances in learning methods. However, this process is often performed in an inefficient manner, as artificial examples are created by applying a number of transformations to all points in the training set. 
The resulting explosion of the dataset size can be an issue in terms of storage and training costs, as well as in selecting and tuning the optimal set of transformations to apply. In this work, we demonstrate that it is possible to significantly reduce the number of data points included in data augmentation while realizing the same accuracy and invariance benefits of augmenting the entire dataset. We propose a novel set of subsampling policies, based on model influence and loss, that can achieve a 90% reduction in augmentation set size while maintaining the accuracy gains of standard data augmentation.", "keywords": "data augmentation;invariance;subsampling;influence", "primary_area": "", "supplementary_material": "", "author": "Michael Kuchnik;Virginia Smith", "authorids": "mkuchnik@andrew.cmu.edu;smithv@cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nkuchnik2018efficient,\ntitle={Efficient Augmentation via Data Subsampling},\nauthor={Michael Kuchnik and Virginia Smith},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Byxpfh0cFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;4", "wc_review": "481;608;241", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "527;315;81", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 443.3333333333333, 152.17606761759734 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 307.6666666666667, 182.15256121052911 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12087290703747116451&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Byxpfh0cFm", "pdf": "https://openreview.net/pdf?id=Byxpfh0cFm", "email": ";", "author_num": 2 }, { "id": "Byxr73R5FQ", "title": "Successor Options : An Option Discovery Algorithm for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "An option discovery method for Reinforcement Learning using the Successor Representation", "abstract": "Hierarchical Reinforcement Learning is a popular method to exploit temporal abstractions in order to tackle the curse of dimensionality. The options framework is one such hierarchical framework that models the notion of skills or options. However, learning a collection of task-agnostic transferable skills is a challenging task. Option discovery typically entails using heuristics, the majority of which revolve around discovering bottleneck states. In this work, we adopt a method complementary to the idea of discovering bottlenecks. Instead, we attempt to discover ``landmark\" sub-goals which are prototypical states of well connected regions. These sub-goals are points from which densely connected set of states are easily accessible. We propose a new model called Successor options that leverages Successor Representations to achieve the same. We also design a novel pseudo-reward for learning the intra-option policies. 
Additionally, we describe an Incremental Successor options model that iteratively builds options and explores in environments where exploration through primitive actions is inadequate to form the Successor Representations. Finally, we demonstrate the efficacy of our approach on a collection of grid worlds and on complex high dimensional environments like Deepmind-Lab.\n", "keywords": "Hierarchical Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Manan Tomar*;Rahul Ramesh*;Balaraman Ravindran", "authorids": "manan.tomar@gmail.com;rahul13ramesh@gmail.com;ravi@cse.iitm.ac.in", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntomar*2019successor,\ntitle={Successor Options : An Option Discovery Algorithm for Reinforcement Learning},\nauthor={Manan Tomar* and Rahul Ramesh* and Balaraman Ravindran},\nyear={2019},\nurl={https://openreview.net/forum?id=Byxr73R5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=Byxr73R5FQ", "pdf_size": 0, "rating": "4;4;5;6", "confidence": "5;5;4;4", "wc_review": "1245;606;300;192", "wc_reply_reviewers": "0;0;149;0", "wc_reply_authors": "1592;1419;767;705", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;2;1", "rating_avg": [ 4.75, 0.82915619758885 ], "confidence_avg": [ 4.5, 0.5 ], "wc_review_avg": [ 585.75, 409.79041899488084 ], "wc_reply_reviewers_avg": [ 37.25, 64.51889258194068 ], "wc_reply_authors_avg": [ 1120.75, 390.197626210104 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9045340337332909, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17506220949557036036&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Byxz4n09tQ", "title": "Model Compression with Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "More accurate machine learning models often demand more computation and memory at test time, making them difficult to deploy on CPU- or memory-constrained devices. Model compression (also known as distillation) alleviates this burden by training a less expensive student model to mimic the expensive teacher model while maintaining most of the original accuracy. However, when fresh data is unavailable for the compression task, the teacher's training data is typically reused, leading to suboptimal compression. In this work, we propose to augment the compression dataset with synthetic data from a generative adversarial network (GAN) designed to approximate the training data distribution. Our GAN-assisted model compression (GAN-MC) significantly improves student accuracy for expensive models such as deep neural networks and large random forests on both image and tabular datasets. Building on these results, we propose a comprehensive metric\u2014the Compression Score\u2014to evaluate the quality of synthetic datasets based on their induced model compression performance. 
The Compression Score captures both data diversity and discriminability, and we illustrate its benefits over the popular Inception Score in the context of image classification.", "keywords": "Model compression;distillation;generative adversarial network;GAN;deep neural network;random forest;ensemble;decision tree;convolutional neural network", "primary_area": "", "supplementary_material": "", "author": "Ruishan Liu;Nicolo Fusi;Lester Mackey", "authorids": "ruishan@stanford.edu;fusi@microsoft.com;lmackey@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nliu2019model,\ntitle={Model Compression with Generative Adversarial Networks},\nauthor={Ruishan Liu and Nicolo Fusi and Lester Mackey},\nyear={2019},\nurl={https://openreview.net/forum?id=Byxz4n09tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Byxz4n09tQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "wc_review": "287;675;682", "wc_reply_reviewers": "0;0;232", "wc_reply_authors": "257;1043;1137", "reply_reviewers": "0;0;1", "reply_authors": "1;2;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 548.0, 184.5769938715729 ], "wc_reply_reviewers_avg": [ 77.33333333333333, 109.36584882351936 ], "wc_reply_authors_avg": [ 812.3333333333334, 394.550658626015 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17669779383823332249&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "Neural TTS Stylization with Adversarial and Collaborative Games", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1111", "id": "ByzcS3AcYX", "author_site": "shuang ma, Daniel McDuff, Yale Song", "tldr": "a generative adversarial network for style modeling in a text-to-speech system", "abstract": "The modeling of style when synthesizing natural human speech from text has been the focus of significant attention. Some state-of-the-art approaches train an encoder-decoder network on paired text and audio samples (x_txt, x_aud) by encouraging its output to reconstruct x_aud. The synthesized audio waveform is expected to contain the verbal content of x_txt and the auditory style of x_aud. Unfortunately, modeling style in TTS is somewhat under-determined and training models with a reconstruction loss alone is insufficient to disentangle content and style from other factors of variation. In this work, we introduce an end-to-end TTS model that offers enhanced content-style disentanglement ability and controllability. We achieve this by combining a pairwise training procedure, an adversarial game, and a collaborative game into one training scheme. The adversarial game concentrates the true data distribution, and the collaborative game minimizes the distance between real samples and generated samples in both the original space and the latent space. As a result, the proposed model delivers a highly controllable generator, and a disentangled representation. 
Benefiting from the separate modeling of style and content, our model can generate human-fidelity speech that satisfies the desired style conditions. Our model achieves state-of-the-art results across multiple tasks, including style transfer (content and style swapping), emotion modeling, and identity transfer (fitting a new speaker's voice).", "keywords": "Text-To-Speech synthesis;GANs", "primary_area": "", "supplementary_material": "", "author": "Shuang Ma;Daniel Mcduff;Yale Song", "authorids": "shuangma@buffalo.edu;damcduff@microsoft.com;yalesong@csail.mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nma2018a,\ntitle={A generative adversarial network for style modeling in a text-to-speech system},\nauthor={Shuang Ma and Daniel Mcduff and Yale Song},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ByzcS3AcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6;7", "confidence": "3;5;5;5", "wc_review": "119;1186;271;1553", "wc_reply_reviewers": "0;1125;208;131", "wc_reply_authors": "13;1059;622;770", "reply_reviewers": "0;3;2;1", "reply_authors": "1;4;2;1", "rating_avg": [ 6.25, 0.4330127018922193 ], "confidence_avg": [ 4.5, 0.8660254037844386 ], "wc_review_avg": [ 782.25, 603.810141932048 ], "wc_reply_reviewers_avg": [ 366.0, 444.4732837865511 ], "wc_reply_authors_avg": [ 616.0, 381.97185760210135 ], "reply_reviewers_avg": [ 1.5, 1.118033988749895 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.3333333333333333, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14283708523200938731&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=ByzcS3AcYX", "pdf": "https://openreview.net/pdf?id=ByzcS3AcYX", "email": ";;", "author_num": 3 }, { "id": "ByzoVi0cFQ", "title": "Transfer Learning for Estimating Causal Effects Using Neural Networks", "track": "main", "status": "Withdraw", "tldr": "Transfer learning for estimating causal effects using neural networks.", "abstract": "We develop new algorithms for estimating heterogeneous treatment effects, combining recent developments in transfer learning for neural networks with insights from the causal inference literature. By taking advantage of transfer learning, we are able to efficiently use different data sources that are related to the same underlying causal mechanisms. We compare our algorithms with those in the extant literature using extensive simulation studies based on large-scale voter persuasion experiments and the MNIST database. Our methods can perform an order of magnitude better than existing benchmarks while using a fraction of the data.", "keywords": "machine learning;causal inference;causal neural networks;deep learning;CATE estimation;transfer learning;meta-learning;causal transfer", "primary_area": "", "supplementary_material": "", "author": "S\u00f6ren R. K\u00fcnzel;Bradly C. Stadie;Nikita Vemuri;Varsha Ramakrishnan;Jasjeet S. 
Sekhon;Pieter Abbeel", "authorids": "srk@berkeley.edu;bstadie@berkeley.edu;nikitavemuri@berkeley.edu;vio@berkeley.edu;sekhon@berkeley.edu;pabbeel@cs.berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ByzoVi0cFQ", "pdf_size": 0, "rating": "3;5;7", "confidence": "3;4;3", "wc_review": "287;451;171", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 1.632993161855452 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 303.0, 114.86804023167917 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16957552114779122256&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "H1ERcs09KQ", "title": "Hierarchically Clustered Representation Learning", "track": "main", "status": "Reject", "tldr": "We introduce hierarchically clustered representation learning (HCRL), which simultaneously optimizes representation learning and hierarchical clustering in the embedding space.", "abstract": "The joint optimization of representation learning and clustering in the embedding space has experienced a breakthrough in recent years. In spite of the advance, clustering with representation learning has been limited to flat-level categories, which oftentimes involves cohesive clustering with a focus on instance relations. To overcome the limitations of flat clustering, we introduce hierarchically clustered representation learning (HCRL), which simultaneously optimizes representation learning and hierarchical clustering in the embedding space. Specifically, we place a nonparametric Bayesian prior on embeddings to handle dynamic mixture hierarchies under the variational autoencoder framework, and to adopt the generative process of a hierarchical-versioned Gaussian mixture model. Compared with a few prior works focusing on unifying representation learning and hierarchical clustering, HCRL is the first model to consider a generation of deep embeddings from every component of the hierarchy, not just leaf components. This generation process enables more meaningful separations and mergers of clusters via branches in a hierarchy. In addition to obtaining hierarchically clustered embeddings, we can reconstruct data by the various abstraction levels, infer the intrinsic hierarchical structure, and learn the level-proportion features. 
We conducted evaluations with image and text domains, and our quantitative analyses showed competent likelihoods and the best accuracies compared with the baselines.", "keywords": "Representation learning;Hierarchical clustering;Nonparametric Bayesian modeling", "primary_area": "", "supplementary_material": "", "author": "Su-Jin Shin;Kyungwoo Song;Il-Chul Moon", "authorids": "sujin.shin@kaist.ac.kr;gtshs2@kaist.ac.kr;icmoon@kaist.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nshin2019hierarchically,\ntitle={Hierarchically Clustered Representation Learning},\nauthor={Su-Jin Shin and Kyungwoo Song and Il-Chul Moon},\nyear={2019},\nurl={https://openreview.net/forum?id=H1ERcs09KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1ERcs09KQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;3", "wc_review": "152;269;199", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "649;1331;729", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 206.66666666666666, 48.07170569980733 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 903.0, 304.3988611454824 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5661366232138670868&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "H1GLm2R9Km", "title": "Learning Backpropagation-Free Deep Architectures with Kernels", "track": "main", "status": "Reject", "tldr": "We combine kernel method with connectionist models and show that the resulting deep architectures can be trained layer-wise and have more transparent learning dynamics. ", "abstract": "One can substitute each neuron in any neural network with a kernel machine and obtain a counterpart powered by kernel machines. The new network inherits the expressive power and architecture of the original but works in a more intuitive way since each node enjoys the simple interpretation as a hyperplane (in a reproducing kernel Hilbert space). Further, using the kernel multilayer perceptron as an example, we prove that in classification, an optimal representation that minimizes the risk of the network can be characterized for each hidden layer. This result removes the need of backpropagation in learning the model and can be generalized to any feedforward kernel network. Moreover, unlike backpropagation, which turns models into black boxes, the optimal hidden representation enjoys an intuitive geometric interpretation, making the dynamics of learning in a deep kernel network simple to understand. 
Empirical results are provided to validate our theory.", "keywords": "supervised learning;backpropagation-free deep architecture;kernel method", "primary_area": "", "supplementary_material": "", "author": "Shiyu Duan;Shujian Yu;Yunmei Chen;Jose Principe", "authorids": ";;;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nduan2019learning,\ntitle={Learning Backpropagation-Free Deep Architectures with Kernels},\nauthor={Shiyu Duan and Shujian Yu and Yunmei Chen and Jose Principe},\nyear={2019},\nurl={https://openreview.net/forum?id=H1GLm2R9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1GLm2R9Km", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;4", "wc_review": "408;1418;289", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1410;3990;1299", "reply_reviewers": "0;0;0", "reply_authors": "7;11;7", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 705.0, 506.5023856475571 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 2233.0, 1243.2127734221524 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 8.333333333333334, 1.8856180831641267 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4WD2ZXGkAgAJ:scholar.google.com/&scioq=Learning+Backpropagation-Free+Deep+Architectures+with+Kernels&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "H1GWAoRcKX", "title": "A Teacher Student Network For Faster Video Classification", "track": "main", "status": "Withdraw", "tldr": "Teacher-Student framework for efficient video classification using fewer frames ", "abstract": "Over the past few years, various tasks involving videos such as classification, description, summarization and question answering have received a lot of attention. Current models for these tasks compute an encoding of the video by treating it as a sequence of images and going over every image in the sequence, which becomes computationally expensive for longer videos. In this paper, we focus on the task of video classification and aim to reduce the computational cost by using the idea of distillation. Specifically, we propose a Teacher-Student network wherein the teacher looks at all the frames in the video but the student looks at only a small fraction of the frames in the video. The idea is to then train the student to minimize (i) the difference between the final representation computed by the student and the teacher and/or (ii) the difference between the distributions predicted by the teacher and the student. This smaller student network, which involves fewer computations but still learns to mimic the teacher, can then be employed at inference time for video classification. We experiment with the YouTube-8M dataset and show that the proposed student network can reduce the inference time by up to 30% with a negligible drop in performance. ", "keywords": "video classification;efficient computation;knowledge distillation;teacher-student", "primary_area": "", "supplementary_material": "", "author": "Shweta Bhardwaj;Mukundhan Srinivasan;Mitesh M. 
Khapra", "authorids": "cs16s003@cse.iitm.ac.in;msrinivasan@nvidia.com;miteshk@cse.iitm.ac.in", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1GWAoRcKX", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;5;4", "wc_review": "319;527;237", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 361.0, 122.06009448901253 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:g_D0e-XrvsUJ:scholar.google.com/&scioq=A+Teacher+Student+Network+For+Faster+Video+Classification&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "H1GaLiAcY7", "title": "Learning to Separate Domains in Generalized Zero-Shot and Open Set Learning: a probabilistic perspective", "track": "main", "status": "Reject", "tldr": " This paper studies the problem of domain division by segmenting instances drawn from different probabilistic distributions. ", "abstract": "This paper studies the problem of domain division which aims to segment instances drawn from different probabilistic distributions. This problem exists in many previous recognition tasks, such as Open Set Learning (OSL) and Generalized Zero-Shot Learning (G-ZSL), where the testing instances come from either seen or unseen/novel classes with different probabilistic distributions. Previous works only calibrate the confident prediction of classifiers of seen classes (WSVM Scheirer et al. (2014)) or take unseen classes as outliers (Socher et al. (2013)). In contrast, this paper proposes a probabilistic way of directly estimating and fine-tuning the decision boundary between seen and unseen classes. In particular, we propose a domain division algorithm to split the testing instances into known, unknown and uncertain domains, and then conduct recognition tasks in each domain. Two statistical tools, namely, bootstrapping and Kolmogorov-Smirnov (K-S) Test, for the first time, are introduced to uncover and fine-tune the decision boundary of each domain. Critically, the uncertain domain is newly introduced in our framework to adopt those instances whose domain labels cannot be predicted confidently. 
Extensive experiments demonstrate that our approach achieved the state-of-the-art performance on OSL and G-ZSL benchmarks.", "keywords": "Generalized zero-shot learning;domain division;bootstrapping;Kolmogorov-Smirnov", "primary_area": "", "supplementary_material": "", "author": "Hanze Dong;Yanwei Fu;Leonid Sigal;SungJu Hwang;Yu-Gang Jiang;Xiangyang Xue", "authorids": "hzdong15@fudan.edu.cn;yanweifu@fudan.edu.cn;lsigal@cs.ubc.ca;sjhwang82@kaist.ac.kr;ygj@fudan.edu.cn;xyxue@fudan.edu.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ndong2019learning,\ntitle={Learning to Separate Domains in Generalized Zero-Shot and Open Set Learning: a probabilistic perspective},\nauthor={Hanze Dong and Yanwei Fu and Leonid Sigal and SungJu Hwang and Yu-Gang Jiang and Xiangyang Xue},\nyear={2019},\nurl={https://openreview.net/forum?id=H1GaLiAcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1GaLiAcY7", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;3", "wc_review": "325;290;232", "wc_reply_reviewers": "0;0;34", "wc_reply_authors": "279;243;258", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 282.3333333333333, 38.35216928530756 ], "wc_reply_reviewers_avg": [ 11.333333333333334, 16.027753706895076 ], "wc_reply_authors_avg": [ 260.0, 14.7648230602334 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4925226052302238388&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "H1Gfx3Rqtm", "title": "End-to-End Hierarchical Text Classification with Label Assignment Policy", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present an end-to-end reinforcement learning approach to hierarchical text classification where documents are labeled by placing them at the right positions in a given hierarchy.\nWhile existing \u201cglobal\u201d methods construct hierarchical losses for model training, they either make \u201clocal\u201d decisions at each hierarchy node or ignore the hierarchy structure during inference. To close the gap between training/inference and optimize holistic metrics in an end-to-end manner, we propose to learn a label assignment policy to determine where to place the documents and when to stop. 
The proposed method, HiLAP, optimizes holistic metrics over the hierarchy, makes inter-dependent decisions during inference, and can be combined with different text encoding models for end-to-end training.\nExperiments on three public datasets show that HiLAP yields an average improvement of 33.4% in Macro-F1 and 5.0% in Samples-F1, outperforming state-of-the-art methods by a large margin.", "keywords": "Hierarchical Classification;Text Classification", "primary_area": "", "supplementary_material": "", "author": "Yuning Mao;Jingjing Tian;Jiawei Han;Xiang Ren", "authorids": "yuningm2@illinois.edu;tianjj97@pku.edu.cn;hanj@illinois.edu;xiangren@usc.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmao2019endtoend,\ntitle={End-to-End Hierarchical Text Classification with Label Assignment Policy},\nauthor={Yuning Mao and Jingjing Tian and Jiawei Han and Xiang Ren},\nyear={2019},\nurl={https://openreview.net/forum?id=H1Gfx3Rqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1Gfx3Rqtm", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "wc_review": "411;362;489", "wc_reply_reviewers": "0;251;0", "wc_reply_authors": "701;1432;431", "reply_reviewers": "0;2;0", "reply_authors": "1;4;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 420.6666666666667, 52.29616514515083 ], "wc_reply_reviewers_avg": [ 83.66666666666667, 118.32253471854894 ], "wc_reply_authors_avg": [ 854.6666666666666, 422.8556359904511 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=782779544336707208&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "H1Gnx2CqKQ", "title": "Hiding Objects from Detectors: Exploring Transferrable Adversarial Patterns", "track": "main", "status": "Withdraw", "tldr": "We focus on creating universal adversaries to fool object detectors and hide objects from the detectors. ", "abstract": "Adversaries in neural networks have drawn much attention since their first debut. \nWhile most existing methods aim at deceiving image classification models into misclassification or crafting attacks for specific object instances in object detection tasks, we focus on creating universal adversaries to fool object detectors and hide objects from the detectors. \nThe adversaries we examine are universal in three ways: \n(1) They are not specific to particular object instances; \n(2) They are image-independent; \n(3) They can further transfer to different unknown models. \nTo achieve this, we propose two novel techniques to improve the transferability of the adversaries: \\textit{piling-up} and \\textit{monochromatization}. \nBoth techniques prove to simplify the patterns of generated adversaries, and ultimately result in higher transferability. 
", "keywords": "adversarial;object detection", "primary_area": "", "supplementary_material": "", "author": "Shangbang Long;Jie Fu;Chris Pal", "authorids": "longlongsb@pku.edu.cn;jie.fu@polymtl.ca;christopher.pal@polymtl.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1Gnx2CqKQ", "pdf_size": 0, "rating": "3;4;6", "confidence": "3;4;4", "wc_review": "93;217;262", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "151;87;82", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 190.66666666666666, 71.46249987852993 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 106.66666666666667, 31.414787742222437 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8UuIuXeRua8J:scholar.google.com/&scioq=Hiding+Objects+from+Detectors:+Exploring+Transferrable+Adversarial+Patterns&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "H1M7soActX", "title": "The Anisotropic Noise in Stochastic Gradient Descent: Its Behavior of Escaping from Minima and Regularization Effects", "track": "main", "status": "Reject", "tldr": "We provide theoretical and empirical analysis on the role of anisotropic noise introduced by stochastic gradient on escaping from minima.", "abstract": "Understanding the behavior of stochastic gradient descent (SGD) in the context of deep neural networks has raised lots of concerns recently. Along this line, we theoretically study a general form of gradient based optimization dynamics with unbiased noise, which unifies SGD and standard Langevin dynamics. Through investigating this general optimization dynamics, we analyze the behavior of SGD on escaping from minima and its regularization effects. A novel indicator is derived to characterize the efficiency of escaping from minima through measuring the alignment of noise covariance and the curvature of loss function. Based on this indicator, two conditions are established to show which type of noise structure is superior to isotropic noise in term of escaping efficiency. We further show that the anisotropic noise in SGD satisfies the two conditions, and thus helps to escape from sharp and poor minima effectively, towards more stable and flat minima that typically generalize well. We verify our understanding through comparing\nthis anisotropic diffusion with full gradient descent plus isotropic diffusion (i.e. 
Langevin dynamics) and other types of position-dependent noise.", "keywords": "Stochastic gradient descent;anisotropic noise;regularization", "primary_area": "", "supplementary_material": "", "author": "Zhanxing Zhu;Jingfeng Wu;Bing Yu;Lei Wu;Jinwen Ma", "authorids": "zhanxing.zhu@pku.edu.cn;pkuwjf@pku.edu.cn;byu@pku.edu.cn;leiwu@pku.edu.cn;jwma@math.pku.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhu2019the,\ntitle={The Anisotropic Noise in Stochastic Gradient Descent: Its Behavior of Escaping from Minima and Regularization Effects},\nauthor={Zhanxing Zhu and Jingfeng Wu and Bing Yu and Lei Wu and Jinwen Ma},\nyear={2019},\nurl={https://openreview.net/forum?id=H1M7soActX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1M7soActX", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;3", "wc_review": "256;752;129", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 379.0, 268.79856150408744 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 265, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8530319537943237114&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "H1MBuiAqtX", "title": "Unicorn: Continual learning with a universal, off-policy agent", "track": "main", "status": "Reject", "tldr": "Agents learning jointly and off-policy about many tasks make progress on challenging continual learning domains.", "abstract": "Some real-world domains are best characterized as a single task, but for others this perspective is limiting. Instead, some tasks continually grow in complexity, in tandem with the agent's competence. In continual learning there are no explicit task boundaries or curricula. As learning agents have become more powerful, continual learning remains one of the frontiers that has resisted quick progress. To test continual learning capabilities we consider a challenging 3D domain with an implicit sequence of tasks and sparse rewards. We propose a novel agent architecture called Unicorn, which demonstrates strong continual learning and outperforms several baseline agents on the proposed domain. The agent achieves this by jointly representing and efficiently learning multiple policies for multiple goals, using a parallel off-policy learning setup. ", "keywords": "reinforcement learning;continual learning;universal value functions;off-policy learning;multi-task", "primary_area": "", "supplementary_material": "", "author": "Daniel J. 
Mankowitz;Augustin \u017d\u00eddek;Andr\u00e9 Barreto;Dan Horgan;Matteo Hessel;John Quan;Junhyuk Oh;Hado van Hasselt;David Silver;Tom Schaul", "authorids": "dmankowitz@google.com;augustinzidek@google.com;andrebarreto@google.com;horgan@google.com;mtthss@google.com;johnquan@google.com;junhyuk@google.com;hado@google.com;davidsilver@google.com;schaul@google.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@misc{\nmankowitz2019unicorn,\ntitle={Unicorn: Continual learning with a universal, off-policy agent},\nauthor={Daniel J. Mankowitz and Augustin \u017d\u00eddek and Andr\u00e9 Barreto and Dan Horgan and Matteo Hessel and John Quan and Junhyuk Oh and Hado van Hasselt and David Silver and Tom Schaul},\nyear={2019},\nurl={https://openreview.net/forum?id=H1MBuiAqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1MBuiAqtX", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;4", "wc_review": "211;193;470", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "464;385;385", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 291.3333333333333, 126.5499462223864 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 411.3333333333333, 37.2409571424915 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 10, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4532324389717718062&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "title": "Optimal Control Via Neural Networks: A Convex Approach", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/722", "id": "H1MW72AcK7", "author_site": "Yize Chen, Yuanyuan Shi, Baosen Zhang", "tldr": "", "abstract": "Control of complex systems involves both system identification and controller design. Deep neural networks have proven to be successful in many identification tasks, however, from model-based control perspective, these networks are difficult to work with because they are typically nonlinear and nonconvex. Therefore many systems are still identified and controlled based on simple linear models despite their poor representation capability.\n\nIn this paper we bridge the gap between model accuracy and control tractability faced by neural networks, by explicitly constructing networks that are convex with respect to their inputs. We show that these input convex networks can be trained to obtain accurate models of complex physical systems. In particular, we design input convex recurrent neural networks to capture temporal behavior of dynamical systems. Then optimal controllers can be achieved via solving a convex model predictive control problem. Experiment results demonstrate the good potential of the proposed input convex neural network based approach in a variety of control applications. 
In particular we show that in the MuJoCo locomotion tasks, we could achieve over 10% higher performance using 5 times less time compared with state-of-the-art model-based reinforcement learning method; and in the building HVAC control example, our method achieved up to 20% energy reduction compared with classic linear models.\n", "keywords": "optimal control;input convex neural network;convex optimization", "primary_area": "", "supplementary_material": "", "author": "Yize Chen;Yuanyuan Shi;Baosen Zhang", "authorids": "yizechen@uw.edu;yyshi@uw.edu;zhangbao@uw.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchen2018optimal,\ntitle={Optimal Control Via Neural Networks: A Convex Approach},\nauthor={Yize Chen and Yuanyuan Shi and Baosen Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1MW72AcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;4", "wc_review": "607;448;296", "wc_reply_reviewers": "537;0;0", "wc_reply_authors": "2788;582;347", "reply_reviewers": "1;0;0", "reply_authors": "5;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 450.3333333333333, 126.97593822803158 ], "wc_reply_reviewers_avg": [ 179.0, 253.144227664784 ], "wc_reply_authors_avg": [ 1239.0, 1099.5020084868725 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 257, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1075593248050773588&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1MW72AcK7", "pdf": "https://openreview.net/pdf?id=H1MW72AcK7", "email": ";;", "author_num": 3 }, { "title": "CBOW Is Not All You Need: Combining CBOW with the Compositional Matrix Space Model", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1054", "id": "H1MgjoR9tQ", "author_site": "Florian Mai, Lukas Galke, Ansgar Scherp", "tldr": "We present a novel training scheme for efficiently obtaining order-aware sentence representations.", "abstract": "Continuous Bag of Words (CBOW) is a powerful text embedding method. Due to its strong capabilities to encode word content, CBOW embeddings perform well on a wide range of downstream tasks while being efficient to compute. However, CBOW is not capable of capturing the word order. The reason is that the computation of CBOW's word embeddings is commutative, i.e., embeddings of XYZ and ZYX are the same. In order to address this shortcoming, we propose a\nlearning algorithm for the Continuous Matrix Space Model, which we call Continual Multiplication of Words (CMOW). Our algorithm is an adaptation of word2vec, so that it can be trained on large quantities of unlabeled text. We empirically show that CMOW better captures linguistic properties, but it is inferior to CBOW in memorizing word content. Motivated by these findings, we propose a hybrid model that combines the strengths of CBOW and CMOW. 
Our results show that the hybrid CBOW-CMOW-model retains CBOW's strong ability to memorize word content while at the same time substantially improving its ability to encode other linguistic information by 8%. As a result, the hybrid also performs better on 8 out of 11 supervised downstream tasks with an average improvement of 1.2%.", "keywords": "Text representation learning;Sentence embedding;Efficient training scheme;word2vec", "primary_area": "", "supplementary_material": "", "author": "Florian Mai;Lukas Galke;Ansgar Scherp", "authorids": "florian.ren.mai@googlemail.com;lga@informatik.uni-kiel.de;mail@ansgarscherp.net", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmai2018cbow,\ntitle={{CBOW} Is Not All You Need: Combining {CBOW} with the Compositional Matrix Space Model},\nauthor={Florian Mai and Lukas Galke and Ansgar Scherp},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1MgjoR9tQ},\n}", "github": "[![github](/images/github_icon.svg) florianmai/word2mat](https://github.com/florianmai/word2mat)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;4", "wc_review": "339;190;236", "wc_reply_reviewers": "0;10;68", "wc_reply_authors": "741;506;1094", "reply_reviewers": "0;1;1", "reply_authors": "1;1;2", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 255.0, 62.294997123899655 ], "wc_reply_reviewers_avg": [ 26.0, 29.97776954122282 ], "wc_reply_authors_avg": [ 780.3333333333334, 241.65586182742507 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6038502138949255694&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "openreview": "https://openreview.net/forum?id=H1MgjoR9tQ", "pdf": "https://openreview.net/pdf?id=H1MgjoR9tQ", "email": ";;", "author_num": 3 }, { "id": "H1MzKs05F7", "title": "Adversarial Vulnerability of Neural Networks Increases with Input Dimension", "track": "main", "status": "Reject", "tldr": "Neural nets have large gradients by design; that makes them adversarially vulnerable.", "abstract": "Over the past four years, neural networks have been proven vulnerable to adversarial images: targeted but imperceptible image perturbations lead to drastically different predictions. We show that adversarial vulnerability increases with the gradients of the training objective when viewed as a function of the inputs. For most current network architectures, we prove that the L1-norm of these gradients grows as the square root of the input size. These nets therefore become increasingly vulnerable with growing image size. 
Our proofs rely on the network\u2019s weight distribution at initialization, but extensive experiments confirm that our conclusions still hold after usual training.", "keywords": "adversarial vulnerability;neural networks;gradients;FGSM;adversarial data-augmentation;gradient regularization;robust optimization", "primary_area": "", "supplementary_material": "", "author": "Carl-Johann Simon-Gabriel;Yann Ollivier;L\u00e9on Bottou;Bernhard Sch\u00f6lkopf;David Lopez-Paz", "authorids": "cjsimon@tuebingen.mpg.de;yol@fb.com;leon@bottou.org;bs@tuebingen.mpg.de;dlp@fb.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nsimon-gabriel2019adversarial,\ntitle={Adversarial Vulnerability of Neural Networks Increases with Input Dimension},\nauthor={Carl-Johann Simon-Gabriel and Yann Ollivier and L\u00e9on Bottou and Bernhard Sch\u00f6lkopf and David Lopez-Paz},\nyear={2019},\nurl={https://openreview.net/forum?id=H1MzKs05F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=H1MzKs05F7", "pdf_size": 0, "rating": "4;5;6;9", "confidence": "5;5;4;4", "wc_review": "616;431;459;181", "wc_reply_reviewers": "852;0;432;0", "wc_reply_authors": "441;370;764;140", "reply_reviewers": "3;0;2;0", "reply_authors": "1;1;3;1", "rating_avg": [ 6.0, 1.8708286933869707 ], "confidence_avg": [ 4.5, 0.5 ], "wc_review_avg": [ 421.75, 155.85790804447493 ], "wc_reply_reviewers_avg": [ 321.0, 353.6820606137665 ], "wc_reply_authors_avg": [ 428.75, 223.25475918779424 ], "reply_reviewers_avg": [ 1.25, 1.299038105676658 ], "reply_authors_avg": [ 1.5, 0.8660254037844386 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8017837257372733, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5780158561780944700&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "H1V4QhAqYQ", "title": "Augment your batch: better training with larger batches", "track": "main", "status": "Reject", "tldr": "Improve accuracy by large batches composed of multiple instances of each sample at the same batch", "abstract": "Recently, there is regained interest in large batch training of neural networks, both of theory and practice. New insights and methods allowed certain models to be trained using large batches with no adverse impact on performance. Most works focused on accelerating wall clock training time by modifying the learning rate schedule, without introducing accuracy degradation. \nWe propose to use large batch training to boost accuracy and accelerate convergence by combining it with data augmentation. Our method, \"batch augmentation\", suggests using multiple instances of each sample at the same large batch. We show empirically that this simple yet effective method improves convergence and final generalization accuracy. 
We further suggest possible reasons for its success.", "keywords": "Large Batch Training;Augmentation;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Elad Hoffer;Itay Hubara;Niv Giladi;Daniel Soudry", "authorids": "elad.hoffer@gmail.com;itayhubara@gmail.com;giladiniv@gmail.com;daniel.soudry@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhoffer2019augment,\ntitle={Augment your batch: better training with larger batches},\nauthor={Elad Hoffer and Itay Hubara and Niv Giladi and Daniel Soudry},\nyear={2019},\nurl={https://openreview.net/forum?id=H1V4QhAqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1V4QhAqYQ", "pdf_size": 0, "rating": "4;4;8", "confidence": "4;4;3", "wc_review": "605;639;134", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "238;591;13", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 459.3333333333333, 230.46378360938962 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 280.6666666666667, 237.88839586850153 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6877208559271533414&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "H1e0-30qKm", "title": "Unlabeled Disentangling of GANs with Guided Siamese Networks", "track": "main", "status": "Reject", "tldr": "We use Siamese Networks to guide and disentangle the generation process in GANs without labeled data.", "abstract": "Disentangling underlying generative factors of a data distribution is important for interpretability and generalizable representations. In this paper, we introduce two novel disentangling methods. Our first method, Unlabeled Disentangling GAN (UD-GAN, unsupervised), decomposes the latent noise by generating similar/dissimilar image pairs and it learns a distance metric on these pairs with siamese networks and a contrastive loss. This pairwise approach provides consistent representations for similar data points. Our second method (UD-GAN-G, weakly supervised) modifies the UD-GAN with user-defined guidance functions, which restrict the information that goes into the siamese networks. This constraint helps UD-GAN-G to focus on the desired semantic variations in the data. We show that both our methods outperform existing unsupervised approaches in quantitative metrics that measure semantic accuracy of the learned representations. 
In addition, we illustrate that simple guidance functions we use in UD-GAN-G allow us to directly capture the desired variations in the data.", "keywords": "GAN;disentange;siamese networks;semantic", "primary_area": "", "supplementary_material": "", "author": "G\u00f6khan Yildirim;Nikolay Jetchev;Urs Bergmann", "authorids": "gokhan.yildirim@zalando.de;nikolay.jetchev@zalando.de;urs.bergmann@zalando.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyildirim2019unlabeled,\ntitle={Unlabeled Disentangling of {GAN}s with Guided Siamese Networks},\nauthor={G\u00f6khan Yildirim and Nikolay Jetchev and Urs Bergmann},\nyear={2019},\nurl={https://openreview.net/forum?id=H1e0-30qKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer5;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=H1e0-30qKm", "pdf_size": 0, "rating": "5;5;6;6", "confidence": "4;4;4;3", "wc_review": "228;201;847;234", "wc_reply_reviewers": "0;104;594;45", "wc_reply_authors": "112;232;1016;122", "reply_reviewers": "0;1;2;1", "reply_authors": "1;2;2;2", "rating_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "wc_review_avg": [ 377.5, 271.35078772688314 ], "wc_reply_reviewers_avg": [ 185.75, 238.57113718972795 ], "wc_reply_authors_avg": [ 370.5, 375.64178415080505 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BkqAi4LoU64J:scholar.google.com/&scioq=Unlabeled+Disentangling+of+GANs+with+Guided+Siamese+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "H1e572A5tQ", "title": "TarMAC: Targeted Multi-Agent Communication", "track": "main", "status": "Reject", "tldr": "Targeted communication in multi-agent cooperative reinforcement learning", "abstract": "We explore the collaborative multi-agent setting where a team of deep reinforcement learning agents attempt to solve a shared task in partially observable environments. In this scenario, learning an effective communication protocol is key. We propose a communication protocol that allows for targeted communication, where agents learn \\emph{what} messages to send and \\emph{who} to send them to. Additionally, we introduce a multi-stage communication approach where the agents co-ordinate via several rounds of communication before taking an action in the environment. We evaluate our approach on several cooperative multi-agent tasks, of varying difficulties with varying number of agents, in a variety of environments ranging from 2D grid layouts of shapes and simulated traffic junctions to complex 3D indoor environments. We demonstrate the benefits of targeted as well as multi-stage communication. 
Moreover, we show that the targeted communication strategies learned by the agents are quite interpretable and intuitive.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Abhishek Das;Theophile Gervet;Joshua Romoff;Dhruv Batra;Devi Parikh;Mike Rabbat;Joelle Pineau", "authorids": "abhshkdz@gatech.edu;tgervet@andrew.cmu.edu;joshua.romoff@mail.mcgill.ca;dbatra@gatech.edu;parikh@gatech.edu;mikerabbat@fb.com;jpineau@cs.mcgill.ca", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\ndas2019tarmac,\ntitle={Tar{MAC}: Targeted Multi-Agent Communication},\nauthor={Abhishek Das and Theophile Gervet and Joshua Romoff and Dhruv Batra and Devi Parikh and Mike Rabbat and Joelle Pineau},\nyear={2019},\nurl={https://openreview.net/forum?id=H1e572A5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1e572A5tQ", "pdf_size": 0, "rating": "6;6;6", "confidence": "5;4;5", "wc_review": "477;237;181", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1101;387;694", "reply_reviewers": "0;0;0", "reply_authors": "3;2;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 298.3333333333333, 128.38829991691438 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 727.3333333333334, 292.44068724367946 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 534, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12185105573950195413&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "H1e6ij0cKQ", "title": "EFFICIENT SEQUENCE LABELING WITH ACTOR-CRITIC TRAINING", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural approaches to sequence labeling often use a Conditional Random Field (CRF) to model their output dependencies, while Recurrent Neural Networks (RNN) are used for the same purpose in other tasks. We set out to establish RNNs as an attractive alternative to CRFs for sequence labeling. To do so, we address one of the RNN\u2019s most prominent shortcomings, the fact that it is not exposed to its own errors with the maximum-likelihood training. We frame the prediction of the output sequence as a sequential decision-making process, where we train the network with an adjusted actor-critic algorithm (AC-RNN). We comprehensively compare this strategy with maximum-likelihood training for both RNNs and CRFs on three structured-output tasks. The proposed AC-RNN efficiently matches the performance of the CRF on NER and CCG tagging, and outperforms it on Machine Transliteration. 
We also show that our training strategy is significantly better than other techniques for addressing RNN\u2019s exposure bias, such as Scheduled Sampling, and Self-Critical policy training.\n", "keywords": "Structured Prediction;Reinforcement Learning;NLP", "primary_area": "", "supplementary_material": "", "author": "Saeed Najafi;Colin Cherry;Greg Kondrak", "authorids": "snajafi@ualberta.ca;colin.a.cherry@gmail.com;gkondrak@ualberta.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnajafi2019efficient,\ntitle={{EFFICIENT} {SEQUENCE} {LABELING} {WITH} {ACTOR}-{CRITIC} {TRAINING}},\nauthor={Saeed Najafi and Colin Cherry and Greg Kondrak},\nyear={2019},\nurl={https://openreview.net/forum?id=H1e6ij0cKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1e6ij0cKQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;3;4", "wc_review": "488;214;221", "wc_reply_reviewers": "220;0;0", "wc_reply_authors": "219;59;38", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 307.6666666666667, 127.54694124996578 ], "wc_reply_reviewers_avg": [ 73.33333333333333, 103.70899457402697 ], "wc_reply_authors_avg": [ 105.33333333333333, 80.83041231835672 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1565135452660337088&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "H1e8wsCqYX", "title": "Laplacian Networks: Bounding Indicator Function Smoothness for Neural Networks Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": " For the past few years, Deep Neural Network (DNN) robustness has become a question of paramount importance. As a matter of fact, in sensitive settings misclassification can lead to dramatic consequences. Such misclassifications are likely to occur when facing adversarial attacks, hardware failures or limitations, and imperfect signal acquisition. To address this question, authors have proposed different approaches aiming at increasing the robustness of DNNs, such as adding regularizers or training using noisy examples. In this paper we propose a new regularizer built upon the Laplacian of similarity graphs obtained from the representation of training data at each layer of the DNN architecture. This regularizer penalizes large changes (across consecutive layers in the architecture) in the distance between examples of different classes, and as such enforces smooth variations of the class boundaries. Since it is agnostic to the type of deformations that are expected when predicting with the DNN, the proposed regularizer can be combined with existing ad-hoc methods. 
We provide theoretical justification for this regularizer and demonstrate its effectiveness to improve robustness of DNNs on classical supervised learning vision datasets.\n", "keywords": "GSP;robustness;noise;deep learning;neural networks", "primary_area": "", "supplementary_material": "", "author": "Carlos Eduardo Rosar Kos Lassance;Vincent Gripon;Antonio Ortega", "authorids": "carlos.rosarkoslassance@imt-atlantique.fr;vincent.gripon@imt-atlantique.fr;antonio.ortega@ee.usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlassance2019laplacian,\ntitle={Laplacian Networks: Bounding Indicator Function Smoothness for Neural Networks Robustness},\nauthor={Carlos Eduardo Rosar Kos Lassance and Vincent Gripon and Antonio Ortega},\nyear={2019},\nurl={https://openreview.net/forum?id=H1e8wsCqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1e8wsCqYX", "pdf_size": 0, "rating": "5;5;9", "confidence": "4;3;5", "wc_review": "427;386;241", "wc_reply_reviewers": "239;0;0", "wc_reply_authors": "1705;394;314", "reply_reviewers": "1;0;0", "reply_authors": "4;1;1", "rating_avg": [ 6.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 351.3333333333333, 79.79278719839837 ], "wc_reply_reviewers_avg": [ 79.66666666666667, 112.66568046905657 ], "wc_reply_authors_avg": [ 804.3333333333334, 637.7043898930671 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844387, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5926202143246592749&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 14 }, { "id": "H1eH4n09KX", "title": "Adversarial Audio Super-Resolution with Unsupervised Feature Losses", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural network-based methods have recently demonstrated state-of-the-art results on image synthesis and super-resolution tasks, in particular by using variants of generative adversarial networks (GANs) with supervised feature losses. Nevertheless, previous feature loss formulations rely on the availability of large auxiliary classifier networks, and labeled datasets that enable such classifiers to be trained. Furthermore, there has been comparatively little work to explore the applicability of GAN-based methods to domains other than images and video. In this work we explore a GAN-based method for audio processing, and develop a convolutional neural network architecture to perform audio super-resolution. In addition to several new architectural building blocks for audio processing, a key component of our approach is the use of an autoencoder-based loss that enables training in the GAN framework, with feature losses derived from unlabeled data. 
We explore the impact of our architectural choices, and demonstrate significant improvements over previous works in terms of both objective and perceptual quality.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sung Kim;Visvesh Sathe", "authorids": "sungmk@umich.edu;sathe@uw.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkim2019adversarial,\ntitle={Adversarial Audio Super-Resolution with Unsupervised Feature Losses},\nauthor={Sung Kim and Visvesh Sathe},\nyear={2019},\nurl={https://openreview.net/forum?id=H1eH4n09KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1eH4n09KX", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "wc_review": "636;168;1133", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1511;494;1264", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 645.6666666666666, 394.01889407601874 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1089.6666666666667, 433.10224607539914 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4179535002059536384&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "H1eMBn09Km", "title": "Using GANs for Generation of Realistic City-Scale Ride Sharing/Hailing Data Sets", "track": "main", "status": "Reject", "tldr": "This paper focuses on the synthetic generation of human mobility data in urban areas using GANs. ", "abstract": "This paper focuses on the synthetic generation of human mobility data in urban areas. We present a novel and scalable application of Generative Adversarial Networks (GANs) for modeling and generating human mobility data. We leverage actual ride requests from ride sharing/hailing services from four major cities in the US to train our GANs model. Our model captures the spatial and temporal variability of the ride-request patterns observed for all four cities on any typical day and over any typical week. Previous works have succinctly characterized the spatial and temporal properties of human mobility data sets using the fractal dimensionality and the densification power law, respectively, which we utilize to validate our GANs-generated synthetic data sets. 
Such synthetic data sets can avoid privacy concerns and be extremely useful for researchers and policy makers on urban mobility and intelligent transportation.", "keywords": "ride-sharing;generative modeling;parallelization;application", "primary_area": "", "supplementary_material": "", "author": "Abhinav Jauhri;Brad Stocks;Jian Hui Li;Koichi Yamada;John Paul Shen", "authorids": "ajauhri@cmu.edu;brad.stocks@sv.cmu.edu;jian.hui.li@intel.com;koichi.yamada@intel.com;jpshen@cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\njauhri2019using,\ntitle={Using {GAN}s for Generation of Realistic City-Scale Ride Sharing/Hailing Data Sets},\nauthor={Abhinav Jauhri and Brad Stocks and Jian Hui Li and Koichi Yamada and John Paul Shen},\nyear={2019},\nurl={https://openreview.net/forum?id=H1eMBn09Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1eMBn09Km", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "wc_review": "461;607;272", "wc_reply_reviewers": "0;62;0", "wc_reply_authors": "854;839;589", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 446.6666666666667, 137.1382109001313 ], "wc_reply_reviewers_avg": [ 20.666666666666668, 29.227080289043965 ], "wc_reply_authors_avg": [ 760.6666666666666, 121.54103102336354 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QiCzm0Dn3TEJ:scholar.google.com/&scioq=Using+GANs+for+Generation+of+Realistic+City-Scale+Ride+Sharing/Hailing+Data+Sets&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "id": "H1eRBoC9FX", "title": "Unsupervised Meta-Learning for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "Remove the burden of task distribution specification in meta-reinforcement learning by using unsupervised exploration", "abstract": "Meta-learning is a powerful tool that learns how to quickly adapt a model to new tasks. In the context of reinforcement learning, meta-learning algorithms can acquire reinforcement learning procedures to solve new problems more efficiently by meta-learning prior tasks. The performance of meta-learning algorithms critically depends on the tasks available for meta-training: in the same way that supervised learning algorithms generalize best to test points drawn from the same distribution as the training points, meta-learning methods generalize best to tasks from the same distribution as the meta-training tasks. In effect, meta-reinforcement learning offloads the design burden from algorithm design to task design. If we can automate the process of task design as well, we can devise a meta-learning algorithm that is truly automated. In this work, we take a step in this direction, proposing a family of unsupervised meta-learning algorithms for reinforcement learning. We describe a general recipe for unsupervised meta-reinforcement learning, and describe an effective instantiation of this approach based on a recently proposed unsupervised exploration technique and model-agnostic meta-learning. 
We also discuss practical and conceptual considerations for developing unsupervised meta-learning methods. Our experimental results demonstrate that unsupervised meta-reinforcement learning effectively acquires accelerated reinforcement learning procedures without the need for manual task design, significantly exceeds the performance of learning from scratch, and even matches performance of meta-learning methods that use hand-specified task distributions.", "keywords": "Meta-Learning;Reinforcement Learning;Exploration;Unsupervised", "primary_area": "", "supplementary_material": "", "author": "Abhishek Gupta;Benjamin Eysenbach;Chelsea Finn;Sergey Levine", "authorids": "abhigupta@berkeley.edu;eysenbachbe@gmail.com;cbfinn@eecs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngupta2019unsupervised,\ntitle={Unsupervised Meta-Learning for Reinforcement Learning},\nauthor={Abhishek Gupta and Benjamin Eysenbach and Chelsea Finn and Sergey Levine},\nyear={2019},\nurl={https://openreview.net/forum?id=H1eRBoC9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1eRBoC9FX", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;2;3", "wc_review": "599;559;273", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 477.0, 145.17116334405628 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.3273268353539886, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15092894714895568430&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "H1eRIoA5Y7", "title": "Low-Cost Parameterizations of Deep Convolutional Neural Networks", "track": "main", "status": "Withdraw", "tldr": "This paper introduces efficient and economic parametrizations of convolutional neural networks motivated by partial differential equations ", "abstract": "Convolutional Neural Networks (CNNs) filter the input data using a series of spatial convolution operators with compactly supported stencils and point-wise nonlinearities.\nCommonly, the convolution operators couple features from all channels.\nFor wide networks, this leads to immense computational cost in the training of and prediction with CNNs.\nIn this paper, we present novel ways to parameterize the convolution more efficiently, aiming to decrease the number of parameters in CNNs and their computational complexity.\nWe propose new architectures that use a sparser coupling between the channels and thereby reduce both the number of trainable weights and the computational cost of the CNN.\nOur architectures arise as new types of residual neural network (ResNet) that can be seen as discretizations of a Partial Differential Equations (PDEs) and thus have predictable theoretical properties. Our first architecture involves a convolution operator with a special sparsity structure, and is applicable to a large class of CNNs. 
Next, we present an architecture that can be seen as a discretization of a diffusion reaction PDE, and use it with three different convolution operators. We outline in our experiments that the proposed architectures, although considerably reducing the number of trainable weights, yield comparable accuracy to existing CNNs that are fully coupled in the channel dimension.\n", "keywords": "Deep Learning;Classification;Partial Differential Equations", "primary_area": "", "supplementary_material": "", "author": "Eran Treister;Lars Ruthotto;Michal Sharoni;Sapir Zafrani;Eldad Haber", "authorids": "erant@bgu.ac.il;lruthotto@emory.edu;sharmic@post.bgu.ac.il;sapirza@post.bgu.ac.il;ehaber@eos.ubc.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1eRIoA5Y7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;5", "wc_review": "595;240;327", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 387.3333333333333, 151.0768752067047 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14426655219465762158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "Stochastic Optimization of Sorting Networks via Continuous Relaxations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/920", "id": "H1eSS3CcKX", "author_site": "Aditya Grover, Eric J. Wang, Aaron Zweig, Stefano Ermon", "tldr": "We provide a continuous relaxation to the sorting operator, enabling end-to-end, gradient-based stochastic optimization.", "abstract": "Sorting input objects is an important step in many machine learning pipelines. However, the sorting operator is non-differentiable with respect to its inputs, which prohibits end-to-end gradient-based optimization. In this work, we propose NeuralSort, a general-purpose continuous relaxation of the output of the sorting operator from permutation matrices to the set of unimodal row-stochastic matrices, where every row sums to one and has a distinct argmax. This relaxation permits straight-through optimization of any computational graph involve a sorting operation. Further, we use this relaxation to enable gradient-based stochastic optimization over the combinatorially large space of permutations by deriving a reparameterized gradient estimator for the Plackett-Luce family of distributions over permutations. 
We demonstrate the usefulness of our framework on three tasks that require learning semantic orderings of high-dimensional objects, including a fully differentiable, parameterized extension of the k-nearest neighbors algorithm", "keywords": "continuous relaxations;sorting;permutation;stochastic computation graphs;Plackett-Luce", "primary_area": "", "supplementary_material": "", "author": "Aditya Grover;Eric Wang;Aaron Zweig;Stefano Ermon", "authorids": "adityag@cs.stanford.edu;ejwang@cs.stanford.edu;azweig@cs.stanford.edu;ermon@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ngrover2018stochastic,\ntitle={Stochastic Optimization of Sorting Networks via Continuous Relaxations},\nauthor={Aditya Grover and Eric Wang and Aaron Zweig and Stefano Ermon},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1eSS3CcKX},\n}", "github": "[![github](/images/github_icon.svg) ermongroup/neuralsort](https://github.com/ermongroup/neuralsort)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;3;4", "wc_review": "270;464;296", "wc_reply_reviewers": "0;0;6", "wc_reply_authors": "798;622;362", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 343.3333333333333, 85.98191024214854 ], "wc_reply_reviewers_avg": [ 2.0, 2.8284271247461903 ], "wc_reply_authors_avg": [ 594.0, 179.0940162782293 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 195, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10619362619006891050&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=H1eSS3CcKX", "pdf": "https://openreview.net/pdf?id=H1eSS3CcKX", "email": ";;;", "author_num": 4 }, { "id": "H1eZ6sRcFm", "title": "Variational Autoencoders for Text Modeling without Weakening the Decoder", "track": "main", "status": "Withdraw", "tldr": "We propose a model of variational autoencoders for text modeling without weakening the decoder, which improves the quality of text generation and interpretability of acquired representations.", "abstract": "Previous work (Bowman et al., 2015; Yang et al., 2017) has found difficulty developing generative models based on variational autoencoders (VAEs) for text. To address the problem of the decoder ignoring information from the encoder (posterior collapse), these previous models weaken the capacity of the decoder to force the model to use information from latent variables. However, this strategy is not ideal as it degrades the quality of generated text and increases hyper-parameters. In this paper, we propose a new VAE for text utilizing a multimodal prior distribution, a modified encoder, and multi-task learning. We show our model can generate well-conditioned sentences without weakening the capacity of the decoder. 
Also, the multimodal prior distribution improves the interpretability of acquired representations.", "keywords": "variational autoencoders;generative model;deep neural network;text modeling;unsupervised learning;multimodal", "primary_area": "", "supplementary_material": "", "author": "Ryo Kamoi;Hiroyasu Fukutomi", "authorids": "ryo_kamoi_st@keio.jp;hiroyasu.fukutomi@datasection.co.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1eZ6sRcFm", "pdf_size": 0, "rating": "1;4;4", "confidence": "4;5;3", "wc_review": "605;1052;233", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 630.0, 334.82234095113785 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OeukyN6fdIsJ:scholar.google.com/&scioq=Variational+Autoencoders+for+Text+Modeling+without+Weakening+the+Decoder&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "H1eadi0cFQ", "title": "Escaping Flat Areas via Function-Preserving Structural Network Modifications", "track": "main", "status": "Reject", "tldr": "If optimization gets stuck in a saddle, we add a filter to a CNN in a specific way in order to escape the saddle.", "abstract": "Hierarchically embedding smaller networks in larger networks, e.g.~by increasing the number of hidden units, has been studied since the 1990s. The main interest was in understanding possible redundancies in the parameterization, as well as in studying how such embeddings affect critical points. We take these results as a point of departure to devise a novel strategy for escaping from flat regions of the error surface and to address the slow-down of gradient-based methods experienced in plateaus of saddle points. The idea is to expand the dimensionality of a network in a way that guarantees the existence of new escape directions. We call this operation the opening of a tunnel. One may then continue with the larger network either temporarily, i.e.~closing the tunnel later, or permanently, i.e.~iteratively growing the network, whenever needed. We develop our method for fully-connected as well as convolutional layers. Moreover, we present a practical version of our algorithm that requires no network structure modification and can be deployed as plug-and-play into any current deep learning framework. 
Experimentally, our method shows significant speed-ups.", "keywords": "deep learning;cnn;structural modification;optimization;saddle point", "primary_area": "", "supplementary_material": "", "author": "Yannic Kilcher;Gary B\u00e9cigneul;Thomas Hofmann", "authorids": "yannic.kilcher@inf.ethz.ch;garybecigneul06@gmail.com;thomas.hofmann@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkilcher2019escaping,\ntitle={Escaping Flat Areas via Function-Preserving Structural Network Modifications},\nauthor={Yannic Kilcher and Gary B\u00e9cigneul and Thomas Hofmann},\nyear={2019},\nurl={https://openreview.net/forum?id=H1eadi0cFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1eadi0cFQ", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;3", "wc_review": "449;319;624", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "491;0;700", "reply_reviewers": "0;0;0", "reply_authors": "1;0;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 464.0, 124.96666222103664 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 397.0, 293.40188592895356 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9239198573541294884&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Adaptivity of deep ReLU network for learning in Besov and mixed smooth Besov spaces: optimal rate and curse of dimensionality", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/895", "id": "H1ebTsActm", "tldr": "", "abstract": "Deep learning has shown high performances in various types of tasks from visual recognition to natural language processing,\nwhich indicates superior flexibility and adaptivity of deep learning.\nTo understand this phenomenon theoretically, we develop a new approximation and estimation error analysis of \ndeep learning with the ReLU activation for functions in a Besov space and its variant with mixed smoothness.\nThe Besov space is a considerably general function space including the Holder space and Sobolev space, and especially can capture spatial inhomogeneity of smoothness. Through the analysis in the Besov space, it is shown that deep learning can achieve the minimax optimal rate and outperform any non-adaptive (linear) estimator such as kernel ridge regression,\nwhich shows that deep learning has higher adaptivity to the spatial inhomogeneity of the target function than other estimators such as linear ones. In addition to this, it is shown that deep learning can avoid the curse of dimensionality if the target function is in a mixed smooth Besov space. We also show that the dependency of the convergence rate on the dimensionality is tight due to its minimax optimality. 
These results support high adaptivity of deep learning and its superior ability as a feature extractor.\n", "keywords": "deep learning theory;approximation analysis;generalization error analysis;Besov space;minimax optimality", "primary_area": "", "supplementary_material": "", "author": "Taiji Suzuki", "authorids": "taiji@mist.i.u-tokyo.ac.jp", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nsuzuki2018adaptivity,\ntitle={Adaptivity of deep Re{LU} network for learning in Besov and mixed smooth Besov spaces: optimal rate and curse of dimensionality},\nauthor={Taiji Suzuki},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1ebTsActm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;8", "confidence": "2;2;2", "wc_review": "453;176;438", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "762;372;937", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 2.0, 0.0 ], "wc_review_avg": [ 355.6666666666667, 127.19101994856223 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 690.3333333333334, 236.1614325460917 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 304, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5691160361865711721&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1ebTsActm", "pdf": "https://openreview.net/pdf?id=H1ebTsActm", "email": "", "author_num": 1 }, { "id": "H1ecDoR5Y7", "title": "Local Stability and Performance of Simple Gradient Penalty $\\mu$-Wasserstein GAN", "track": "main", "status": "Reject", "tldr": "This paper deals with stability of simple gradient penalty $\\mu$-WGAN optimization by introducing a concept of measure valued differentiation.", "abstract": "Wasserstein GAN(WGAN) is a model that minimizes the Wasserstein distance between a data distribution and sample distribution. Recent studies have proposed stabilizing the training process for the WGAN and implementing the Lipschitz constraint. In this study, we prove the local stability of optimizing the simple gradient penalty $\\mu$-WGAN(SGP $\\mu$-WGAN) under suitable assumptions regarding the equilibrium and penalty measure $\\mu$. The measure valued differentiation concept is employed to deal with the derivative of the penalty terms, which is helpful for handling abstract singular measures with lower dimensional support. Based on this analysis, we claim that penalizing the data manifold or sample manifold is the key to regularizing the original WGAN with a gradient penalty. 
Experimental results obtained with unintuitive penalty measures that satisfy our assumptions are also provided to support our theoretical results.", "keywords": "WGAN;gradient penalty;stability;measure valued differentiation", "primary_area": "", "supplementary_material": "", "author": "Cheolhyeong Kim;Seungtae Park;Hyung Ju Hwang", "authorids": "tyty4@postech.ac.kr;swash21@postech.ac.kr;hjhwang@postech.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkim2019local,\ntitle={Local Stability and Performance of Simple Gradient Penalty $\\mu$-Wasserstein {GAN}},\nauthor={Cheolhyeong Kim and Seungtae Park and Hyung Ju Hwang},\nyear={2019},\nurl={https://openreview.net/forum?id=H1ecDoR5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1ecDoR5Y7", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;4", "wc_review": "243;278;301", "wc_reply_reviewers": "0;46;0", "wc_reply_authors": "540;1373;753", "reply_reviewers": "0;1;0", "reply_authors": "1;3;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 274.0, 23.84673283002656 ], "wc_reply_reviewers_avg": [ 15.333333333333334, 21.684607956387456 ], "wc_reply_authors_avg": [ 888.6666666666666, 353.34245271250506 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16612410064768650666&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Generating Multiple Objects at Spatially Distinct Locations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/736", "id": "H1edIiA9KQ", "author_site": "Tobias Hinz, Stefan Heinrich, Stefan Wermter", "tldr": "Extend GAN architecture to obtain control over locations and identities of multiple objects within generated images.", "abstract": "Recent improvements to Generative Adversarial Networks (GANs) have made it possible to generate realistic images in high resolution based on natural language descriptions such as image captions. Furthermore, conditional GANs allow us to control the image generation process through labels or even natural language descriptions. However, fine-grained control of the image layout, i.e. where in the image specific objects should be located, is still difficult to achieve. This is especially true for images that should contain multiple distinct objects at different spatial locations. We introduce a new approach which allows us to control the location of arbitrarily many objects within an image by adding an object pathway to both the generator and the discriminator. Our approach does not need a detailed semantic layout but only bounding boxes and the respective labels of the desired objects are needed. The object pathway focuses solely on the individual objects and is iteratively applied at the locations specified by the bounding boxes. The global pathway focuses on the image background and the general image layout. We perform experiments on the Multi-MNIST, CLEVR, and the more complex MS-COCO data set. 
Our experiments show that through the use of the object pathway we can control object locations within images and can model complex scenes with multiple objects at various locations. We further show that the object pathway focuses on the individual objects and learns features relevant for these, while the global pathway focuses on global image characteristics and the image background.", "keywords": "controllable image generation;text-to-image synthesis;generative model;generative adversarial network;gan", "primary_area": "", "supplementary_material": "", "author": "Tobias Hinz;Stefan Heinrich;Stefan Wermter", "authorids": "hinz@informatik.uni-hamburg.de;heinrich@informatik.uni-hamburg.de;wermter@informatik.uni-hamburg.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhinz2018generating,\ntitle={Generating Multiple Objects at Spatially Distinct Locations},\nauthor={Tobias Hinz and Stefan Heinrich and Stefan Wermter},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1edIiA9KQ},\n}", "github": "[![github](/images/github_icon.svg) tohinz/multiple-objects-gan](https://github.com/tohinz/multiple-objects-gan)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "wc_review": "221;1156;103", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1192;1753;72", "reply_reviewers": "0;0;0", "reply_authors": "2;3;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 493.3333333333333, 471.04588122838123 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1005.6666666666666, 698.7991286644698 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 129, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13574885695794039292&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=H1edIiA9KQ", "pdf": "https://openreview.net/pdf?id=H1edIiA9KQ", "email": ";;", "author_num": 3 }, { "id": "H1eiZnAqKm", "title": "The Expressive Power of Gated Recurrent Units as a Continuous Dynamical System", "track": "main", "status": "Reject", "tldr": "We classify the the dynamical features one and two GRU cells can and cannot capture in continuous time, and verify our findings experimentally with k-step time series prediction. ", "abstract": "Gated recurrent units (GRUs) were inspired by the common gated recurrent unit, long short-term memory (LSTM), as a means of capturing temporal structure with less complex memory unit architecture. Despite their incredible success in tasks such as natural and artificial language processing, speech, video, and polyphonic music, very little is understood about the specific dynamic features representable in a GRU network. As a result, it is difficult to know a priori how successful a GRU-RNN will perform on a given data set. 
In this paper, we develop a new theoretical framework to analyze one and two dimensional GRUs as a continuous dynamical system, and classify the dynamical features obtainable with such system.\nWe found rich repertoire that includes stable limit cycles over time (nonlinear oscillations), multi-stable state transitions with various topologies, and homoclinic orbits. In addition, we show that any finite dimensional GRU cannot precisely replicate the dynamics of a ring attractor, or more generally, any continuous attractor, and is limited to finitely many isolated fixed points in theory. These findings were then experimentally verified in two dimensions by means of time series prediction.", "keywords": "Gated Recurrent Units;Recurrent Neural Network;Time Series Predictions;interpretable;Nonlinear Dynamics;Dynamical Systems", "primary_area": "", "supplementary_material": "", "author": "Ian D. Jordan;Piotr Aleksander Sokol;Il Memming Park", "authorids": "ian.jordan@stonybrook.edu;piotr.sokol@stonybrook.edu;memming.park@stonybrook.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\njordan2019the,\ntitle={The Expressive Power of Gated Recurrent Units as a Continuous Dynamical System},\nauthor={Ian D. Jordan and Piotr Aleksander Sokol and Il Memming Park},\nyear={2019},\nurl={https://openreview.net/forum?id=H1eiZnAqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1eiZnAqKm", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "wc_review": "419;237;1073", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "430;180;1050", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 576.3333333333334, 358.9701318432434 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 553.3333333333334, 365.7260662785134 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4184840251979598412&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Near-Optimal Representation Learning for Hierarchical Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/713", "id": "H1emus0qF7", "author_site": "Ofir Nachum, Shixiang Gu, Honglak Lee, Sergey Levine", "tldr": "We translate a bound on sub-optimality of representations to a practical training objective in the context of hierarchical reinforcement learning.", "abstract": "We study the problem of representation learning in goal-conditioned hierarchical reinforcement learning. In such hierarchical structures, a higher-level controller solves tasks by iteratively communicating goals which a lower-level policy is trained to reach. Accordingly, the choice of representation -- the mapping of observation space to goal space -- is crucial. To study this problem, we develop a notion of sub-optimality of a representation, defined in terms of expected reward of the optimal hierarchical policy using this representation. 
We derive expressions which bound the sub-optimality and show how these expressions can be translated to representation learning objectives which may be optimized in practice. Results on a number of difficult continuous-control tasks show that our approach to representation learning yields qualitatively better representations as well as quantitatively better hierarchical policies, compared to existing methods.", "keywords": "representation hierarchy reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Ofir Nachum;Shixiang Gu;Honglak Lee;Sergey Levine", "authorids": "ofirnachum@google.com;shanegu@google.com;honglak@google.com;svlevine@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nnachum2018nearoptimal,\ntitle={Near-Optimal Representation Learning for Hierarchical Reinforcement Learning},\nauthor={Ofir Nachum and Shixiang Gu and Honglak Lee and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1emus0qF7},\n}", "github": "[![github](/images/github_icon.svg) tensorflow/models](https://github.com/tensorflow/models) + [![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=H1emus0qF7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "pdf_size": 0, "rating": "7;8;9", "confidence": "5;3;5", "wc_review": "336;507;775", "wc_reply_reviewers": "157;2;0", "wc_reply_authors": "837;851;307", "reply_reviewers": "2;1;0", "reply_authors": "3;2;1", "rating_avg": [ 8.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 539.3333333333334, 180.67343160766302 ], "wc_reply_reviewers_avg": [ 53.0, 73.5436378394941 ], "wc_reply_authors_avg": [ 665.0, 253.20874129197566 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 258, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17682749665983906973&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=H1emus0qF7", "pdf": "https://openreview.net/pdf?id=H1emus0qF7", "email": ";;;", "author_num": 4 }, { "title": "Understanding Composition of Word Embeddings via Tensor Decomposition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/746", "id": "H1eqjiCctX", "author_site": "Abraham Frandsen, Rong Ge", "tldr": "We present a generative model for compositional word embeddings that captures syntactic relations, and provide empirical verification and evaluation.", "abstract": "Word embedding is a powerful tool in natural language processing. In this paper we consider the problem of word embedding composition \\--- given vector representations of two words, compute a vector for the entire phrase. We give a generative model that can capture specific syntactic relations between words. Under our model, we prove that the correlations between three words (measured by their PMI) form a tensor that has an approximate low rank Tucker decomposition. The result of the Tucker decomposition gives the word embeddings as well as a core tensor, which can be used to produce better compositions of the word embeddings. 
We also complement our theoretical results with experiments that verify our assumptions, and demonstrate the effectiveness of the new composition method.", "keywords": "word embeddings;semantic composition;tensor decomposition", "primary_area": "", "supplementary_material": "", "author": "Abraham Frandsen;Rong Ge", "authorids": "abef@cs.duke.edu;rongge@cs.duke.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nfrandsen2018understanding,\ntitle={Understanding Composition of Word Embeddings via Tensor Decomposition},\nauthor={Abraham Frandsen and Rong Ge},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1eqjiCctX},\n}", "github": "[![github](/images/github_icon.svg) abefrandsen/syntactic-rand-walk](https://github.com/abefrandsen/syntactic-rand-walk)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;2", "wc_review": "450;507;200", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "354;366;91", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 385.6666666666667, 133.33249999739581 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 270.3333333333333, 126.90241220017145 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9072436238425463642&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1eqjiCctX", "pdf": "https://openreview.net/pdf?id=H1eqjiCctX", "email": ";", "author_num": 2 }, { "id": "H1eqviAqYX", "title": "Why Do Neural Response Generation Models Prefer Universal Replies?", "track": "main", "status": "Reject", "tldr": "Analyze the reason for neural response generative models preferring universal replies; Propose a method to avoid it.", "abstract": "Recent advances in neural Sequence-to-Sequence (Seq2Seq) models reveal a purely data-driven approach to the response generation task. Despite its diverse variants and applications, the existing Seq2Seq models are prone to producing short and generic replies, which blocks such neural network architectures from being utilized in practical open-domain response generation tasks. In this research, we analyze this critical issue from the perspective of the optimization goal of models and the specific characteristics of human-to-human conversational corpora. Our analysis is conducted by decomposing the goal of Neural Response Generation (NRG) into the optimizations of word selection and ordering. It can be derived from the decomposing that Seq2Seq based NRG models naturally tend to select common words to compose responses, and ignore the semantic of queries in word ordering. On the basis of the analysis, we propose a max-marginal ranking regularization term to avoid Seq2Seq models from producing the generic and uninformative responses. 
The empirical experiments on benchmarks with several metrics have validated our analysis and proposed methodology.", "keywords": "Neural Response Generation;Universal Replies;Optimization Goal Analysis;Max-Marginal Ranking Regularization", "primary_area": "", "supplementary_material": "", "author": "Bowen Wu;Nan Jiang;Zhifeng Gao;Zongsheng Wang;Suke Li;Wenge Rong;Baoxun Wang", "authorids": "jasonwbw@yahoo.com;nanjiang@buaa.edu.cn;gao_zhifeng@pku.edu.cn;jasonwang0512@gmail.com;lisuke@ss.pku.edu.cn;w.rong@buaa.edu.cn;baoxun.wang@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nwu2019why,\ntitle={Why Do Neural Response Generation Models Prefer Universal Replies?},\nauthor={Bowen Wu and Nan Jiang and Zhifeng Gao and Zongsheng Wang and Suke Li and Wenge Rong and Baoxun Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=H1eqviAqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1eqviAqYX", "pdf_size": 0, "rating": "1;3;7", "confidence": "5;4;3", "wc_review": "333;577;168", "wc_reply_reviewers": "128;334;0", "wc_reply_authors": "1195;1407;354", "reply_reviewers": "1;1;0", "reply_authors": "3;3;1", "rating_avg": [ 3.6666666666666665, 2.494438257849294 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 359.3333333333333, 168.00859766359844 ], "wc_reply_reviewers_avg": [ 154.0, 137.58875923078406 ], "wc_reply_authors_avg": [ 985.3333333333334, 454.7323999403996 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.9819805060619656, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10176896758067535375&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Structured Neural Summarization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/998", "id": "H1ersoRqtm", "author_site": "Patrick Fernandes, Miltiadis Allamanis, Marc Brockschmidt", "tldr": "One simple trick to improve sequence models: Compose them with a graph model", "abstract": "Summarization of long sequences into a concise statement is a core problem in natural language processing, requiring non-trivial understanding of the input. Based on the promising results of graph neural networks on highly structured data, we develop a framework to extend existing sequence encoders with a graph component that can reason about long-distance relationships in weakly structured data such as text. 
In an extensive evaluation, we show that the resulting hybrid sequence-graph models outperform both pure sequence models as well as pure graph models on a range of summarization tasks.", "keywords": "Summarization;Graphs;Source Code", "primary_area": "", "supplementary_material": "", "author": "Patrick Fernandes;Miltiadis Allamanis;Marc Brockschmidt", "authorids": "t-pafern@microsoft.com;miallama@microsoft.com;mabrocks@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nfernandes2018structured,\ntitle={Structured Neural Summarization},\nauthor={Patrick Fernandes and Miltiadis Allamanis and Marc Brockschmidt},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1ersoRqtm},\n}", "github": "[![github](/images/github_icon.svg) CoderPat/structured-neural-summarization](https://github.com/CoderPat/structured-neural-summarization) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=H1ersoRqtm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;3", "wc_review": "264;198;490", "wc_reply_reviewers": "0;0;65", "wc_reply_authors": "635;69;936", "reply_reviewers": "0;0;2", "reply_authors": "2;1;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 317.3333333333333, 125.03155157354838 ], "wc_reply_reviewers_avg": [ 21.666666666666668, 30.641293851417057 ], "wc_reply_authors_avg": [ 546.6666666666666, 359.42021213182886 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 279, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5961913139611201410&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=H1ersoRqtm", "pdf": "https://openreview.net/pdf?id=H1ersoRqtm", "email": ";;", "author_num": 3 }, { "title": "Graph Wavelet Neural Network", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/710", "id": "H1ewdiR5tQ", "author_site": "Bingbing Xu, Huawei Shen, Qi Cao, Yunqi Qiu, Xueqi Cheng", "tldr": "We present graph wavelet neural network (GWNN), a novel graph convolutional neural network (CNN), leveraging graph wavelet transform to address the shortcoming of previous spectral graph CNN methods that depend on graph Fourier transform.", "abstract": "We present graph wavelet neural network (GWNN), a novel graph convolutional neural network (CNN), leveraging graph wavelet transform to address the shortcomings of previous spectral graph CNN methods that depend on graph Fourier transform. Different from graph Fourier transform, graph wavelet transform can be obtained via a fast algorithm without requiring matrix eigendecomposition with high computational cost. Moreover, graph wavelets are sparse and localized in vertex domain, offering high efficiency and good interpretability for graph convolution. 
The proposed GWNN significantly outperforms previous spectral graph CNNs in the task of graph-based semi-supervised classification on three benchmark datasets: Cora, Citeseer and Pubmed.", "keywords": "graph convolution;graph wavelet transform;graph Fourier transform;semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Bingbing Xu;Huawei Shen;Qi Cao;Yunqi Qiu;Xueqi Cheng", "authorids": "xubingbing@ict.ac.cn;shenhuawei@ict.ac.cn;caoqi@ict.ac.cn;qiuyunqi@ict.ac.cn;cxq@ict.ac.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nxu2018graph,\ntitle={Graph Wavelet Neural Network},\nauthor={Bingbing Xu and Huawei Shen and Qi Cao and Yunqi Qiu and Xueqi Cheng},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1ewdiR5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;4;5", "wc_review": "360;635;311", "wc_reply_reviewers": "364;67;0", "wc_reply_authors": "1949;1308;691", "reply_reviewers": "2;2;0", "reply_authors": "5;4;1", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 435.3333333333333, 142.5957767802243 ], "wc_reply_reviewers_avg": [ 143.66666666666666, 158.18203297326644 ], "wc_reply_authors_avg": [ 1316.0, 513.6075025412564 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], "reply_authors_avg": [ 3.3333333333333335, 1.699673171197595 ], "replies_avg": [ 34, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 492, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10385380643777669724&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=H1ewdiR5tQ", "pdf": "https://openreview.net/pdf?id=H1ewdiR5tQ", "email": ";;;;", "author_num": 5 }, { "id": "H1f7S3C9YQ", "title": "SynonymNet: Multi-context Bilateral Matching for Entity Synonyms", "track": "main", "status": "Reject", "tldr": "We introduce SynonymNet, a deep model for entity synonym discovery by a bilateral matching among multiple pieces of contexts in which an entity is mentioned.", "abstract": "Being able to automatically discover synonymous entities from a large free-text corpus has transformative effects on structured knowledge discovery. Existing works either require structured annotations, or fail to incorporate context information effectively, which lowers the efficiency of information usage. In this paper, we propose a framework for synonym discovery from a free-text corpus without structured annotation. As one of the key components in synonym discovery, we introduce a novel neural network model SynonymNet to determine whether or not two given entities are synonyms of each other. Instead of using entity features, SynonymNet makes use of multiple pieces of contexts in which the entity is mentioned, and compares the context-level similarity via a bilateral matching schema to determine synonymity.
Experimental results demonstrate that the proposed model achieves state-of-the-art results on both generic and domain-specific synonym datasets: Wiki+Freebase, PubMed+UMLS and MedBook+MKG, with up to 4.16% improvement in terms of Area Under the Curve (AUC) and 3.19% in terms of Mean Average Precision (MAP) compare to the best baseline method.", "keywords": "deep learning;entity synonym", "primary_area": "", "supplementary_material": "", "author": "Chenwei Zhang;Yaliang Li;Nan Du;Wei Fan;Philip S. Yu", "authorids": "czhang99@uic.edu;yaliangli@tencent.com;ndu@tencent.com;davidwfan@tencent.com;psyu@uic.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhang2019synonymnet,\ntitle={SynonymNet: Multi-context Bilateral Matching for Entity Synonyms},\nauthor={Chenwei Zhang and Yaliang Li and Nan Du and Wei Fan and Philip S. Yu},\nyear={2019},\nurl={https://openreview.net/forum?id=H1f7S3C9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1f7S3C9YQ", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;5", "wc_review": "305;1016;316", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "599;925;11", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 545.6666666666666, 332.6062069307921 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 511.6666666666667, 378.214518788243 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7846778543457677217&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "H1fF0iR9KX", "title": "Geometry aware convolutional filters for omnidirectional images representation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Due to their wide field of view, omnidirectional cameras are frequently used by autonomous vehicles, drones and robots for navigation and other computer vision tasks. The images captured by such cameras, are often analysed and classified with techniques designed for planar images that unfortunately fail to properly handle the native geometry of such images. That results in suboptimal performance, and lack of truly meaningful visual features. In this paper we aim at improving popular deep convolutional neural networks so that they can properly take into account the specific properties of omnidirectional data. In particular we propose an algorithm that adapts convolutional layers, which often serve as a core building block of a CNN, to the properties of omnidirectional images. Thus, our filters have a shape and size that adapts with the location on the omnidirectional image. We show that our method is not limited to spherical surfaces and is able to incorporate the knowledge about any kind of omnidirectional geometry inside the deep learning network. 
As depicted by our experiments, our method outperforms the existing deep neural network techniques for omnidirectional image classification and compression tasks.", "keywords": "omnidirectional images;classification;deep learning;graph signal processing", "primary_area": "", "supplementary_material": "", "author": "Renata Khasanova;Pascal Frossard", "authorids": "renata.khasanova@epfl.ch;pascal.frossard@epfl.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkhasanova2019geometry,\ntitle={Geometry aware convolutional filters for omnidirectional images representation},\nauthor={Renata Khasanova and Pascal Frossard},\nyear={2019},\nurl={https://openreview.net/forum?id=H1fF0iR9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=H1fF0iR9KX", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;4", "wc_review": "430;629;174", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "684;640;387", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 411.0, 186.23819873126635 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 570.3333333333334, 130.8748341822148 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6219932625056515178&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "A rotation-equivariant convolutional neural network model of primary visual cortex", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/922", "id": "H1fU8iAqKX", "author_site": "Alexander Ecker, Fabian H Sinz, Emmanouil Froudarakis, Paul Fahey, Santiago Cadena, Edgar Walker, Erick M Cobos, Jacob Reimer, Andreas Tolias, Matthias Bethge", "tldr": "A rotation-equivariant CNN model of V1 that outperforms previous models and suggest functional groupings of V1 neurons.", "abstract": "Classical models describe primary visual cortex (V1) as a filter bank of orientation-selective linear-nonlinear (LN) or energy models, but these models fail to predict neural responses to natural stimuli accurately. Recent work shows that convolutional neural networks (CNNs) can be trained to predict V1 activity more accurately, but it remains unclear which features are extracted by V1 neurons beyond orientation selectivity and phase invariance. Here we work towards systematically studying V1 computations by categorizing neurons into groups that perform similar computations. We present a framework for identifying common features independent of individual neurons' orientation selectivity by using a rotation-equivariant convolutional neural network, which automatically extracts every feature at multiple different orientations. We fit this rotation-equivariant CNN to responses of a population of 6000 neurons to natural images recorded in mouse primary visual cortex using two-photon imaging. We show that our rotation-equivariant network outperforms a regular CNN with the same number of feature maps and reveals a number of common features, which are shared by many V1 neurons and are pooled sparsely to predict neural activity. 
Our findings are a first step towards a powerful new tool to study the nonlinear functional organization of visual cortex.", "keywords": "rotation equivariance;equivariance;primary visual cortex;V1;neuroscience;system identification", "primary_area": "", "supplementary_material": "", "author": "Alexander S. Ecker;Fabian H. Sinz;Emmanouil Froudarakis;Paul G. Fahey;Santiago A. Cadena;Edgar Y. Walker;Erick Cobos;Jacob Reimer;Andreas S. Tolias;Matthias Bethge", "authorids": "alexander.ecker@uni-tuebingen.de;sinz@bcm.edu;froudara@bcm.edu;paul.fahey@bcm.edu;sa.cadena721@gmail.com;eywalker@bcm.edu;emcobost@gmail.com;reimer@bcm.edu;astolias@bcm.edu;matthias.bethge@bethgelab.org", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\necker2018a,\ntitle={A rotation-equivariant convolutional neural network model of primary visual cortex},\nauthor={Alexander S. Ecker and Fabian H. Sinz and Emmanouil Froudarakis and Paul G. Fahey and Santiago A. Cadena and Edgar Y. Walker and Erick Cobos and Jacob Reimer and Andreas S. Tolias and Matthias Bethge},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1fU8iAqKX},\n}", "github": "[![github](/images/github_icon.svg) aecker/cnn-sys-ident](https://github.com/aecker/cnn-sys-ident/tree/master/analysis/iclr2019)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;4;3", "wc_review": "547;593;273", "wc_reply_reviewers": "632;283;0", "wc_reply_authors": "1661;605;247", "reply_reviewers": "4;2;0", "reply_authors": "7;3;1", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 471.0, 141.26098777322304 ], "wc_reply_reviewers_avg": [ 305.0, 258.4814629072396 ], "wc_reply_authors_avg": [ 837.6666666666666, 600.2495777220968 ], "reply_reviewers_avg": [ 2.0, 1.632993161855452 ], "reply_authors_avg": [ 3.6666666666666665, 2.494438257849294 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 10, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16775727253632927156&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1fU8iAqKX", "pdf": "https://openreview.net/pdf?id=H1fU8iAqKX", "email": ";;;;;;;;;", "author_num": 10 }, { "id": "H1faSn0qY7", "title": "DL2: Training and Querying Neural Networks with Logic", "track": "main", "status": "Reject", "tldr": "A differentiable loss for logic constraints for training and querying neural networks.", "abstract": "We present DL2, a system for training and querying neural networks with logical constraints. The key idea is to translate these constraints into a differentiable loss with desirable mathematical properties and to then either train with this loss in an iterative manner or to use the loss for querying the network for inputs subject to the constraints. 
We empirically demonstrate that DL2 is effective in both training and querying scenarios, across a range of constraints and data sets.", "keywords": "neural networks;training with constraints;querying networks;semantic training", "primary_area": "", "supplementary_material": "", "author": "Marc Fischer;Mislav Balunovic;Dana Drachsler-Cohen;Timon Gehr;Ce Zhang;Martin Vechev", "authorids": "marcfisc@student.ethz.ch;bmislav@student.ethz.ch;dana.drachsler@inf.ethz.ch;timon.gehr@inf.ethz.ch;ce.zhang@inf.ethz.ch;martin.vechev@inf.ethz.ch", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nfischer2019dl,\ntitle={{DL}2: Training and Querying Neural Networks with Logic},\nauthor={Marc Fischer and Mislav Balunovic and Dana Drachsler-Cohen and Timon Gehr and Ce Zhang and Martin Vechev},\nyear={2019},\nurl={https://openreview.net/forum?id=H1faSn0qY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1faSn0qY7", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;2;4", "wc_review": "300;256;654", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "299;485;802", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 403.3333333333333, 178.155986583543 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 528.6666666666666, 207.65730315969037 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 231, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14132347304439794371&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "H1fevoAcKX", "title": "Globally Soft Filter Pruning For Efficient Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper proposes a cumulative saliency based Globally Soft Filter Pruning (GSFP) scheme to prune redundant filters of Convolutional Neural Networks (CNNs). Specifically, the GSFP adopts a robust pruning method, which measures the global redundancy of the filter in the whole model by using the soft pruning strategy. In addition, in the model recovery process after pruning, we use the cumulative saliency strategy to improve the accuracy of pruning. GSFP has two advantages over previous works: (1) More accurate pruning guidance. For a pre-trained CNN model, the saliency of the filter varies with different input data. Therefore, accumulating the saliency of the filter over the entire data set can provide more accurate guidance for pruning. On the other hand, pruning from a global perspective is more accurate than local pruning. (2) More robust pruning strategy.
We propose a reasonable normalization formula to prevent certain layers of filters in the network from being completely clipped due to excessive pruning rate.", "keywords": "Filter Pruning;Model Compression;Efficient Convolutional Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Ke Xu;Xiaoyun Wang;Qun Jia;Jianjing An;Dong Wang", "authorids": "17112071@bjtu.edu.cn;16120304@bjtu.edu.cn;16120347@bjtu.edu.cn;16112065@bjtu.edu.cn;wangdong@bjtu.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nxu2019globally,\ntitle={Globally Soft Filter Pruning For Efficient Convolutional Neural Networks},\nauthor={Ke Xu and Xiaoyun Wang and Qun Jia and Jianjing An and Dong Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=H1fevoAcKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1fevoAcKX", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;4", "wc_review": "432;256;67", "wc_reply_reviewers": "0;82;0", "wc_reply_authors": "370;355;402", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 251.66666666666666, 149.0421267815095 ], "wc_reply_reviewers_avg": [ 27.333333333333332, 38.6551707048646 ], "wc_reply_authors_avg": [ 375.6666666666667, 19.601587237318874 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3870900949181577834&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "H1fs4oRqKm", "title": "UNSUPERVISED MONOCULAR DEPTH ESTIMATION WITH CLEAR BOUNDARIES", "track": "main", "status": "Withdraw", "tldr": "This paper propose a mask method which solves the previous blurred results of unsupervised monocular depth estimation caused by occlusion", "abstract": "Unsupervised monocular depth estimation has made great progress after deep\nlearning is involved. Training with binocular stereo images is considered as a\ngood option as the data can be easily obtained. However, the depth or disparity\nprediction results show poor performance for the object boundaries. The main\nreason is related to the handling of occlusion areas during the training. In this paper,\nwe propose a novel method to overcome this issue. Exploiting disparity maps\nproperty, we generate an occlusion mask to block the back-propagation of the occlusion\nareas during image warping. We also design new networks with flipped\nstereo images to induce the networks to learn occluded boundaries. 
Experiments show that\nour method achieves clearer boundaries and better evaluation results on the KITTI\ndriving dataset and the Virtual KITTI dataset.", "keywords": "monocular depth estimation;unsupervised learning;image warping", "primary_area": "", "supplementary_material": "", "author": "Yihan Hu;Heng Luo;Yifeng Geng", "authorids": "y4hu@eng.ucsd.edu;heng.luo@horizon.ai;yifeng.geng@horizon.ai", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1fs4oRqKm", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;3", "wc_review": "246;305;237", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 262.6666666666667, 30.158838763380057 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fAwNMFtOQaIJ:scholar.google.com/&scioq=UNSUPERVISED+MONOCULAR+DEPTH+ESTIMATION+WITH+CLEAR+BOUNDARIES&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "H1fsUiRcKQ", "title": "Fast adversarial training for semi-supervised learning", "track": "main", "status": "Reject", "tldr": "We propose a fast and efficient semi-supervised learning method using adversarial training.", "abstract": "In semi-supervised learning, the Bad GAN approach is one of the most attractive methods due to its intuitive simplicity and powerful performance. Bad GAN learns a classifier with bad samples distributed on the complement of the support of the input data. But Bad GAN needs additional architectures, a generator and a density estimation model, which involves a huge computation and memory consumption cost. VAT is another good semi-supervised learning algorithm, which\nutilizes unlabeled data to improve the invariance of the classifier with respect to perturbation of inputs. In this study, we propose a new method by combining the ideas of Bad GAN and VAT. The proposed method generates bad samples of high quality by use of the adversarial training used in VAT. We give theoretical explanations of why the adversarial training is good at both generating bad samples and semi-supervised learning. An advantage of the proposed method is that it achieves competitive performance with much less computation.
We demonstrate the advantages of our method through various experiments on well-known benchmark image datasets.", "keywords": "Deep learning;Semi-supervised learning;Adversarial training", "primary_area": "", "supplementary_material": "", "author": "Dongha Kim;Yongchan Choi;Jae-Joon Han;Changkyu Choi;Yongdai Kim", "authorids": "dongha0718@hanmail.net;pminer32@gmail.com;jae-joon.han@samsung.com;changkyu_choi@samsung.com;ydkim0903@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkim2019fast,\ntitle={Fast adversarial training for semi-supervised learning},\nauthor={Dongha Kim and Yongchan Choi and Jae-Joon Han and Changkyu Choi and Yongdai Kim},\nyear={2019},\nurl={https://openreview.net/forum?id=H1fsUiRcKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1fsUiRcKQ", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "wc_review": "420;518;249", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "974;1082;264", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 395.6666666666667, 111.1585454304896 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 773.3333333333334, 362.84186944483434 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10227122811299067292&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Supervised Community Detection with Line Graph Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1059", "id": "H1g0Z3A9Fm", "author_site": "Zhengdao Chen, Xiang Li, Joan Bruna", "tldr": "We propose a novel graph neural network architecture based on the non-backtracking matrix defined over the edge adjacencies and demonstrate its effectiveness in community detection tasks on graphs.", "abstract": "Community detection in graphs can be solved via spectral methods or posterior inference under certain probabilistic graphical models. Focusing on random graph families such as the stochastic block model, recent research has unified both approaches and identified both statistical and computational detection thresholds in terms of the signal-to-noise ratio. By recasting community detection as a node-wise classification problem on graphs, we can also study it from a learning perspective. We present a novel family of Graph Neural Networks (GNNs) for solving community detection problems in a supervised learning setting. We show that, in a data-driven manner and without access to the underlying generative models, they can match or even surpass the performance of the belief propagation algorithm on binary and multiclass stochastic block models, which is believed to reach the computational threshold in these cases. In particular, we propose to augment GNNs with the non-backtracking operator defined on the line graph of edge adjacencies. The GNNs achieve good performance on real-world datasets.
In addition, we perform the first analysis of the optimization landscape of using (linear) GNNs to solve community detection problems, demonstrating that under certain simplifications and assumptions, the loss value at any local minimum is close to the loss value at the global minimum/minima.", "keywords": "community detection;graph neural networks;belief propagation;energy landscape;non-backtracking matrix", "primary_area": "", "supplementary_material": "", "author": "Zhengdao Chen;Lisha Li;Joan Bruna", "authorids": "zc1216@nyu.edu;lapis.lazuli.8@gmail.com;bruna@cims.nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchen2018supervised,\ntitle={Supervised Community Detection with Line Graph Neural Networks},\nauthor={Zhengdao Chen and Lisha Li and Joan Bruna},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1g0Z3A9Fm},\n}", "github": "[![github](/images/github_icon.svg) zhengdao-chen/GNN4CD](https://github.com/zhengdao-chen/GNN4CD) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=H1g0Z3A9Fm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;8;9", "confidence": "4;4;4", "wc_review": "347;402;257", "wc_reply_reviewers": "0;58;0", "wc_reply_authors": "400;265;173", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 7.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 335.3333333333333, 59.768070256803696 ], "wc_reply_reviewers_avg": [ 19.333333333333332, 27.34146220587984 ], "wc_reply_authors_avg": [ 279.3333333333333, 93.22493705489369 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 437, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5008209229610559765&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=H1g0Z3A9Fm", "pdf": "https://openreview.net/pdf?id=H1g0Z3A9Fm", "email": ";;", "author_num": 3 }, { "id": "H1g0piA9tQ", "title": "Evaluation Methodology for Attacks Against Confidence Thresholding Models", "track": "main", "status": "Reject", "tldr": "We present metrics and an optimal attack for evaluating models that defend against adversarial examples using confidence thresholding", "abstract": "Current machine learning algorithms can be easily fooled by adversarial examples. One possible solution path is to make models that use confidence thresholding to avoid making mistakes. Such models refuse to make a prediction when they are not confident of their answer. We propose to evaluate such models in terms of tradeoff curves with the goal of high success rate on clean examples and low failure rate on adversarial examples. Existing untargeted attacks developed for models that do not use confidence thresholding tend to underestimate such models' vulnerability. We propose the MaxConfidence family of attacks, which are optimal in a variety of theoretical settings, including one realistic setting: attacks against linear models. Experiments show the attack attains good results in practice. 
We show that simple defenses are able to perform well on MNIST but not on CIFAR, contributing further to previous calls that MNIST should be retired as a benchmarking dataset for adversarial robustness research. We release code for these evaluations as part of the cleverhans (Papernot et al 2018) library (ICLR reviewers should be careful not to look at who contributed these features to cleverhans to avoid de-anonymizing this submission).", "keywords": "adversarial examples", "primary_area": "", "supplementary_material": "", "author": "Ian Goodfellow;Yao Qin;David Berthelot", "authorids": "goodfellow@google.com;yaoqin@google.com;dberth@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngoodfellow2019evaluation,\ntitle={Evaluation Methodology for Attacks Against Confidence Thresholding Models},\nauthor={Ian Goodfellow and Yao Qin and David Berthelot},\nyear={2019},\nurl={https://openreview.net/forum?id=H1g0piA9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1g0piA9tQ", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;4", "wc_review": "598;238;396", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;76;0", "reply_reviewers": "0;0;0", "reply_authors": "0;1;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 410.6666666666667, 147.33484162123892 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 25.333333333333332, 35.82674358011841 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=48210926468945123&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Multiple-Attribute Text Rewriting", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1011", "id": "H1g2NhC5KQ", "author_site": "Guillaume Lample, Sandeep Subramanian, Eric Smith, Ludovic Denoyer, Marc'Aurelio Ranzato, Y-Lan Boureau", "tldr": "A system for rewriting text conditioned on multiple controllable attributes", "abstract": "The dominant approach to unsupervised \"style transfer'' in text is based on the idea of learning a latent representation, which is independent of the attributes specifying its \"style''. In this paper, we show that this condition is not necessary and is not always met in practice, even with domain adversarial training that explicitly aims at learning such disentangled representations. We thus propose a new model that controls several factors of variation in textual data where this condition on disentanglement is replaced with a simpler mechanism based on back-translation. Our method allows control over multiple attributes, like gender, sentiment, product type, etc., and a more fine-grained control on the trade-off between content preservation and change of style with a pooling operator in the latent space. 
Our experiments demonstrate that the fully entangled model produces better generations, even when tested on new and more challenging benchmarks comprising reviews with multiple sentences and multiple attributes.", "keywords": "controllable text generation;generative models;conditional generative models;style transfer", "primary_area": "", "supplementary_material": "", "author": "Guillaume Lample;Sandeep Subramanian;Eric Smith;Ludovic Denoyer;Marc'Aurelio Ranzato;Y-Lan Boureau", "authorids": "glample@fb.com;sandeep.subramanian.1@umontreal.ca;ems@fb.com;ludovic.denoyer@lip6.fr;ranzato@fb.com;ylan@fb.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nlample2018multipleattribute,\ntitle={Multiple-Attribute Text Rewriting},\nauthor={Guillaume Lample and Sandeep Subramanian and Eric Smith and Ludovic Denoyer and Marc'Aurelio Ranzato and Y-Lan Boureau},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1g2NhC5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "wc_review": "219;633;315", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "274;1510;18", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 389.0, 176.92936443677178 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 600.6666666666666, 651.4339738010462 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 278, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10354201005697789334&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=H1g2NhC5KQ", "pdf": "https://openreview.net/pdf?id=H1g2NhC5KQ", "email": ";;;;;", "author_num": 6 }, { "title": "Wasserstein Barycenter Model Ensembling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1048", "id": "H1g4k309F7", "author_site": "Pierre Dognin, Igor Melnyk, Youssef Mroueh, Jarret Ross, Cicero Nogueira dos Santos, Tom Sercu", "tldr": "we propose to use Wasserstein barycenters for semantic model ensembling", "abstract": "In this paper we propose to perform model ensembling in a multiclass or a multilabel learning setting using Wasserstein (W.) barycenters. Optimal transport metrics, such as the Wasserstein distance, allow incorporating semantic side information such as word embeddings. Using W. barycenters to find the consensus between models allows us to balance confidence and semantics in finding the agreement between the models. We show applications of Wasserstein ensembling in attribute-based classification, multilabel learning and image captioning generation. These results show that the W. 
ensembling is a viable alternative to the basic geometric or arithmetic mean ensembling.", "keywords": "Wasserstein barycenter model ensembling", "primary_area": "", "supplementary_material": "", "author": "Pierre Dognin*;Igor Melnyk*;Youssef Mroueh*;Jarret Ross*;Cicero Dos Santos*;Tom Sercu*", "authorids": "pdognin@us.ibm.com;igor.melnyk@ibm.com;mroueh@us.ibm.com;rossja@us.ibm.com;cicerons@us.ibm.com;tom.sercu1@ibm.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ndognin2018wasserstein,\ntitle={Wasserstein Barycenter Model Ensembling},\nauthor={Pierre Dognin and Igor Melnyk and Youssef Mroueh and Jarret Ross and Cicero Dos Santos and Tom Sercu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1g4k309F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;4", "wc_review": "464;261;354", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "769;720;810", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 359.6666666666667, 82.97121321411555 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 766.3333333333334, 36.790699307780976 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9651886521360061542&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1g4k309F7", "pdf": "https://openreview.net/pdf?id=H1g4k309F7", "email": ";;;;;", "author_num": 6 }, { "title": "Policy Transfer with Strategy Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/918", "id": "H1g6osRcFQ", "author_site": "Wenhao Yu, C. Liu, Greg Turk", "tldr": "We propose a policy transfer algorithm that can overcome large and challenging discrepancies in the system dynamics such as latency, actuator modeling error, etc.", "abstract": "Computer simulation provides an automatic and safe way for training robotic control\npolicies to achieve complex tasks such as locomotion. However, a policy\ntrained in simulation usually does not transfer directly to the real hardware due\nto the differences between the two environments. Transfer learning using domain\nrandomization is a promising approach, but it usually assumes that the target environment\nis close to the distribution of the training environments, thus relying\nheavily on accurate system identification. In this paper, we present a different\napproach that leverages domain randomization for transferring control policies to\nunknown environments. The key idea is that, instead of learning a single policy in\nthe simulation, we simultaneously learn a family of policies that exhibit different\nbehaviors. When tested in the target environment, we directly search for the best\npolicy in the family based on the task performance, without the need to identify\nthe dynamic parameters.
We evaluate our method on five simulated robotic control\nproblems with different discrepancies in the training and testing environment\nand demonstrate that our method can overcome larger modeling errors compared\nto training a robust policy or an adaptive policy.", "keywords": "transfer learning;reinforcement learning;modeling error;strategy optimization", "primary_area": "", "supplementary_material": "", "author": "Wenhao Yu;C. Karen Liu;Greg Turk", "authorids": "wyu68@gatech.edu;karenliu@cc.gatech.edu;turk@cc.gatech.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyu2018policy,\ntitle={Policy Transfer with Strategy Optimization},\nauthor={Wenhao Yu and C. Karen Liu and Greg Turk},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1g6osRcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "572;175;298", "wc_reply_reviewers": "0;0;84", "wc_reply_authors": "487;154;822", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 348.3333333333333, 165.93640013236663 ], "wc_reply_reviewers_avg": [ 28.0, 39.59797974644666 ], "wc_reply_authors_avg": [ 487.6666666666667, 272.71026546297975 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3099719291478959869&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1g6osRcFQ", "pdf": "https://openreview.net/pdf?id=H1g6osRcFQ", "email": ";;", "author_num": 3 }, { "id": "H1gDgn0qY7", "title": "A Study of Robustness of Neural Nets Using Approximate Feature Collisions", "track": "main", "status": "Reject", "tldr": "", "abstract": "In recent years, various studies have focused on the robustness of neural nets. While it is known that neural nets are not robust to examples with adversarially chosen perturbations as a result of linear operations on the input data, we show in this paper there could be a convex polytope within which all examples are misclassified by neural nets due to the properties of ReLU activation functions. We propose a way to find such polytopes empirically and demonstrate that such polytopes exist in practice. Furthermore, we show that such polytopes exist even after constraining the examples to be a composition of image patches, resulting in perceptibly different examples at different locations in the polytope that are all misclassified. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ke Li*;Tianhao Zhang*;Jitendra Malik", "authorids": "ke.li@eecs.berkeley.edu;bryanzhang@berkeley.edu;malik@eecs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli*2019a,\ntitle={A Study of Robustness of Neural Nets Using Approximate Feature Collisions},\nauthor={Ke Li* and Tianhao Zhang* and Jitendra Malik},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gDgn0qY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1gDgn0qY7", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;3", "wc_review": "854;439;199", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1245;835;276", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 497.3333333333333, 270.5652519366734 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 785.3333333333334, 397.1484469173824 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gs7YLZm-XyoJ:scholar.google.com/&scioq=A+Study+of+Robustness+of+Neural+Nets+Using+Approximate+Feature+Collisions&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "H1gFuiA9KX", "title": "Skip-gram word embeddings in hyperbolic space", "track": "main", "status": "Reject", "tldr": "", "abstract": "Embeddings of tree-like graphs in hyperbolic space were recently shown to surpass their Euclidean counterparts in performance by a large margin.\nInspired by these results, we present an algorithm for learning word embeddings in hyperbolic space from free text. An objective function based on the hyperbolic distance is derived and included in the skip-gram negative-sampling architecture from word2vec. The hyperbolic word embeddings are then evaluated on word similarity and analogy benchmarks. The results demonstrate the potential of hyperbolic word embeddings, particularly in low dimensions, though without clear superiority over their Euclidean counterparts. We further discuss subtleties in the formulation of the analogy task in curved spaces.", "keywords": "word embeddings;hyperbolic;skip-gram", "primary_area": "", "supplementary_material": "", "author": "Matthias Leimeister;Benjamin J. Wilson", "authorids": "matthias@lateral.io;benjamin@lateral.io", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nleimeister2019skipgram,\ntitle={Skip-gram word embeddings in hyperbolic space},\nauthor={Matthias Leimeister and Benjamin J. 
Wilson},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gFuiA9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1gFuiA9KX", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;3", "wc_review": "234;250;351", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "83;232;149", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 278.3333333333333, 51.79661078573471 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 154.66666666666666, 60.960825307915755 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1336407241440959501&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "code2seq: Generating Sequences from Structured Representations of Code", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/646", "id": "H1gKYo09tX", "author_site": "Uri Alon, Shaked Brody, Omer Levy, Eran Yahav", "tldr": "We leverage the syntactic structure of source code to generate natural language sequences.", "abstract": "The ability to generate natural language sequences from source code snippets has a variety of applications such as code summarization, documentation, and retrieval. Sequence-to-sequence (seq2seq) models, adopted from neural machine translation (NMT), have achieved state-of-the-art performance on these tasks by treating source code as a sequence of tokens. We present code2seq: an alternative approach that leverages the syntactic structure of programming languages to better encode source code. Our model represents a code snippet as the set of compositional paths in its abstract syntax tree (AST) and uses attention to select the relevant paths while decoding.\nWe demonstrate the effectiveness of our approach for two tasks, two programming languages, and four datasets of up to 16M examples. Our model significantly outperforms previous models that were specifically designed for programming languages, as well as general state-of-the-art NMT models. An interactive online demo of our model is available at http://code2seq.org. 
Our code, data and trained models are available at http://github.com/tech-srl/code2seq.", "keywords": "source code;programs;code2seq", "primary_area": "", "supplementary_material": "", "author": "Uri Alon;Shaked Brody;Omer Levy;Eran Yahav", "authorids": "urialon1@gmail.com;shakedbr@cs.technion.ac.il;omerlevy@gmail.com;yahave@cs.technion.ac.il", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nalon2018codeseq,\ntitle={code2seq: Generating Sequences from Structured Representations of Code},\nauthor={Uri Alon and Omer Levy and Eran Yahav},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gKYo09tX},\n}", "github": "[![github](/images/github_icon.svg) tech-srl/code2seq](https://github.com/tech-srl/code2seq) + [![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=H1gKYo09tX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "wc_review": "521;503;806", "wc_reply_reviewers": "373;341;0", "wc_reply_authors": "1136;1075;1116", "reply_reviewers": "1;2;0", "reply_authors": "2;3;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 610.0, 138.787607515945 ], "wc_reply_reviewers_avg": [ 238.0, 168.7977093051522 ], "wc_reply_authors_avg": [ 1109.0, 25.39028685672272 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 919, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14844338714783082531&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1gKYo09tX", "pdf": "https://openreview.net/pdf?id=H1gKYo09tX", "email": ";;;", "author_num": 4 }, { "title": "Predict then Propagate: Graph Neural Networks meet Personalized PageRank", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1117", "id": "H1gL-2A9Ym", "author_site": "Johannes Gasteiger, Aleksandar Bojchevski, Stephan G\u00fcnnemann", "tldr": "Personalized propagation of neural predictions (PPNP) improves graph neural networks by separating them into prediction and propagation via personalized PageRank.", "abstract": "Neural message passing algorithms for semi-supervised classification on graphs have recently achieved great success. However, for classifying a node these methods only consider nodes that are a few propagation steps away and the size of this utilized neighborhood is hard to extend. In this paper, we use the relationship between graph convolutional networks (GCN) and PageRank to derive an improved propagation scheme based on personalized PageRank. We utilize this propagation procedure to construct a simple model, personalized propagation of neural predictions (PPNP), and its fast approximation, APPNP. Our model's training time is on par or faster and its number of parameters on par or lower than previous models. It leverages a large, adjustable neighborhood for classification and can be easily combined with any neural network. 
We show that this model outperforms several recently proposed methods for semi-supervised classification in the most thorough study done so far for GCN-like models. Our implementation is available online.", "keywords": "Graph;GCN;GNN;Neural network;Graph neural network;Message passing neural network;Semi-supervised classification;Semi-supervised learning;PageRank;Personalized PageRank", "primary_area": "", "supplementary_material": "", "author": "Johannes Gasteiger;Aleksandar Bojchevski;Stephan G\u00fcnnemann", "authorids": "klicpera@in.tum.de;a.bojchevski@in.tum.de;guennemann@in.tum.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngasteiger2018combining,\ntitle={Combining Neural Networks with Personalized PageRank for Classification on Graphs},\nauthor={Johannes Gasteiger and Aleksandar Bojchevski and Stephan G\u00fcnnemann},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gL-2A9Ym},\n}", "github": "[![github](/images/github_icon.svg) klicperajo/ppnp](https://github.com/klicperajo/ppnp) + [![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=H1gL-2A9Ym)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "wc_review": "300;480;194", "wc_reply_reviewers": "114;195;0", "wc_reply_authors": "947;481;64", "reply_reviewers": "1;1;0", "reply_authors": "3;2;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 324.6666666666667, 118.05460130333289 ], "wc_reply_reviewers_avg": [ 103.0, 79.98749902328488 ], "wc_reply_authors_avg": [ 497.3333333333333, 360.6682070207402 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2182, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12842465886565513517&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=H1gL-2A9Ym", "pdf": "https://openreview.net/pdf?id=H1gL-2A9Ym", "email": ";;", "author_num": 3 }, { "title": "Slimmable Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/796", "id": "H1gMCsAqY7", "author_site": "Jiahui Yu, Linjie Yang, Ning Xu, Jianchao Yang, Thomas Huang", "tldr": "We present a simple and general method to train a single neural network executable at different widths (number of channels in a layer), permitting instant and adaptive accuracy-efficiency trade-offs at runtime.", "abstract": "We present a simple and general method to train a single neural network executable at different widths (number of channels in a layer), permitting instant and adaptive accuracy-efficiency trade-offs at runtime. Instead of training individual networks with different width configurations, we train a shared network with switchable batch normalization. At runtime, the network can adjust its width on the fly according to on-device benchmarks and resource constraints, rather than downloading and offloading different models. 
Our trained networks, named slimmable neural networks, achieve similar (and in many cases better) ImageNet classification accuracy than individually trained models of MobileNet v1, MobileNet v2, ShuffleNet and ResNet-50 at different widths respectively. We also demonstrate better performance of slimmable models compared with individual ones across a wide range of applications including COCO bounding-box object detection, instance segmentation and person keypoint detection without tuning hyper-parameters. Lastly we visualize and discuss the learned features of slimmable networks. Code and models are available at: https://github.com/JiahuiYu/slimmable_networks", "keywords": "Slimmable neural networks;mobile deep learning;accuracy-efficiency trade-offs", "primary_area": "", "supplementary_material": "", "author": "Jiahui Yu;Linjie Yang;Ning Xu;Jianchao Yang;Thomas Huang", "authorids": "jyu79@illinois.edu;linjie.yang@snap.com;ning.xu@snap.com;jianchao.yang@bytedance.com;huang@ifp.uiuc.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyu2018slimmable,\ntitle={Slimmable Neural Networks},\nauthor={Jiahui Yu and Linjie Yang and Ning Xu and Jianchao Yang and Thomas Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gMCsAqY7},\n}", "github": "[![github](/images/github_icon.svg) JiahuiYu/slimmable_networks](https://github.com/JiahuiYu/slimmable_networks) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=H1gMCsAqY7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;4;5", "wc_review": "196;107;199", "wc_reply_reviewers": "5;0;0", "wc_reply_authors": "318;248;48", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 8.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 167.33333333333334, 42.679685513784605 ], "wc_reply_reviewers_avg": [ 1.6666666666666667, 2.357022603955158 ], "wc_reply_authors_avg": [ 204.66666666666666, 114.40668201153677 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 731, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15212173000600372424&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1gMCsAqY7", "pdf": "https://openreview.net/pdf?id=H1gMCsAqY7", "email": ";;;;", "author_num": 5 }, { "id": "H1gNHs05FX", "title": "Clinical Risk: wavelet reconstruction networks for marked point processes", "track": "main", "status": "Reject", "tldr": "Wavelet reconstructions on relative time, used in absolute-time point process models, improve risk prediction of complications and adherence in diabetes.", "abstract": "Timestamped sequences of events, pervasive in domains with data logs, e.g., health records, are often modeled as point processes with rate functions over time. 
Leading classical methods for risk scores such as Cox and Hawkes processes use such data but make strong assumptions about the shape and form of multivariate influences, resulting in time-to-event distributions irreflective of many real world processes. Recent methods in point processes and recurrent neural networks capably model rate functions but may be complex and difficult to interrogate. Our work develops a high-performing, interrogable model. We introduce wavelet reconstruction networks, a multivariate point process with a sparse wavelet reconstruction kernel to model rate functions from marked, timestamped data. We show they achieve improved performance and interrogability over baselines in forecasting complications and scheduled care visits in patients with diabetes.", "keywords": "point processes;wavelets;temporal neural networks;Hawkes processes", "primary_area": "", "supplementary_material": "", "author": "Jeremy C. Weiss", "authorids": "jeremy.weiss@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nweiss2019clinical,\ntitle={Clinical Risk: wavelet reconstruction networks for marked point processes},\nauthor={Jeremy C. Weiss},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gNHs05FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1gNHs05FX", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "wc_review": "653;410;267", "wc_reply_reviewers": "267;672;0", "wc_reply_authors": "1011;991;190", "reply_reviewers": "1;3;0", "reply_authors": "2;4;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 443.3333333333333, 159.3368200455319 ], "wc_reply_reviewers_avg": [ 313.0, 276.2643661422877 ], "wc_reply_authors_avg": [ 730.6666666666666, 382.3962459136973 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13469041600361618551&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Analysing Mathematical Reasoning Abilities of Neural Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/933", "id": "H1gR5iR5FX", "author_site": "David Saxton, Edward Grefenstette, Felix Hill, Pushmeet Kohli", "tldr": "A dataset for testing mathematical reasoning (and algebraic generalization), and results on current sequence-to-sequence models.", "abstract": "Mathematical reasoning---a core ability within human intelligence---presents some unique challenges as a domain: we do not come to understand and solve mathematical problems primarily on the back of experience and evidence, but on the basis of inferring, learning, and exploiting laws, axioms, and symbol manipulation rules. In this paper, we present a new challenge for the evaluation (and eventually the design) of neural architectures and similar system, developing a task suite of mathematics problems involving sequential questions and answers in a free-form textual input/output format. 
The structured nature of the mathematics domain, covering arithmetic, algebra, probability and calculus, enables the construction of training and test splits designed to clearly illuminate the capabilities and failure-modes of different architectures, as well as evaluate their ability to compose and relate knowledge and learned processes. Having described the data generation process and its potential future expansions, we conduct a comprehensive analysis of models from two broad classes of the most powerful sequence-to-sequence architectures and find notable differences in their ability to resolve mathematical problems and generalize their knowledge.\n", "keywords": "mathematics;dataset;algebraic;reasoning", "primary_area": "", "supplementary_material": "", "author": "David Saxton;Edward Grefenstette;Felix Hill;Pushmeet Kohli", "authorids": "saxton@google.com;etg@google.com;felixhill@google.com;pushmeet@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsaxton2018analysing,\ntitle={Analysing Mathematical Reasoning Abilities of Neural Models},\nauthor={David Saxton and Edward Grefenstette and Felix Hill and Pushmeet Kohli},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gR5iR5FX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=H1gR5iR5FX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;3", "wc_review": "393;782;323", "wc_reply_reviewers": "0;0;48", "wc_reply_authors": "249;658;490", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 499.3333333333333, 201.90811991816693 ], "wc_reply_reviewers_avg": [ 16.0, 22.627416997969522 ], "wc_reply_authors_avg": [ 465.6666666666667, 167.85774400432715 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 496, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5177820928273150256&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=H1gR5iR5FX", "pdf": "https://openreview.net/pdf?id=H1gR5iR5FX", "email": ";;;", "author_num": 4 }, { "id": "H1gRM2A5YX", "title": "Analysis of Memory Organization for Dynamic Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "An increasing number of neural memory networks have been developed, leading to the need for a systematic approach to analyze and compare their underlying memory capabilities. Thus, in this paper, we propose a taxonomy for four popular dynamic models: vanilla recurrent neural network, long short-term memory, neural stack and neural RAM and their variants. Based on this taxonomy, we create a framework to analyze memory organization and then compare these network architectures. This analysis elucidates how different mapping functions capture the information in the past of the input, and helps to open the dynamic neural network black box from the perspective of memory usage. 
Four representative tasks that would fit optimally the characteristics of each memory network are carefully selected to show each network's expressive power. We also discuss how to use this taxonomy to help users select the most parsimonious type of memory network for a specific task. Two natural language processing applications are used to evaluate the methodology in a realistic setting. \n", "keywords": "memory analysis;recurrent neural network;LSTM;neural Turing machine;neural stack;differentiable neural computers", "primary_area": "", "supplementary_material": "", "author": "Ying Ma;Jose Principe", "authorids": "mayingbit2011@gmail.com;principe@cnel.ufl.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nma2019analysis,\ntitle={Analysis of Memory Organization for Dynamic Neural Networks},\nauthor={Ying Ma and Jose Principe},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gRM2A5YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1gRM2A5YX", "pdf_size": 0, "rating": "3;5;7", "confidence": "5;5;3", "wc_review": "304;280;517", "wc_reply_reviewers": "482;39;95", "wc_reply_authors": "1826;1350;602", "reply_reviewers": "1;1;1", "reply_authors": "3;2;1", "rating_avg": [ 5.0, 1.632993161855452 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 367.0, 106.51760417883985 ], "wc_reply_reviewers_avg": [ 205.33333333333334, 196.96418174096752 ], "wc_reply_authors_avg": [ 1259.3333333333333, 503.79184414023837 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:S1mvhBprOIsJ:scholar.google.com/&scioq=Analysis+of+Memory+Organization+for+Dynamic+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "RotDCF: Decomposition of Convolutional Filters for Rotation-Equivariant Deep Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1053", "id": "H1gTEj09FX", "author_site": "Xiuyuan Cheng, Qiang Qiu, Robert Calderbank, Guillermo Sapiro", "tldr": "", "abstract": "Explicit encoding of group actions in deep features makes it possible for convolutional neural networks (CNNs) to handle global deformations of images, which is critical to success in many vision tasks. This paper proposes to decompose the convolutional filters over joint steerable bases across the space and the group geometry simultaneously, namely a rotation-equivariant CNN with decomposed convolutional filters (RotDCF). This decomposition facilitates computing the joint convolution, which is proved to be necessary for the group equivariance. It significantly reduces the model size and computational complexity while preserving performance, and truncation of the bases expansion serves implicitly to regularize the filters. On datasets involving in-plane and out-of-plane object rotations, RotDCF deep features demonstrate greater robustness and interpretability than regular CNNs. The stability of the equivariant representation to input variations is also proved theoretically. 
The RotDCF framework can be extended to groups other than rotations, providing a general approach which achieves both group equivariance and representation stability at a reduced model size.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiuyuan Cheng;Qiang Qiu;Robert Calderbank;Guillermo Sapiro", "authorids": "xiuyuan.cheng@duke.edu;qiang.qiu@duke.edu;robert.calderbank@duke.edu;guillermo.sapiro@duke.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ncheng2018rotdcf,\ntitle={Rot{DCF}: Decomposition of Convolutional Filters for Rotation-Equivariant Deep Networks},\nauthor={Xiuyuan Cheng and Qiang Qiu and Robert Calderbank and Guillermo Sapiro},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gTEj09FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;2", "wc_review": "507;235;392", "wc_reply_reviewers": "196;30;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "1;1;0", "reply_authors": "0;0;0", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 378.0, 111.48393008262073 ], "wc_reply_reviewers_avg": [ 75.33333333333333, 86.19873677857595 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6799055083001221032&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=H1gTEj09FX", "pdf": "https://openreview.net/pdf?id=H1gTEj09FX", "email": ";;;", "author_num": 4 }, { "id": "H1gZV30qKQ", "title": "Transfer Value or Policy? A Value-centric Framework Towards Transferrable Continuous Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Transferring learned knowledge from one environment to another is an important step towards practical reinforcement learning (RL). In this paper, we investigate the problem of transfer learning across environments with different dynamics while accomplishing the same task in the continuous control domain. We start by illustrating the limitations of policy-centric methods (policy gradient, actor- critic, etc.) when transferring knowledge across environments. We then propose a general model-based value-centric (MVC) framework for continuous RL. MVC learns a dynamics approximator and a value approximator simultaneously in the source domain, and makes decision based on both of them. We evaluate MVC against popular baselines on 5 benchmark control tasks in a training from scratch setting and a transfer learning setting. 
Our experiments demonstrate MVC achieves comparable performance with the baselines when it is trained from scratch, while it significantly surpasses them when it is used in the transfer setting.\n", "keywords": "Reinforcement Learning;Transfer Learning;Control;Value function", "primary_area": "", "supplementary_material": "", "author": "Xingchao Liu;Tongzhou Mu;Hao Su", "authorids": "liuxc1996@gmail.com;t3mu@eng.ucsd.edu;haosu@eng.ucsd.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nliu2019transfer,\ntitle={Transfer Value or Policy? A Value-centric Framework Towards Transferrable Continuous Reinforcement Learning},\nauthor={Xingchao Liu and Tongzhou Mu and Hao Su},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gZV30qKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=H1gZV30qKQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;2;3", "wc_review": "434;829;394", "wc_reply_reviewers": "0;70;91", "wc_reply_authors": "1167;1412;462", "reply_reviewers": "0;1;1", "reply_authors": "2;3;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 552.3333333333334, 196.3132417563545 ], "wc_reply_reviewers_avg": [ 53.666666666666664, 38.90444133457716 ], "wc_reply_authors_avg": [ 1013.6666666666666, 402.70612389461155 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tbZjB1NXNsIJ:scholar.google.com/&scioq=Transfer+Value+or+Policy%3F+A+Value-centric+Framework+Towards+Transferrable+Continuous+Reinforcement+Learning&hl=en&as_sdt=0,14", "gs_version_total": 0 }, { "title": "Execution-Guided Neural Program Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/760", "id": "H1gfOiAqYm", "author_site": "Xinyun Chen, Chang Liu, Dawn Song", "tldr": "", "abstract": "Neural program synthesis from input-output examples has attracted an increasing interest from both the machine learning and the programming language community. Most existing neural program synthesis approaches employ an encoder-decoder architecture, which uses an encoder to compute the embedding of the given input-output examples, as well as a decoder to generate the program from the embedding following a given syntax. Although such approaches achieve a reasonable performance on simple tasks such as FlashFill, on more complex tasks such as Karel, the state-of-the-art approach can only achieve an accuracy of around 77%. We observe that the main drawback of existing approaches is that the semantic information is greatly under-utilized. In this work, we propose two simple yet principled techniques to better leverage the semantic information, which are execution-guided synthesis and synthesizer ensemble. These techniques are general enough to be combined with any existing encoder-decoder-style neural program synthesizer. 
Applying our techniques to the Karel dataset, we can boost the accuracy from around 77% to more than 90%.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyun Chen;Chang Liu;Dawn Song", "authorids": "xinyun.chen@berkeley.edu;liuchang@eecs.berkeley.edu;dawnsong.travel@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchen2018executionguided,\ntitle={Execution-Guided Neural Program Synthesis},\nauthor={Xinyun Chen and Chang Liu and Dawn Song},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gfOiAqYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "5;4;2", "wc_review": "402;810;154", "wc_reply_reviewers": "0;70;96", "wc_reply_authors": "220;893;533", "reply_reviewers": "0;1;1", "reply_authors": "1;3;2", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 455.3333333333333, 270.4531177281728 ], "wc_reply_reviewers_avg": [ 55.333333333333336, 40.54078878802872 ], "wc_reply_authors_avg": [ 548.6666666666666, 274.97434223739657 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 192, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17780553618296819799&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=H1gfOiAqYm", "pdf": "https://openreview.net/pdf?id=H1gfOiAqYm", "email": ";;", "author_num": 3 }, { "id": "H1gh_sC9tm", "title": "Prior Networks for Detection of Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "We show that it is possible to successfully detect a range of adversarial attacks using measures of uncertainty derived from Prior Networks.", "abstract": "Adversarial examples are considered a serious issue for safety critical applications of AI, such as finance, autonomous vehicle control and medicinal applications. Though significant work has resulted in increased robustness of systems to these attacks, systems are still vulnerable to well-crafted attacks. To address this problem\nseveral adversarial attack detection methods have been proposed. However, system can still be vulnerable to adversarial samples that are designed to specifically evade these detection methods. One recent detection scheme that has shown good performance is based on uncertainty estimates derived from Monte-Carlo dropout ensembles. Prior Networks, a new method of estimating predictive uncertainty, have been shown to outperform Monte-Carlo dropout on a range of tasks. One of the advantages of this approach is that the behaviour of a Prior Network can be explicitly tuned to, for example, predict high uncertainty in regions where there are no training data samples. In this work Prior Networks are applied to adversarial attack detection using measures of uncertainty in a similar fashion to Monte-Carlo Dropout. Detection based on measures of uncertainty derived from DNNs and Monte-Carlo dropout ensembles are used as a baseline. 
Prior Networks are shown to significantly out-perform these baseline approaches over a range of adversarial attacks in both detection of whitebox and blackbox configurations. Even when the adversarial attacks are constructed with full knowledge of the detection mechanism, it is shown to be highly challenging to successfully generate an adversarial sample.", "keywords": "Uncertainty;Prior Networks;Adversarial Attacks;Detection", "primary_area": "", "supplementary_material": "", "author": "Andrey Malinin;Mark Gales", "authorids": "am969@cam.ac.uk;mjfg@eng.cam.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmalinin2019prior,\ntitle={Prior Networks for Detection of Adversarial Attacks},\nauthor={Andrey Malinin and Mark Gales},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gh_sC9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1gh_sC9tm", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;4", "wc_review": "695;618;680", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 664.3333333333334, 33.32999983331666 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7670212357212021769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "H1glKiCqtm", "title": "The Effectiveness of Pre-Trained Code Embeddings", "track": "main", "status": "Reject", "tldr": "Researchers exploring natural language processing techniques applied to source code are not using any form of pre-trained embeddings, we show that they should be.", "abstract": "Word embeddings are widely used in machine learning based natural language processing systems. It is common to use pre-trained word embeddings which provide benefits such as reduced training time and improved overall performance. There has been a recent interest in applying natural language processing techniques to programming languages. However, none of this recent work uses pre-trained embeddings on code tokens. Using extreme summarization as the downstream task, we show that using pre-trained embeddings on code tokens provides the same benefits as it does to natural languages, achieving: over 1.9x speedup, 5\\% improvement in test loss, 4\\% improvement in F1 scores, and resistance to over-fitting. We also show that the choice of language used for the embeddings does not have to match that of the task to achieve these benefits and that even embeddings pre-trained on human languages provide these benefits to programming languages. ", "keywords": "machine learning;deep learning;summarization;embeddings;word embeddings;source code;programming languages;programming language processing", "primary_area": "", "supplementary_material": "", "author": "Ben Trevett;Donald Reay;N. K. 
Taylor", "authorids": "bbt1@hw.ac.uk;n.k.taylor@hw.ac.uk;d.s.reay@hw.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntrevett2019the,\ntitle={The Effectiveness of Pre-Trained Code Embeddings},\nauthor={Ben Trevett and Donald Reay and N. K. Taylor},\nyear={2019},\nurl={https://openreview.net/forum?id=H1glKiCqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1glKiCqtm", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "wc_review": "359;902;187", "wc_reply_reviewers": "0;82;0", "wc_reply_authors": "372;316;65", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 482.6666666666667, 304.71443826784594 ], "wc_reply_reviewers_avg": [ 27.333333333333332, 38.6551707048646 ], "wc_reply_authors_avg": [ 251.0, 133.49406978089576 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3052390176923046870&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Dynamic Sparse Graph for Efficient Deep Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/650", "id": "H1goBoR9F7", "author_site": "Liu Liu, Lei Deng, Xing Hu, Maohua Zhu, Guoqi Li, Yufei Ding, Yuan Xie", "tldr": "We construct dynamic sparse graph via dimension-reduction search to reduce compute and memory cost in both DNN training and inference.", "abstract": "We propose to execute deep neural networks (DNNs) with dynamic and sparse graph (DSG) structure for compressive memory and accelerative execution during both training and inference. The great success of DNNs motivates the pursuing of lightweight models for the deployment onto embedded devices. However, most of the previous studies optimize for inference while neglect training or even complicate it. Training is far more intractable, since (i) the neurons dominate the memory cost rather than the weights in inference; (ii) the dynamic activation makes previous sparse acceleration via one-off optimization on fixed weight invalid; (iii) batch normalization (BN) is critical for maintaining accuracy while its activation reorganization damages the sparsity. To address these issues, DSG activates only a small amount of neurons with high selectivity at each iteration via a dimensionreduction search and obtains the BN compatibility via a double-mask selection. 
Experiments show significant memory saving (1.7-4.5x) and operation reduction (2.3-4.4x) with little accuracy loss on various benchmarks.", "keywords": "Sparsity;compression;training;acceleration", "primary_area": "", "supplementary_material": "", "author": "Liu Liu;Lei Deng;Xing Hu;Maohua Zhu;Guoqi Li;Yufei Ding;Yuan Xie", "authorids": "liu_liu@ucsb.edu;leideng@ucsb.edu;huxing@ece.ucsb.edu;maohuazhu@ucsb.edu;liguoqi@mail.tsinghua.edu.cn;yufeiding@cs.ucsb.edu;yuanxie@ucsb.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nliu2018dynamic,\ntitle={Dynamic Sparse Graph for Efficient Deep Learning},\nauthor={Liu Liu and Lei Deng and Xing Hu and Maohua Zhu and Guoqi Li and Yufei Ding and Yuan Xie},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1goBoR9F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;2;3", "wc_review": "446;331;523", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1348;473;713", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 433.3333333333333, 78.89374006993344 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 844.6666666666666, 369.15067685461753 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=961887975812995994&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=H1goBoR9F7", "pdf": "https://openreview.net/pdf?id=H1goBoR9F7", "email": ";;;;;;", "author_num": 7 }, { "title": "Fixup Initialization: Residual Learning Without Normalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/643", "id": "H1gsz30cKX", "author_site": "Hongyi Zhang, Yann Dauphin, Tengyu Ma", "tldr": "All you need to train deep residual networks is a good initialization; normalization layers are not necessary.", "abstract": "Normalization layers are a staple in state-of-the-art deep neural network architectures. They are widely believed to stabilize training, enable higher learning rate, accelerate convergence and improve generalization, though the reason for their effectiveness is still an active research topic. In this work, we challenge the commonly-held beliefs by showing that none of the perceived benefits is unique to normalization. Specifically, we propose fixed-update initialization (Fixup), an initialization motivated by solving the exploding and vanishing gradient problem at the beginning of training via properly rescaling a standard initialization. We find training residual networks with Fixup to be as stable as training with normalization -- even for networks with 10,000 layers. 
Furthermore, with proper regularization, Fixup enables residual networks without normalization to achieve state-of-the-art performance in image classification and machine translation.", "keywords": "deep learning;residual networks;initialization;batch normalization;layer normalization", "primary_area": "", "supplementary_material": "", "author": "Hongyi Zhang;Yann N. Dauphin;Tengyu Ma", "authorids": "hongyiz@mit.edu;yann@dauphin.io;tengyuma@stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhang2018residual,\ntitle={Residual Learning Without Normalization via Better Initialization},\nauthor={Hongyi Zhang and Yann N. Dauphin and Tengyu Ma},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gsz30cKX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=H1gsz30cKX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;3;3", "wc_review": "1164;292;240", "wc_reply_reviewers": "0;0;21", "wc_reply_authors": "2142;100;545", "reply_reviewers": "0;0;1", "reply_authors": "5;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 565.3333333333334, 423.8532240712055 ], "wc_reply_reviewers_avg": [ 7.0, 9.899494936611665 ], "wc_reply_authors_avg": [ 929.0, 876.748918828342 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 413, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10342250007176178945&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=H1gsz30cKX", "pdf": "https://openreview.net/pdf?id=H1gsz30cKX", "email": ";;", "author_num": 3 }, { "id": "H1gupiC5KQ", "title": "The wisdom of the crowd: reliable deep reinforcement learning through ensembles of Q-functions", "track": "main", "status": "Reject", "tldr": "Examined how a simple ensemble approach can tackle the biggest challenges in Q-learning.", "abstract": "Reinforcement learning agents learn by exploring the environment and then exploiting what they have learned.\nThis frees the human trainers from having to know the preferred action or intrinsic value of each encountered state.\nThe cost of this freedom is reinforcement learning is slower and more unstable than supervised learning.\nWe explore the possibility that ensemble methods can remedy these shortcomings and do so by investigating a novel technique which harnesses the wisdom of the crowds by bagging Q-function approximator estimates.\n\nOur results show that this proposed approach improves all three tasks and reinforcement learning approaches attempted.\nWe are able to demonstrate that this is a direct result of the increased stability of the action portion of the state-action-value function used by Q-learning to select actions and by policy gradient methods to train the policy.\n", "keywords": "reinforcement learning;ensembles;deep learning;neural network", "primary_area": "", "supplementary_material": "", "author": "Daniel Elliott;Charles Anderson", 
"authorids": "daniel.elliott18@alumni.colostate.edu;chuck.anderson@colostate.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nelliott2019the,\ntitle={The wisdom of the crowd: reliable deep reinforcement learning through ensembles of Q-functions},\nauthor={Daniel Elliott and Charles Anderson},\nyear={2019},\nurl={https://openreview.net/forum?id=H1gupiC5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1gupiC5KQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;3", "wc_review": "132;129;212", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 157.66666666666666, 38.43898484033567 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9960499802061749266&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "H1l-SjA5t7", "title": "Explicit Information Placement on Latent Variables using Auxiliary Generative Modelling Task", "track": "main", "status": "Reject", "tldr": "We propose a method that can explicitly place information into a specific subset of the latent variables in deep generative models. We demonstrate the use of the method in a task of disentangling global structure from local features in images. ", "abstract": "Deep latent variable models, such as variational autoencoders, have been successfully used to disentangle factors of variation in image datasets. The structure of the representations learned by such models is usually observed after training and iteratively refined by tuning the network architecture and loss function. Here we propose a method that can explicitly place information into a specific subset of the latent variables. We demonstrate the use of the method in a task of disentangling global structure from local features in images. One subset of the latent variables is encouraged to represent local features through an auxiliary modelling task. In this auxiliary task, the global structure of an image is destroyed by dividing it into pixel patches which are then randomly shuffled. The full set of latent variables is trained to model the original data, obliging the remainder of the latent representation to model the global structure. We demonstrate that this approach successfully disentangles the latent variables for global structure from local structure by observing the generative samples of SVHN and CIFAR10. We also clustering the disentangled global structure of SVHN and found that the emerging clusters represent meaningful groups of global structures \u2013 including digit identities and the number of digits presence. 
Finally, we discuss the problem of evaluating the clustering accuracy when ground truth categories are not expressive enough.", "keywords": "disentanglement;vae;clustering;prior imposition;deep generative models", "primary_area": "", "supplementary_material": "", "author": "Nat Dilokthanakul;Nick Pawlowski;Murray Shanahan", "authorids": "n.dilokthanakul14@imperial.ac.uk;n.pawlowski16@imperial.ac.uk;m.shanahan@imperial.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndilokthanakul2019explicit,\ntitle={Explicit Information Placement on Latent Variables using Auxiliary Generative Modelling Task},\nauthor={Nat Dilokthanakul and Nick Pawlowski and Murray Shanahan},\nyear={2019},\nurl={https://openreview.net/forum?id=H1l-SjA5t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1l-SjA5t7", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "wc_review": "581;228;364", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "700;167;31", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 391.0, 145.37079028012013 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 299.3333333333333, 288.70323094062445 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:andGt2o3UpMJ:scholar.google.com/&scioq=Explicit+Information+Placement+on+Latent+Variables+using+Auxiliary+Generative+Modelling+Task&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "ProbGAN: Towards Probabilistic GAN with Theoretical Guarantees", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1088", "id": "H1l7bnR5Ym", "author_site": "Hao He, Hao Wang, Guang-He Lee, Yonglong Tian", "tldr": "A novel probabilistic treatment for GAN with theoretical guarantee.", "abstract": "Probabilistic modelling is a principled framework to perform model aggregation, which has been a primary mechanism to combat mode collapse in the context of Generative Adversarial Networks (GAN). In this paper, we propose a novel probabilistic framework for GANs, ProbGAN, which iteratively learns a distribution over generators with a carefully crafted prior. Learning is efficiently triggered by a tailored stochastic gradient Hamiltonian Monte Carlo with a novel gradient approximation to perform Bayesian inference. Our theoretical analysis further reveals that our treatment is the first probabilistic framework that yields an equilibrium where generator distributions are faithful to the data distribution. 
Empirical evidence on synthetic high-dimensional multi-modal data and image databases (CIFAR-10, STL-10, and ImageNet) demonstrates the superiority of our method over both state-of-the-art multi-generator GANs and other probabilistic treatments for GANs.", "keywords": "Generative Adversarial Networks;Bayesian Deep Learning;Mode Collapse;Inception Score;Generator;Discriminator;CIFAR-10;STL-10;ImageNet", "primary_area": "", "supplementary_material": "", "author": "Hao He;Hao Wang;Guang-He Lee;Yonglong Tian", "authorids": "haohe@mit.edu;hwang87@mit.edu;guanghe@mit.edu;yonglong@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nhe2018bayesian,\ntitle={Bayesian Modelling and Monte Carlo Inference for {GAN}},\nauthor={Hao He and Hao Wang and Guang-He Lee and Yonglong Tian},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1l7bnR5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;9", "confidence": "4;3;4", "wc_review": "663;710;380", "wc_reply_reviewers": "289;0;0", "wc_reply_authors": "1083;851;101", "reply_reviewers": "1;0;0", "reply_authors": "2;2;1", "rating_avg": [ 6.666666666666667, 1.699673171197595 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 584.3333333333334, 145.75397840958655 ], "wc_reply_reviewers_avg": [ 96.33333333333333, 136.23590650860817 ], "wc_reply_authors_avg": [ 678.3333333333334, 419.07941437817675 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.2773500981126145, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=569574917827071543&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=H1l7bnR5Ym", "pdf": "https://openreview.net/pdf?id=H1l7bnR5Ym", "email": ";;;", "author_num": 4 }, { "id": "H1lADsCcFQ", "title": "LEARNING ADVERSARIAL EXAMPLES WITH RIEMANNIAN GEOMETRY", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial examples, referred to as augmented data points generated by imperceptible perturbation of input samples, have recently drawn much attention. Well-crafted adversarial examples may even mislead state-of-the-art deep models to make wrong predictions easily. To alleviate this problem, many studies focus on investigating how adversarial examples can be generated and/or resisted. All the existing work handles this problem in the Euclidean space, which may however be unable to describe data geometry. In this paper, we propose a generalized framework that addresses the learning problem of adversarial examples with Riemannian geometry. Specifically, we define the local coordinate systems on Riemannian manifold, develop a novel model called Adversarial Training with Riemannian Manifold, and design a series of theoretical results that enable adversarial examples to be learned in the Riemannian space feasibly and efficiently. 
The proposed work is important in that (1) it is a generalized learning methodology since the Riemannian manifold space reduces to the Euclidean space as a special case; (2) it is the first work to tackle the adversarial example problem tractably from the perspective of geometry; (3) from the perspective of geometry, our method yields the steepest descent direction of the loss function. We also provide a series of theoretical results showing that our proposed method can truly find the descent direction for the loss function with computational time comparable to traditional adversarial methods. Finally, the proposed framework demonstrates superior performance to the traditional counterpart methods on benchmark data including MNIST, CIFAR-10 and SVHN.", "keywords": "Adversarial training;Adversarial examples;Riemannian Geometry;Machine Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Shufei Zhang;Kaizhu Huang;Rui Zhang;Amir Hussain", "authorids": "shufei.zhang@xjtlu.edu.cn;kaizhu.huang@xjtlu.edu.cn;rui.zhang02@xjtlu.edu.cn;ahu@cs.stir.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2019learning,\ntitle={{LEARNING} {ADVERSARIAL} {EXAMPLES} {WITH} {RIEMANNIAN} {GEOMETRY}},\nauthor={Shufei Zhang and Kaizhu Huang and Rui Zhang and Amir Hussain},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lADsCcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1lADsCcFQ", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;5;2", "wc_review": "973;499;241", "wc_reply_reviewers": "0;508;0", "wc_reply_authors": "776;2069;293", "reply_reviewers": "0;6;0", "reply_authors": "1;7;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "wc_review_avg": [ 571.0, 303.14353036144445 ], "wc_reply_reviewers_avg": [ 169.33333333333334, 239.4734965618441 ], "wc_reply_authors_avg": [ 1046.0, 749.763962857645 ], "reply_reviewers_avg": [ 2.0, 2.8284271247461903 ], "reply_authors_avg": [ 3.0, 2.8284271247461903 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9449111825230683, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vRdqvlAel18J:scholar.google.com/&scioq=LEARNING+ADVERSARIAL+EXAMPLES+WITH+RIEMANNIAN+GEOMETRY&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "H1lC8o0cKX", "title": "Unsupervised Emergence of Spatial Structure from Sensorimotor Prediction", "track": "main", "status": "Reject", "tldr": "A practical evaluation of hypotheses previously laid out about the unsupervised emergence of spatial representations from sensorimotor prediction.", "abstract": "Despite its omnipresence in robotics applications, the nature of spatial knowledge and the mechanisms that underlie its emergence in autonomous agents are still poorly understood. Recent theoretical work suggests that the concept of space can be grounded by capturing invariants induced by the structure of space in an agent's raw sensorimotor experience. Moreover, it is hypothesized that capturing these invariants is beneficial for a naive agent trying to predict its sensorimotor experience. 
Under certain exploratory conditions, spatial representations should thus emerge as a byproduct of learning to predict.\nWe propose a simple sensorimotor predictive scheme, apply it to different agents and types of exploration, and evaluate the pertinence of this hypothesis. We show that a naive agent can capture the topology and metric regularity of its spatial configuration without any a priori knowledge, nor extraneous supervision.", "keywords": "spatial perception;grounding;sensorimotor prediction;unsupervised learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Alban Laflaqui\u00e8re;Michael Garcia Ortiz", "authorids": "alban.laflaquiere@gmail.com;mgarciaortiz@softbankrobotics.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlaflaqui\u00e8re2019unsupervised,\ntitle={Unsupervised Emergence of Spatial Structure from Sensorimotor Prediction},\nauthor={Alban Laflaqui\u00e8re and Michael Garcia Ortiz},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lC8o0cKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1lC8o0cKX", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;3;3", "wc_review": "652;822;798", "wc_reply_reviewers": "720;353;0", "wc_reply_authors": "3116;3576;905", "reply_reviewers": "2;2;0", "reply_authors": "6;7;2", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 757.3333333333334, 75.12360185424788 ], "wc_reply_reviewers_avg": [ 357.6666666666667, 293.9572909265713 ], "wc_reply_authors_avg": [ 2532.3333333333335, 1165.9217621931396 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], "reply_authors_avg": [ 5.0, 2.160246899469287 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9449111825230683, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17173800939719382871&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "H1lFZnR5YX", "title": "Neural Regression Tree", "track": "main", "status": "Reject", "tldr": "A novel neural regression tree for optimal discretization in regression-via-classification problems.", "abstract": "Regression-via-Classification (RvC) is the process of converting a regression problem to a classification one. Current approaches for RvC use ad-hoc discretization strategies and are suboptimal. We propose a neural regression tree model for RvC. In this model, we employ a joint optimization framework where we learn optimal discretization thresholds while simultaneously optimizing the features for each node in the tree. 
We empirically show the validity of our model by testing it on two challenging regression tasks where we establish the state of the art.", "keywords": "regression-via-classification;discretization;regression tree;neural model;optimization", "primary_area": "", "supplementary_material": "", "author": "Wenbo Zhao;Shahan Ali Memon;Bhiksha Raj;Rita Singh", "authorids": "wzhao1@andrew.cmu.ecu;samemon@cs.cmu.edu;bhikshar@cs.cmu.edu;rsingh@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhao2019neural,\ntitle={Neural Regression Tree},\nauthor={Wenbo Zhao and Shahan Ali Memon and Bhiksha Raj and Rita Singh},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lFZnR5YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=H1lFZnR5YX", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;3", "wc_review": "368;604;261", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 411.0, 143.29224217195664 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10964247968742600652&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "H1lGHsA9KX", "title": "A Resizable Mini-batch Gradient Descent based on a Multi-Armed Bandit", "track": "main", "status": "Reject", "tldr": "An optimization algorithm that explores various batch sizes based on probability and automatically exploits successful batch size which minimizes validation loss.", "abstract": "Determining the appropriate batch size for mini-batch gradient descent is always time consuming as it often relies on grid search. This paper considers a resizable mini-batch gradient descent (RMGD) algorithm based on a multi-armed bandit that achieves performance equivalent to that of best fixed batch-size. At each epoch, the RMGD samples a batch size according to a certain probability distribution proportional to a batch being successful in reducing the loss function. Sampling from this probability provides a mechanism for exploring different batch size and exploiting batch sizes with history of success. After obtaining the validation loss at each epoch with the sampled batch size, the probability distribution is updated to incorporate the effectiveness of the sampled batch size. Experimental results show that the RMGD achieves performance better than the best performing single batch size. It is surprising that the RMGD achieves better performance than grid search. Furthermore, it attains this performance in a shorter amount of time than grid search.", "keywords": "Batch size;Optimization;Mini-batch gradient descent;Multi-armed bandit", "primary_area": "", "supplementary_material": "", "author": "Seong Jin Cho;Sunghun Kang;Chang D. 
Yoo", "authorids": "ipcng00@kaist.ac.kr;sunghun.kang@kaist.ac.kr;cd_yoo@kaist.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncho2019a,\ntitle={A Resizable Mini-batch Gradient Descent based on a Multi-Armed Bandit},\nauthor={Seong Jin Cho and Sunghun Kang and Chang D. Yoo},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lGHsA9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1lGHsA9KX", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;3;4", "wc_review": "224;449;260", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "319;127;482", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 311.0, 98.68130522039117 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 309.3333333333333, 145.08924456653872 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14708283385905437167&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "H1lIzhC9FX", "title": "Learning to remember: Dynamic Generative Memory for Continual Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Continuously trainable models should be able to learn from a stream of data over an undefined period of time. This becomes even more difficult in a strictly incremental context, where data access to previously seen categories is not possible. To that end, we propose making use of a conditional generative adversarial model where the generator is used as a memory module through neural masking to emulate neural plasticity in the human brain. This memory module is further associated with a dynamic capacity expansion mechanism. Taken together, this method facilitates a resource efficient capacity adaption to accommodate new tasks, while retaining previously attained knowledge. 
The proposed approach outperforms state-of-the-art algorithms on publicly available datasets, overcoming catastrophic forgetting.", "keywords": "Continual Learning;Catastrophic Forgetting;Dynamic Network Expansion", "primary_area": "", "supplementary_material": "", "author": "Oleksiy Ostapenko;Mihai Puscas;Tassilo Klein;Moin Nabi", "authorids": "oleksiy.ostapenko@sap.com;mihai.puscas@sap.com;tassilo.klein@sap.com;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nostapenko2019learning,\ntitle={Learning to remember: Dynamic Generative Memory for Continual Learning},\nauthor={Oleksiy Ostapenko and Mihai Puscas and Tassilo Klein and Moin Nabi},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lIzhC9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1lIzhC9FX", "pdf_size": 0, "rating": "3;4;8", "confidence": "5;5;5", "wc_review": "548;790;117", "wc_reply_reviewers": "0;714;0", "wc_reply_authors": "536;1507;0", "reply_reviewers": "0;2;0", "reply_authors": "1;4;0", "rating_avg": [ 5.0, 2.160246899469287 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 485.0, 278.33912169629815 ], "wc_reply_reviewers_avg": [ 238.0, 336.5828278447966 ], "wc_reply_authors_avg": [ 681.0, 623.715212790795 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 1.699673171197595 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15650789720687566292&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Exploration by random network distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1093", "id": "H1lJJnR5Ym", "author_site": "Yuri Burda, Harrison Edwards, Amos Storkey, Oleg Klimov", "tldr": "A simple exploration bonus is introduced and achieves state of the art performance in 3 hard exploration Atari games.", "abstract": "We introduce an exploration bonus for deep reinforcement learning methods that is easy to implement and adds minimal overhead to the computation performed. The bonus is the error of a neural network predicting features of the observations given by a fixed randomly initialized neural network. We also introduce a method to flexibly combine intrinsic and extrinsic rewards. We find that the random network distillation (RND) bonus combined with this increased flexibility enables significant progress on several hard exploration Atari games. In particular we establish state of the art performance on Montezuma's Revenge, a game famously difficult for deep reinforcement learning methods. To the best of our knowledge, this is the first method that achieves better than average human performance on this game without using demonstrations or having access the underlying state of the game, and occasionally completes the first level. 
This suggests that relatively simple methods that scale well can be sufficient to tackle challenging exploration problems.", "keywords": "reinforcement learning;exploration;curiosity", "primary_area": "", "supplementary_material": "", "author": "Yuri Burda;Harrison Edwards;Amos Storkey;Oleg Klimov", "authorids": "yburda@openai.com;h.l.edwards@sms.ed.ac.uk;a.storkey@ed.ac.uk;oleg@openai.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nburda2018exploration,\ntitle={Exploration by random network distillation},\nauthor={Yuri Burda and Harrison Edwards and Amos Storkey and Oleg Klimov},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lJJnR5Ym},\n}", "github": "[![github](/images/github_icon.svg) openai/random-network-distillation](https://github.com/openai/random-network-distillation) + [![Papers with Code](/images/pwc_icon.svg) 20 community implementations](https://paperswithcode.com/paper/?openreview=H1lJJnR5Ym)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer5;AnonReviewer4", "pdf_size": 0, "rating": "4;7;9;10", "confidence": "4;4;5;4", "wc_review": "1000;1472;668;549", "wc_reply_reviewers": "338;224;80;0", "wc_reply_authors": "1192;832;424;250", "reply_reviewers": "1;1;1;0", "reply_authors": "2;1;1;1", "rating_avg": [ 7.5, 2.29128784747792 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "wc_review_avg": [ 922.25, 357.8507894360441 ], "wc_reply_reviewers_avg": [ 160.5, 130.17200159788587 ], "wc_reply_authors_avg": [ 674.5, 365.9108497981441 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.3779644730092272, "gs_citation": 1708, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=126098205768710278&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=H1lJJnR5Ym", "pdf": "https://openreview.net/pdf?id=H1lJJnR5Ym", "email": ";;;", "author_num": 4 }, { "id": "H1lJws05K7", "title": "On the Selection of Initialization and Activation Function for Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "How to effectively choose Initialization and Activation function for deep neural networks", "abstract": "The weight initialization and the activation function of deep neural networks have a crucial impact on the performance of the training procedure. An inappropriate selection can lead to the loss of information of the input during forward propagation and the exponential vanishing/exploding of gradients during back-propagation. Understanding the theoretical properties of untrained random networks is key to identifying which deep networks may be trained successfully as recently demonstrated by Schoenholz et al. (2017) who showed that for deep feedforward neural networks only a specific choice of hyperparameters known as the `edge of chaos' can lead to good performance.\nWe complete this analysis by providing quantitative results showing that, for a class of ReLU-like activation functions, the information propagates indeed deeper for an initialization at the edge of chaos. By further extending this analysis, we identify a class of activation functions that improve the information propagation over ReLU-like functions. 
This class includes the Swish activation, $\\phi_{swish}(x) = x \\cdot \\text{sigmoid}(x)$, used in Hendrycks & Gimpel (2016),\nElfwing et al. (2017) and Ramachandran et al. (2017). This provides a theoretical grounding for the excellent empirical performance of $\\phi_{swish}$ observed in these contributions. We complement those previous results by illustrating the benefit of using a random initialization on the edge of chaos in this context.", "keywords": "Deep Neural Networks;Initialization;Gaussian Processes", "primary_area": "", "supplementary_material": "", "author": "Soufiane Hayou;Arnaud Doucet;Judith Rousseau", "authorids": "soufiane.hayou@stats.ox.ac.uk;doucet@stats.ox.ac.uk;judith.rousseau@stats.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhayou2019on,\ntitle={On the Selection of Initialization and Activation Function for Deep Neural Networks},\nauthor={Soufiane Hayou and Arnaud Doucet and Judith Rousseau},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lJws05K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1lJws05K7", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "wc_review": "1760;364;510", "wc_reply_reviewers": "1722;0;0", "wc_reply_authors": "3881;259;291", "reply_reviewers": "5;0;0", "reply_authors": "6;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 878.0, 626.5099094720423 ], "wc_reply_reviewers_avg": [ 574.0, 811.7585848021565 ], "wc_reply_authors_avg": [ 1477.0, 1699.934900714338 ], "reply_reviewers_avg": [ 1.6666666666666667, 2.357022603955158 ], "reply_authors_avg": [ 2.6666666666666665, 2.3570226039551585 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11123772475069103763&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "H1lPUiRcYQ", "title": "Computing committor functions for the study of rare events using deep learning with importance sampling", "track": "main", "status": "Reject", "tldr": "Computing committor functions for rare events", "abstract": "The committor function is a central object of study in understanding transitions between metastable states in complex systems. However, computing the committor function for realistic systems at low temperatures is a challenging task, due to the curse of dimensionality and the scarcity of transition data. In this paper, we introduce a computational approach that overcomes these issues and achieves good performance on complex benchmark problems with rough energy landscapes. The new approach combines deep learning, importance sampling and feature engineering techniques. 
This establishes an alternative practical method for studying rare transition events among metastable states of complex, high dimensional systems.", "keywords": "committor function;rare event;deep learning;importance sampling", "primary_area": "", "supplementary_material": "", "author": "Qianxiao Li;Bo Lin;Weiqing Ren", "authorids": "liqix@ihpc.a-star.edu.sg;linbo94@u.nus.edu;matrw@nus.edu.sg", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli2019computing,\ntitle={Computing committor functions for the study of rare events using deep learning with importance sampling},\nauthor={Qianxiao Li and Bo Lin and Weiqing Ren},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lPUiRcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer5;AnonReviewer2", "site": "https://openreview.net/forum?id=H1lPUiRcYQ", "pdf_size": 0, "rating": "5;6;6;7", "confidence": "4;4;4;4", "wc_review": "222;531;418;479", "wc_reply_reviewers": "0;0;0;200", "wc_reply_authors": "750;427;171;445", "reply_reviewers": "0;0;0;1", "reply_authors": "1;1;1;2", "rating_avg": [ 6.0, 0.7071067811865476 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 412.5, 117.03097880475921 ], "wc_reply_reviewers_avg": [ 50.0, 86.60254037844386 ], "wc_reply_authors_avg": [ 448.25, 205.1723360982177 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14607009700389785626&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "H1lS8oA5YQ", "title": "Feature Attribution As Feature Selection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Feature attribution methods identify \"relevant\" features as an explanation of a complex machine learning model. Several feature attribution methods have been proposed; however, only a few studies have attempted to define the \"relevance\" of each feature mathematically. In this study, we formalize the feature attribution problem as a feature selection problem. In our proposed formalization, there arise two possible definitions of relevance. We name the feature attribution problems based on these two relevances as Exclusive Feature Selection (EFS) and Inclusive Feature Selection (IFS). We show that several existing feature attribution methods can be interpreted as approximation algorithms for EFS and IFS. 
Moreover, through exhaustive experiments, we show that IFS is better suited as the formalization for the feature attribution problem than EFS.", "keywords": "feature attribution;feature selection", "primary_area": "", "supplementary_material": "", "author": "Satoshi Hara;Koichi Ikeno;Tasuku Soma;Takanori Maehara", "authorids": "satohara@ar.sanken.osaka-u.ac.jp;k1keno@ar.sanken.osaka-u.ac.jp;tasuku_soma@mist.i.u-tokyo.ac.jp;takanori.maehara@riken.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhara2019feature,\ntitle={Feature Attribution As Feature Selection},\nauthor={Satoshi Hara and Koichi Ikeno and Tasuku Soma and Takanori Maehara},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lS8oA5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1lS8oA5YQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;4;2", "wc_review": "211;89;91", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "498;383;282", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 130.33333333333334, 57.045790574083746 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 387.6666666666667, 88.24335039474016 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15278407501931633789&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "H1lUOsA9Fm", "title": "Synthnet: Learning synthesizers end-to-end", "track": "main", "status": "Reject", "tldr": "A convolutional autoregressive generative model that generates high fidelity audio, behchmarked on music", "abstract": "Learning synthesizers and generating music in the raw audio domain is a challenging task. We investigate the learned representations of convolutional autoregressive generative models. Consequently, we show that mappings between musical notes and the harmonic style (instrument timbre) can be learned based on the raw audio music recording and the musical score (in binary piano roll format). Our proposed architecture, SynthNet uses minimal training data (9 minutes), is substantially better in quality and converges 6 times faster than the baselines. The quality of the generated waveforms (generation accuracy) is sufficiently high that they are almost identical to the ground truth. Therefore, we are able to directly measure generation error during training, based on the RMSE of the Constant-Q transform. Mean opinion scores are also provided. 
We validate our work using 7 distinct harmonic styles and also provide visualizations and links to all generated audio.", "keywords": "audio;synthesizers;music;convolutional neural networks;generative models;autoregressive models", "primary_area": "", "supplementary_material": "", "author": "Florin Schimbinschi;Christian Walder;Sarah Erfani;James Bailey", "authorids": "florinsch@student.unimelb.edu.au;christian.walder@data61.csiro.au;sarah.erfani@unimelb.edu.au;baileyj@unimelb.edu.au", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nschimbinschi2019synthnet,\ntitle={Synthnet: Learning synthesizers end-to-end},\nauthor={Florin Schimbinschi and Christian Walder and Sarah Erfani and James Bailey},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lUOsA9Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=H1lUOsA9Fm", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;3", "wc_review": "874;469;579", "wc_reply_reviewers": "151;0;0", "wc_reply_authors": "2805;1337;1359", "reply_reviewers": "2;0;0", "reply_authors": "10;4;5", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 640.6666666666666, 170.99382704907475 ], "wc_reply_reviewers_avg": [ 50.333333333333336, 71.18208263944578 ], "wc_reply_authors_avg": [ 1833.6666666666667, 686.8951076320815 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 6.333333333333333, 2.6246692913372702 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:06IGLrSyBUMJ:scholar.google.com/&scioq=Synthnet:+Learning+synthesizers+end-to-end&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "H1ldNoC9tX", "title": "Classification from Positive, Unlabeled and Biased Negative Data", "track": "main", "status": "Reject", "tldr": "This paper studied the PUbN classification problem, where we incorporate biased negative (bN) data, i.e., negative data that is not fully representative of the true underlying negative distribution, into positive-unlabeled (PU) learning.", "abstract": "Positive-unlabeled (PU) learning addresses the problem of learning a binary classifier from positive (P) and unlabeled (U) data. It is often applied to situations where negative (N) data are difficult to be fully labeled. However, collecting a non-representative N set that contains only a small portion of all possible N data can be much easier in many practical situations. This paper studies a novel classification framework which incorporates such biased N (bN) data in PU learning. The fact that the training N data are biased also makes our work very different from those of standard semi-supervised learning. We provide an empirical risk minimization-based method to address this PUbN classification problem. Our approach can be regarded as a variant of traditional example-reweighting algorithms, with the weight of each example computed through a preliminary step that draws inspiration from PU learning. We also derive an estimation error bound for the proposed method. 
Experimental results demonstrate the effectiveness of our algorithm in not only PUbN learning scenarios but also ordinary PU leaning scenarios on several benchmark datasets.", "keywords": "positive-unlabeled learning;dataset shift;empirical risk minimization", "primary_area": "", "supplementary_material": "", "author": "Yu-Guan Hsieh;Gang Niu;Masashi Sugiyama", "authorids": "yu-guan.hsieh@ens.fr;gang.niu@riken.jp;sugi@k.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhsieh2019classification,\ntitle={Classification from Positive, Unlabeled and Biased Negative Data},\nauthor={Yu-Guan Hsieh and Gang Niu and Masashi Sugiyama},\nyear={2019},\nurl={https://openreview.net/forum?id=H1ldNoC9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1ldNoC9tX", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;5;3", "wc_review": "287;488;128", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1473;279;143", "reply_reviewers": "0;0;0", "reply_authors": "6;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 301.0, 147.3024100278064 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 631.6666666666666, 597.4977452305202 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.6666666666666665, 2.3570226039551585 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14812922163751509802&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "H1lnJ2Rqt7", "title": "LARGE BATCH SIZE TRAINING OF NEURAL NETWORKS WITH ADVERSARIAL TRAINING AND SECOND-ORDER INFORMATION", "track": "main", "status": "Reject", "tldr": "Large batch size training using adversarial training and second order information", "abstract": "Stochastic Gradient Descent (SGD) methods using randomly selected batches are widely-used to train neural network (NN) models. Performing design exploration to find the best NN for a particular task often requires extensive training with different models on a large dataset, which is very computationally expensive. The most straightforward method to accelerate this computation is to distribute the batch of SGD over multiple processors. However, large batch training often times leads to degradation in accuracy, poor generalization, and even poor robustness to adversarial attacks. Existing solutions for large batch training either do not work or require massive hyper-parameter tuning. To address this issue, we propose a novel large batch training method which combines recent results in adversarial training (to regularize against ``sharp minima'') and second order optimization (to use curvature information to change batch size adaptively during training). We extensively evaluate our method on Cifar-10/100, SVHN, TinyImageNet, and ImageNet datasets, using multiple NNs, including residual networks as well as compressed networks such as SqueezeNext. Our new approach exceeds the performance of the existing solutions in terms of both accuracy and the number of SGD iterations (up to 1\\% and $3\\times$, respectively). 
We emphasize that this is achieved without any additional hyper-parameter tuning to tailor our method to any of these experiments.\n", "keywords": "adversarial training;large batch size;neural network", "primary_area": "", "supplementary_material": "", "author": "Zhewei Yao;Amir Gholami;Kurt Keutzer;Michael Mahoney", "authorids": "zheweiy@berkeley.edu;amirgh@berkeley.edu;keutzer@berkeley.edu;mmahoney@stat.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyao2019large,\ntitle={{LARGE} {BATCH} {SIZE} {TRAINING} {OF} {NEURAL} {NETWORKS} {WITH} {ADVERSARIAL} {TRAINING} {AND} {SECOND}-{ORDER} {INFORMATION}},\nauthor={Zhewei Yao and Amir Gholami and Kurt Keutzer and Michael Mahoney},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lnJ2Rqt7},\n}", "github": "[![github](/images/github_icon.svg) amirgholami/hessianflow](https://github.com/amirgholami/hessianflow)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1lnJ2Rqt7", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;5;4", "wc_review": "1046;215;393", "wc_reply_reviewers": "0;0;14", "wc_reply_authors": "1055;553;794", "reply_reviewers": "0;0;1", "reply_authors": "2;1;2", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 551.3333333333334, 357.25092333291764 ], "wc_reply_reviewers_avg": [ 4.666666666666667, 6.599663291074443 ], "wc_reply_authors_avg": [ 800.6666666666666, 204.99485088384264 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7051726978608415406&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "H1lo3sC9KX", "title": "Asynchronous SGD without gradient delay for efficient distributed training", "track": "main", "status": "Reject", "tldr": "A method for an efficient asynchronous distributed training of deep learning models along with theoretical regret bounds.", "abstract": "Asynchronous distributed gradient descent algorithms for training of deep neural\nnetworks are usually considered as inefficient, mainly because of the Gradient delay\nproblem. In this paper, we propose a novel asynchronous distributed algorithm\nthat tackles this limitation by well-thought-out averaging of model updates, computed\nby workers. The algorithm allows computing gradients along the process\nof gradient merge, thus, reducing or even completely eliminating worker idle time\ndue to communication overhead, which is a pitfall of existing asynchronous methods.\nWe provide theoretical analysis of the proposed asynchronous algorithm,\nand show its regret bounds. According to our analysis, the crucial parameter for\nkeeping high convergence rate is the maximal discrepancy between local parameter\nvectors of any pair of workers. As long as it is kept relatively small, the\nconvergence rate of the algorithm is shown to be the same as the one of a sequential\nonline learning. Furthermore, in our algorithm, this discrepancy is bounded\nby an expression that involves the staleness parameter of the algorithm, and is\nindependent on the number of workers. 
This is the main differentiator between\nour approach and other solutions, such as Elastic Asynchronous SGD or Downpour\nSGD, in which that maximal discrepancy is bounded by an expression that\ndepends on the number of workers, due to gradient delay problem. To demonstrate\neffectiveness of our approach, we conduct a series of experiments on image\nclassification task on a cluster with 4 machines, equipped with a commodity communication\nswitch and with a single GPU card per machine. Our experiments\nshow a linear scaling on 4-machine cluster without sacrificing the test accuracy,\nwhile eliminating almost completely worker idle time. Since our method allows\nusing commodity communication switch, it paves a way for large scale distributed\ntraining performed on commodity clusters.", "keywords": "SGD;distributed asynchronous training;deep learning;optimisation", "primary_area": "", "supplementary_material": "", "author": "Roman Talyansky;Pavel Kisilev;Zach Melamed;Natan Peterfreund;Uri Verner", "authorids": "roma.talyansky@gmail.com;pavel.kisilev@huawei.com;zach.melamed@huawei.com;natan.peterfreund@gmail.com;uri.verner@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntalyansky2019asynchronous,\ntitle={Asynchronous {SGD} without gradient delay for efficient distributed training},\nauthor={Roman Talyansky and Pavel Kisilev and Zach Melamed and Natan Peterfreund and Uri Verner},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lo3sC9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1lo3sC9KX", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;4", "wc_review": "257;277;40", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 191.33333333333334, 107.31987493263098 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12890796433690984484&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Unsupervised Learning of the Set of Local Maxima", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/745", "id": "H1lqZhRcFm", "author_site": "Lior Wolf, Sagie Benaim, Tomer Galanti", "tldr": "", "abstract": "This paper describes a new form of unsupervised learning, whose input is a set of unlabeled points that are assumed to be local maxima of an unknown value function $v$ in an unknown subset of the vector space. Two functions are learned: (i) a set indicator $c$, which is a binary classifier, and (ii) a comparator function $h$ that given two nearby samples, predicts which sample has the higher value of the unknown function $v$. Loss terms are used to ensure that all training samples $\\vx$ are a local maxima of $v$, according to $h$ and satisfy $c(\\vx)=1$. Therefore, $c$ and $h$ provide training signals to each other: a point $\\vx'$ in the vicinity of $\\vx$ satisfies $c(\\vx)=-1$ or is deemed by $h$ to be lower in value than $\\vx$. 
We present an algorithm, show an example where it is more efficient to use local maxima as an indicator function than to employ conventional classification, and derive a suitable generalization bound. Our experiments show that the method is able to outperform one-class classification algorithms in the task of anomaly detection and also provide an additional signal that is extracted in a completely unsupervised way.\n", "keywords": "Unsupervised Learning;One-class Classification;Multi-player Optimization", "primary_area": "", "supplementary_material": "", "author": "Lior Wolf;Sagie Benaim;Tomer Galanti", "authorids": "wolf@fb.com;sagiebenaim@gmail.com;tomer22g@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nwolf2018unsupervised,\ntitle={Unsupervised Learning of the Set of Local Maxima},\nauthor={Lior Wolf and Sagie Benaim and Tomer Galanti},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lqZhRcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "8;8;8", "confidence": "3;4;3", "wc_review": "452;697;116", "wc_reply_reviewers": "0;72;0", "wc_reply_authors": "907;580;457", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 421.6666666666667, 238.16007688574135 ], "wc_reply_reviewers_avg": [ 24.0, 33.94112549695428 ], "wc_reply_authors_avg": [ 648.0, 189.899973670351 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4956525565743484630&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=H1lqZhRcFm", "pdf": "https://openreview.net/pdf?id=H1lqZhRcFm", "email": ";;", "author_num": 3 }, { "id": "H1ltQ3R9KQ", "title": "Causal Reasoning from Meta-reinforcement learning", "track": "main", "status": "Reject", "tldr": "meta-learn a learning algorithm capable of causal reasoning", "abstract": "Discovering and exploiting the causal structure in the environment is a crucial challenge for intelligent agents. Here we explore whether modern deep reinforcement learning can be used to train agents to perform causal reasoning. We adopt a meta-learning approach, where the agent learns a policy for conducting experiments via causal interventions, in order to support a subsequent task which rewards making accurate causal inferences.We also found the agent could make sophisticated counterfactual predictions, as well as learn to draw causal inferences from purely observational data. Though powerful formalisms for causal reasoning have been developed, applying them in real-world domains can be difficult because fitting to large amounts of high dimensional data often requires making idealized assumptions. Our results suggest that causal reasoning in complex settings may benefit from powerful learning-based approaches. 
More generally, this work may offer new strategies for structured exploration in reinforcement learning, by providing agents with the ability to perform\u2014and interpret\u2014experiments.", "keywords": "meta-learning;causal reasoning;deep reinforcement learning;artificial intelligence", "primary_area": "", "supplementary_material": "", "author": "Ishita Dasgupta;Jane Wang;Silvia Chiappa;Jovana Mitrovic;Pedro Ortega;David Raposo;Edward Hughes;Peter Battaglia;Matthew Botvinick;Zeb Kurth-Nelson", "authorids": "ishitadasgupta@g.harvard.edu;wangjane@google.com;csilvia@google.com;mitrovic@google.com;pedroortega@google.com;draposo@google.com;edwardhughes@google.com;peterbattaglia@google.com;botvinick@google.com;zebk@google.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@misc{\ndasgupta2019causal,\ntitle={Causal Reasoning from Meta-reinforcement learning},\nauthor={Ishita Dasgupta and Jane Wang and Silvia Chiappa and Jovana Mitrovic and Pedro Ortega and David Raposo and Edward Hughes and Peter Battaglia and Matthew Botvinick and Zeb Kurth-Nelson},\nyear={2019},\nurl={https://openreview.net/forum?id=H1ltQ3R9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=H1ltQ3R9KQ", "pdf_size": 0, "rating": "4;4;5;7", "confidence": "4;3;4;4", "wc_review": "800;743;912;138", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "2215;2094;1372;235", "reply_reviewers": "0;0;0;0", "reply_authors": "3;3;2;1", "rating_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "wc_review_avg": [ 648.25, 300.80091007176156 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1479.0, 787.227095570268 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 0.82915619758885 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 10, 0 ], "corr_rating_confidence": 0.4714045207910316, "gs_citation": 138, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5583259569123387116&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "H1lug3R5FX", "title": "On the Geometry of Adversarial Examples", "track": "main", "status": "Reject", "tldr": "We present a geometric framework for proving robustness guarantees and highlight the importance of codimension in adversarial examples. ", "abstract": "Adversarial examples are a pervasive phenomenon of machine learning models where seemingly imperceptible perturbations to the input lead to misclassifications for otherwise statistically accurate models. We propose a geometric framework, drawing on tools from the manifold reconstruction literature, to analyze the high-dimensional geometry of adversarial examples. In particular, we highlight the importance of codimension: for low-dimensional data manifolds embedded in high-dimensional space there are many directions off the manifold in which to construct adversarial examples. Adversarial examples are a natural consequence of learning a decision boundary that classifies the low-dimensional data manifold well, but classifies points near the manifold incorrectly. 
Using our geometric framework we prove (1) a tradeoff between robustness under different norms, (2) that adversarial training in balls around the data is sample inefficient, and (3) sufficient sampling conditions under which nearest neighbor classifiers and ball-based adversarial training are robust.", "keywords": "adversarial examples;high-dimensional geometry", "primary_area": "", "supplementary_material": "", "author": "Marc Khoury;Dylan Hadfield-Menell", "authorids": "khoury@eecs.berkeley.edu;dhm@berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkhoury2019on,\ntitle={On the Geometry of Adversarial Examples},\nauthor={Marc Khoury and Dylan Hadfield-Menell},\nyear={2019},\nurl={https://openreview.net/forum?id=H1lug3R5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=H1lug3R5FX", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;3;4", "wc_review": "296;320;348", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "783;781;517", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 321.3333333333333, 21.249836600678968 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 693.6666666666666, 124.92486630860256 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10042339335230217617&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "On the Convergence of A Class of Adam-Type Algorithms for Non-Convex Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/774", "id": "H1x-x309tm", "author_site": "Xiangyi Chen, Sijia Liu, Ruoyu Sun, Mingyi Hong", "tldr": "We analyze convergence of Adam-type algorithms and provide mild sufficient conditions to guarantee their convergence, we also show violating the conditions can makes an algorithm diverge.", "abstract": "This paper studies a class of adaptive gradient based momentum algorithms that update the search directions and learning rates simultaneously using past gradients. This class, which we refer to as the ''``Adam-type'', includes the popular algorithms such as Adam, AMSGrad, AdaGrad. Despite their popularity in training deep neural networks (DNNs), the convergence of these algorithms for solving non-convex problems remains an open question. In this paper, we develop an analysis framework and a set of mild sufficient conditions that guarantee the convergence of the Adam-type methods, with a convergence rate of order $O(\\log{T}/\\sqrt{T})$ for non-convex stochastic optimization. Our convergence analysis applies to a new algorithm called AdaFom (AdaGrad with First Order Momentum). We show that the conditions are essential, by identifying concrete examples in which violating the conditions makes an algorithm diverge. Besides providing one of the first comprehensive analysis for Adam-type methods in the non-convex setting, our results can also help the practitioners to easily monitor the progress of algorithms and determine their convergence behavior. 
", "keywords": "nonconvex optimization;Adam;convergence analysis", "primary_area": "", "supplementary_material": "", "author": "Xiangyi Chen;Sijia Liu;Ruoyu Sun;Mingyi Hong", "authorids": "chen5719@umn.edu;sijia.liu@ibm.com;ruoyus@illinois.edu;mhong@umn.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchen2018on,\ntitle={On the Convergence of A Class of Adam-Type Algorithms for Non-Convex Optimization},\nauthor={Xiangyi Chen and Sijia Liu and Ruoyu Sun and Mingyi Hong},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1x-x309tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;2;3", "wc_review": "151;189;675", "wc_reply_reviewers": "0;0;257", "wc_reply_authors": "153;69;1581", "reply_reviewers": "0;0;2", "reply_authors": "1;1;3", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 338.3333333333333, 238.56422382429616 ], "wc_reply_reviewers_avg": [ 85.66666666666667, 121.15096184329514 ], "wc_reply_authors_avg": [ 601.0, 693.8126548283766 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 406, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16342443701076005816&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=H1x-x309tm", "pdf": "https://openreview.net/pdf?id=H1x-x309tm", "email": ";;;", "author_num": 4 }, { "id": "H1x1noAqKX", "title": "Discriminative out-of-distribution detection for semantic segmentation", "track": "main", "status": "Reject", "tldr": "We present a novel approach for detecting out-of-distribution pixels in semantic segmentation.", "abstract": "Most classification and segmentation datasets assume a closed-world scenario in which predictions are expressed as distribution over a predetermined set of visual classes. However, such assumption implies unavoidable and often unnoticeable failures in presence of out-of-distribution (OOD) input. These failures are bound to happen in most real-life applications since current visual ontologies are far from being comprehensive. We propose to address this issue by discriminative detection \nof OOD pixels in input data. Different from recent approaches, we avoid to bring any decisions by only observing the training dataset of the primary model trained to solve the desired computer vision task. Instead, we train a dedicated OOD model\nwhich discriminates the primary training set from a much larger \"background\" dataset which approximates the variety of the visual world. We perform our experiments on high resolution natural images in a dense prediction setup. We use several road driving datasets as our training distribution, while we approximate the background distribution with the ILSVRC dataset. 
We evaluate our approach on WildDash test, which is currently the only public test dataset with out-of-distribution images.\nThe obtained results show that the proposed approach succeeds to identify out-of-distribution pixels while outperforming previous work by a wide margin.", "keywords": "out-of-distribution detection;semantic segmentation", "primary_area": "", "supplementary_material": "", "author": "Petra Bevandi\u0107;Sini\u0161a \u0160egvi\u0107;Ivan Kre\u0161o;Marin Or\u0161i\u0107", "authorids": "petra.bevandic@fer.hr;sinisa.segvic@fer.hr;ivan.kreso@fer.hr;marin.orsic@fer.hr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbevandi\u01072019discriminative,\ntitle={Discriminative out-of-distribution detection for semantic segmentation},\nauthor={Petra Bevandi\u0107 and Sini\u0161a \u0160egvi\u0107 and Ivan Kre\u0161o and Marin Or\u0161i\u0107},\nyear={2019},\nurl={https://openreview.net/forum?id=H1x1noAqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1x1noAqKX", "pdf_size": 0, "rating": "3;4;7", "confidence": "3;4;5", "wc_review": "327;605;128", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "747;967;315", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 4.666666666666667, 1.699673171197595 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 353.3333333333333, 195.622652630574 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 676.3333333333334, 270.8275383995423 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9607689228305228, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17407322409280728402&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "H1x3SnAcYQ", "title": "A Better Baseline for Second Order Gradient Estimation in Stochastic Computation Graphs", "track": "main", "status": "Reject", "tldr": "We extend the DiCE formalism of higher order gradient estimation with a new baseline for variance reduction of second order derivatives, improving sample efficiency by two orders of magnitude. ", "abstract": "Motivated by the need for higher order gradients in multi-agent reinforcement learning and meta-learning, this paper studies the construction of baselines for second order Monte Carlo gradient estimators in order to reduce the sample variance. Following the construction of a stochastic computation graph (SCG), the Infinitely Differentiable Monte-Carlo Estimator (DiCE) can generate correct estimates of arbitrary order gradients through differentiation. However, a baseline term that serves as a control variate for reducing variance is currently provided only for first order gradient estimation, limiting the utility of higher-order gradient estimates. To improve the sample efficiency of DiCE, we propose a new baseline term for higher order gradient estimation. This term may be easily included in the objective, and produces unbiased variance-reduced estimators under (automatic) differentiation, without affecting the estimate of the objective itself or of the first order gradient. 
We provide theoretical analysis and numerical evaluations of our baseline term, which demonstrate that it can dramatically reduce the variance of second order gradient estimators produced by DiCE. This computational tool can be easily used to estimate second order gradients with unprecedented efficiency wherever automatic differentiation is utilised, and has the potential to unlock applications of higher order gradients in reinforcement learning and meta-learning.", "keywords": "Reinforcement learning;meta-learning;higher order derivatives;gradient estimation;stochastic computation graphs", "primary_area": "", "supplementary_material": "", "author": "Jingkai Mao;Jakob Foerster;Tim Rockt\u00e4schel;Gregory Farquhar;Maruan Al-Shedivat;Shimon Whiteson", "authorids": "jingkai.mao@gmail.com;jakobfoerster@gmail.com;tim.rocktaeschel@gmail.com;gregory.farquhar@cs.ox.ac.uk;alshedivat@cs.cmu.edu;shimon.whitesone@cs.ox.ac.uk", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nmao2019a,\ntitle={A Better Baseline for Second Order Gradient Estimation in Stochastic Computation Graphs},\nauthor={Jingkai Mao and Jakob Foerster and Tim Rockt\u00e4schel and Gregory Farquhar and Maruan Al-Shedivat and Shimon Whiteson},\nyear={2019},\nurl={https://openreview.net/forum?id=H1x3SnAcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer5;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=H1x3SnAcYQ", "pdf_size": 0, "rating": "3;5;6;6", "confidence": "4;4;3;3", "wc_review": "1384;392;189;262", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1028;354;55;248", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "rating_avg": [ 5.0, 1.224744871391589 ], "confidence_avg": [ 3.5, 0.5 ], "wc_review_avg": [ 556.75, 483.11560469519094 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 421.25, 366.3409443401051 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8164965809277259, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HC1yrgNF9egJ:scholar.google.com/&scioq=A+Better+Baseline+for+Second+Order+Gradient+Estimation+in+Stochastic+Computation+Graphs&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "H1xAH2RqK7", "title": "Generative Adversarial Models for Learning Private and Fair Representations", "track": "main", "status": "Reject", "tldr": "We present Generative Adversarial Privacy and Fairness (GAPF), a data-driven framework for learning private and fair representations with certified privacy/fairness guarantees", "abstract": "We present Generative Adversarial Privacy and Fairness (GAPF), a data-driven framework for learning private and fair representations of the data. GAPF leverages recent advances in adversarial learning to allow a data holder to learn \"universal\" representations that decouple a set of sensitive attributes from the rest of the dataset. Under GAPF, finding the optimal decorrelation scheme is formulated as a constrained minimax game between a generative decorrelator and an adversary. We show that for appropriately chosen adversarial loss functions, GAPF provides privacy guarantees against strong information-theoretic adversaries and enforces demographic parity. 
We also evaluate the performance of GAPF on multi-dimensional Gaussian mixture models and real datasets, and show how a designer can certify that representations learned under an adversary with a fixed architecture perform well against more complex adversaries. ", "keywords": "Data Privacy;Fairness;Adversarial Learning;Generative Adversarial Networks;Minimax Games;Information Theory", "primary_area": "", "supplementary_material": "", "author": "Chong Huang;Xiao Chen;Peter Kairouz;Lalitha Sankar;Ram Rajagopal", "authorids": "chuang83@asu.edu;markcx@stanford.edu;kairouzp@stanford.edu;lsankar@asu.edu;ramr@stanford.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhuang2019generative,\ntitle={Generative Adversarial Models for Learning Private and Fair Representations},\nauthor={Chong Huang and Xiao Chen and Peter Kairouz and Lalitha Sankar and Ram Rajagopal},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xAH2RqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=H1xAH2RqK7", "pdf_size": 0, "rating": "4;4;7", "confidence": "3;3;3", "wc_review": "349;368;291", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1166;538;1187", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 336.0, 32.751590292177674 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 963.6666666666666, 301.113858126051 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14754157113834006502&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Minimum Divergence vs. Maximum Margin: an Empirical Comparison on Seq2Seq Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/819", "id": "H1xD9sR5Fm", "author_site": "Huan Zhang, hai zhao", "tldr": "", "abstract": "Sequence to sequence (seq2seq) models have become a popular framework for neural sequence prediction. While traditional seq2seq models are trained by Maximum Likelihood Estimation (MLE), much recent work has made various attempts to optimize evaluation scores directly to solve the mismatch between training and evaluation, since model predictions are usually evaluated by a task specific evaluation metric like BLEU or ROUGE scores instead of perplexity. This paper puts this existing work into two categories, a) minimum divergence, and b) maximum margin. We introduce a new training criterion based on the analysis of existing work, and empirically compare models in the two categories. Our experimental results show that our new training criterion can usually work better than existing methods, on both the tasks of machine translation and sentence summarization. 
", "keywords": "sequence to sequence;training criteria", "primary_area": "", "supplementary_material": "", "author": "Huan Zhang;Hai Zhao", "authorids": "zhanghuan0468@gmail.com;zhaohai@cs.sjtu.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nzhang2018minimum,\ntitle={Minimum Divergence vs. Maximum Margin: an Empirical Comparison on Seq2Seq Models},\nauthor={Huan Zhang and Hai Zhao},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xD9sR5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;4", "wc_review": "57;410;1218", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "164;405;556", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 561.6666666666666, 485.95770277760687 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 375.0, 161.43316470498453 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1741946152253085868&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=H1xD9sR5Fm", "pdf": "https://openreview.net/pdf?id=H1xD9sR5Fm", "email": ";", "author_num": 2 }, { "id": "H1xEtoRqtQ", "title": "Scaling shared model governance via model splitting", "track": "main", "status": "Reject", "tldr": "We study empirically how hard it is to recover missing parts of trained models", "abstract": "Currently the only techniques for sharing governance of a deep learning model are homomorphic encryption and secure multiparty computation. Unfortunately, neither of these techniques is applicable to the training of large neural networks due to their large computational and communication overheads. As a scalable technique for shared model governance, we propose splitting deep learning model between multiple parties. This paper empirically investigates the security guarantee of this technique, which is introduced as the problem of model completion: Given the entire training data set or an environment simulator, and a subset of the parameters of a trained deep learning model, how much training is required to recover the model\u2019s original performance? We define a metric for evaluating the hardness of the model completion problem and study it empirically in both supervised learning on ImageNet and reinforcement learning on Atari and DeepMind Lab. Our experiments show that (1) the model completion problem is harder in reinforcement learning than in supervised learning because of the unavailability of the trained agent\u2019s trajectories, and (2) its hardness depends not primarily on the number of parameters of the missing part, but more so on their type and location. 
Our results suggest that model splitting might be a feasible technique for shared model governance in some settings where training is very expensive.", "keywords": "deep learning;reinforcement learning;multi-party computation", "primary_area": "", "supplementary_material": "", "author": "Miljan Martic;Jan Leike;Andrew Trask;Matteo Hessel;Shane Legg;Pushmeet Kohli", "authorids": "miljanm@google.com;leike@google.com;atrask@google.com;mtthss@google.com;legg@google.com;pushmeet@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nmartic2019scaling,\ntitle={Scaling shared model governance via model splitting},\nauthor={Miljan Martic and Jan Leike and Andrew Trask and Matteo Hessel and Shane Legg and Pushmeet Kohli},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xEtoRqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1xEtoRqtQ", "pdf_size": 0, "rating": "4;5;9", "confidence": "3;4;4", "wc_review": "455;712;336", "wc_reply_reviewers": "0;446;0", "wc_reply_authors": "446;1603;278", "reply_reviewers": "0;3;0", "reply_authors": "1;4;1", "rating_avg": [ 6.0, 2.160246899469287 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 501.0, 156.90974050920698 ], "wc_reply_reviewers_avg": [ 148.66666666666666, 210.24641627280013 ], "wc_reply_authors_avg": [ 775.6666666666666, 589.0197129317679 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.654653670707977, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5548482158961043330&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "H1xEwsR9FX", "title": "Convolutional CRFs for Semantic Segmentation", "track": "main", "status": "Reject", "tldr": "We propose Convolutional CRFs a fast, powerful and trainable alternative to Fully Connected CRFs.", "abstract": "For the challenging semantic image segmentation task the best performing models\nhave traditionally combined the structured modelling capabilities of Conditional\nRandom Fields (CRFs) with the feature extraction power of CNNs. In more recent\nworks however, CRF post-processing has fallen out of favour. We argue that this\nis mainly due to the slow training and inference speeds of CRFs, as well as the\ndifficulty of learning the internal CRF parameters. To overcome both issues we\npropose to add the assumption of conditional independence to the framework of\nfully-connected CRFs. This allows us to reformulate the inference in terms of\nconvolutions, which can be implemented highly efficiently on GPUs.Doing so\nspeeds up inference and training by two orders of magnitude. All parameters of\nthe convolutional CRFs can easily be optimized using backpropagation. 
Towards\nthe goal of facilitating further CRF research we have made our implementations\npublicly available.", "keywords": "conditional random fields;semantic segmentation;computer vision;structured learning", "primary_area": "", "supplementary_material": "", "author": "Marvin Teichmann;Roberto Cipolla", "authorids": "mttt2@cam.ac.uk;cipolla@eng.cam.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nteichmann2019convolutional,\ntitle={Convolutional {CRF}s for Semantic Segmentation},\nauthor={Marvin Teichmann and Roberto Cipolla},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xEwsR9FX},\n}", "github": "[![github](/images/github_icon.svg) MarvinTeichmann/ConvCRF](https://github.com/MarvinTeichmann/ConvCRF)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1xEwsR9FX", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;4", "wc_review": "2036;344;239", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "3530;340;462", "reply_reviewers": "0;0;0", "reply_authors": "6;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 873.0, 823.4816330677935 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1444.0, 1475.8653958497252 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.6666666666666665, 2.3570226039551585 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 146, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10289538019317864764&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "H1xL_iR9Km", "title": "GradMix: Multi-source Transfer across Domains and Tasks", "track": "main", "status": "Withdraw", "tldr": "We propose a gradient-based method to transfer knowledge from multiple sources across different domains and tasks.", "abstract": "The machine learning and computer vision community is witnessing an unprecedented rate of new tasks being proposed and addressed, thanks to the power of deep convolutional networks to find complex mappings from X to Y. The advent of each task often accompanies the release of a large-scale human-labeled dataset, for supervised training of the deep network. However, it is expensive and time-consuming to manually label sufficient amount of training data. Therefore, it is important to develop algorithms that can leverage off-the-shelf labeled dataset to learn useful knowledge for the target task. While previous works mostly focus on transfer learning from a single source, we study multi-source transfer across domains and tasks (MS-DTT), in a semi-supervised setting. We propose GradMix, a model-agnostic method applicable to any model trained with gradient-based learning rule. GradMix transfers knowledge via gradient descent, by weighting and mixing the gradients from all sources during training. Our method follows a meta-learning objective, by assigning layer-wise weights to the source gradients, such that the combined gradient follows the direction that can minimize the loss for a small set of samples from the target dataset. In addition, we propose to adaptively adjust the learning rate for each mini-batch based on its importance to the target task, and a pseudo-labeling method to leverage the unlabeled samples in the target domain. 
We perform experiments on two MS-DTT tasks: digit recognition and action recognition, and demonstrate the advantageous performance of the proposed method against multiple baselines.", "keywords": "Transfer Learning;Domain Adaptation;Multi-source Learning", "primary_area": "", "supplementary_material": "", "author": "Junnan Li;Ziwei Xu;Yongkang Wong;Qi Zhao;Mohan S. Kankanhalli", "authorids": "lijunnan@u.nus.edu;ziwei-xu@comp.nus.edu.sg;yongkang.wong@nus.edu.sg;qzhao@cs.umn.edu;mohan@comp.nus.edu.sg", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1xL_iR9Km", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;5;4", "wc_review": "271;605;160", "wc_reply_reviewers": "0;53;0", "wc_reply_authors": "26;85;111", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 345.3333333333333, 189.12136021319455 ], "wc_reply_reviewers_avg": [ 17.666666666666668, 24.984439601924677 ], "wc_reply_authors_avg": [ 74.0, 35.56215216584433 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6400111356187024019&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "H1xLsjAqtX", "title": "Robust Text Classifier on Test-Time Budgets", "track": "main", "status": "Withdraw", "tldr": "Modular framework for document classification and data aggregation technique for making the framework robust to various distortion, and noise and focus only on the important words. ", "abstract": "In this paper, we design a generic framework for learning a robust text classification model that achieves accuracy comparable to standard full models under test-time\nbudget constraints. We take a different approach from existing methods and learn to dynamically delete a large fraction of unimportant words by a low-complexity selector such that the high-complexity classifier only needs to process a small fraction of important words. In addition, we propose a new data aggregation method to train the classifier, allowing it to make accurate predictions even on fragmented sequence of words. Our end-to-end method achieves state-of-the-art performance while its computational complexity scales linearly with the small fraction of important words in the whole corpus. 
Besides, a single deep neural network classifier trained by our framework can be dynamically tuned to different budget levels at inference time.", "keywords": "Data Aggregation;Budget Learning;Speed Up;Faster Inference;Robust Classifier", "primary_area": "", "supplementary_material": "", "author": "Md Rizwan Parvez;Tolga Bolukbasi;Kai-Wei Chang;Venkatesh Saligrama", "authorids": "rizwan@cs.ucla.edu;tolgab@bu.edu;kwchang@cs.ucla.edu;srv@bu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1xLsjAqtX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "wc_review": "271;758;169", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 399.3333333333333, 257.0114567269111 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14503243718284396893&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "H1xQSjCqFQ", "title": "Excitation Dropout: Encouraging Plasticity in Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "We propose a guided dropout regularizer for deep networks based on the evidence of a network prediction.", "abstract": "We propose a guided dropout regularizer for deep networks based on the evidence of a network prediction: the firing of neurons in specific paths. In this work, we utilize the evidence at each neuron to determine the probability of dropout, rather than dropping out neurons uniformly at random as in standard dropout. In essence, we dropout with higher probability those neurons which contribute more to decision making at training time. This approach penalizes high saliency neurons that are most relevant for model prediction, i.e. those having stronger evidence. By dropping such high-saliency neurons, the network is forced to learn alternative paths in order to maintain loss minimization, resulting in a plasticity-like behavior, a characteristic of human brains too. 
We demonstrate better generalization ability, an increased utilization of network neurons, and a higher resilience to network compression using several metrics over four image/video recognition benchmarks.", "keywords": "Dropout;Saliency;Deep Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Andrea Zunino;Sarah Adel Bargal;Pietro Morerio;Jianming Zhang;Stan Sclaroff;Vittorio Murino", "authorids": "andrea.zunino@iit.it;sbargal@bu.edu;pietro.morerio@iit.it;jianmzha@adobe.com;sclaroff@bu.edu;vittorio.murino@iit.it", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nzunino2019excitation,\ntitle={Excitation Dropout: Encouraging Plasticity in Deep Neural Networks},\nauthor={Andrea Zunino and Sarah Adel Bargal and Pietro Morerio and Jianming Zhang and Stan Sclaroff and Vittorio Murino},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xQSjCqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1xQSjCqFQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;4", "wc_review": "2029;69;255", "wc_reply_reviewers": "761;0;0", "wc_reply_authors": "2422;199;525", "reply_reviewers": "2;0;0", "reply_authors": "5;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 784.3333333333334, 883.381885458127 ], "wc_reply_reviewers_avg": [ 253.66666666666666, 358.73884032197515 ], "wc_reply_authors_avg": [ 1048.6666666666667, 980.1708467858493 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4178315625774183390&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "title": "GANSynth: Adversarial Neural Audio Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1004", "id": "H1xQVn09FX", "author_site": "Jesse Engel, Kumar Agrawal, Shuo Chen, Ishaan Gulrajani, Chris Donahue, Adam Roberts", "tldr": "High-quality audio synthesis with GANs", "abstract": "Efficient audio synthesis is an inherently difficult machine learning task, as human perception is sensitive to both global structure and fine-scale waveform coherence. Autoregressive models, such as WaveNet, model local structure at the expense of global latent structure and slow iterative sampling, while Generative Adversarial Networks (GANs), have global latent conditioning and efficient parallel sampling, but struggle to generate locally-coherent audio waveforms. Herein, we demonstrate that GANs can in fact generate high-fidelity and locally-coherent audio by modeling log magnitudes and instantaneous frequencies with sufficient frequency resolution in the spectral domain. 
Through extensive empirical investigations on the NSynth dataset, we demonstrate that GANs are able to outperform strong WaveNet baselines on automated and human evaluation metrics, and efficiently generate audio several orders of magnitude faster than their autoregressive counterparts.\n", "keywords": "GAN;Audio;WaveNet;NSynth;Music", "primary_area": "", "supplementary_material": "", "author": "Jesse Engel;Kumar Krishna Agrawal;Shuo Chen;Ishaan Gulrajani;Chris Donahue;Adam Roberts", "authorids": "jesseengel@google.com;kumarkagrawal@gmail.com;chenshuo@google.com;igul222@gmail.com;christopherdonahue@gmail.com;adarob@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nengel2018gansynth,\ntitle={{GANS}ynth: Adversarial Neural Audio Synthesis},\nauthor={Jesse Engel and Kumar Krishna Agrawal and Shuo Chen and Ishaan Gulrajani and Chris Donahue and Adam Roberts},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xQVn09FX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=H1xQVn09FX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;3", "wc_review": "152;420;201", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "166;191;368", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 257.6666666666667, 116.51704691684485 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 241.66666666666666, 89.91230295250045 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 604, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1141907515038951552&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=H1xQVn09FX", "pdf": "https://openreview.net/pdf?id=H1xQVn09FX", "email": ";;;;;", "author_num": 6 }, { "title": "Smoothing the Geometry of Probabilistic Box Embeddings", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1010", "id": "H1xSNiRcF7", "author_site": "Xiang Li, Luke Vilnis, Dongxu Zhang, Michael Boratko, Andrew McCallum", "tldr": "Improve hierarchical embedding models using kernel smoothing", "abstract": "There is growing interest in geometrically-inspired embeddings for learning hierarchies, partial orders, and lattice structures, with natural applications to transitive relational data such as entailment graphs. Recent work has extended these ideas beyond deterministic hierarchies to probabilistically calibrated models, which enable learning from uncertain supervision and inferring soft-inclusions among concepts, while maintaining the geometric inductive bias of hierarchical embedding models. We build on the Box Lattice model of Vilnis et al. (2018), which showed promising results in modeling soft-inclusions through an overlapping hierarchy of sets, parameterized as high-dimensional hyperrectangles (boxes). 
However, the hard edges of the boxes present difficulties for standard gradient based optimization; that work employed a special surrogate function for the disjoint case, but we find this method to be fragile. In this work, we present a novel hierarchical embedding model, inspired by a relaxation of box embeddings into parameterized density functions using Gaussian convolutions over the boxes. Our approach provides an alternative surrogate to the original lattice measure that improves the robustness of optimization in the disjoint case, while also preserving the desirable properties with respect to the original lattice. We demonstrate increased or matching performance on WordNet hypernymy prediction, Flickr caption entailment, and a MovieLens-based market basket dataset. We show especially marked improvements in the case of sparse data, where many conditional probabilities should be low, and thus boxes should be nearly disjoint.", "keywords": "embeddings;order embeddings;knowledge graph embedding;relational learning", "primary_area": "", "supplementary_material": "", "author": "Xiang Li;Luke Vilnis;Dongxu Zhang;Michael Boratko;Andrew McCallum", "authorids": "xiangl@cs.umass.edu;luke@cs.umass.edu;dongxuzhang@cs.umass.edu;mboratko@math.umass.edu;mccallum@cs.umass.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nli2018smoothing,\ntitle={Smoothing the Geometry of Probabilistic Box Embeddings},\nauthor={Xiang Li and Luke Vilnis and Dongxu Zhang and Michael Boratko and Andrew McCallum},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xSNiRcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;8;8", "confidence": "3;4;3", "wc_review": "248;403;319", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "306;1065;125", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 7.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 323.3333333333333, 63.352628639667 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 498.6666666666667, 407.2184780788918 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8166135549126473331&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=H1xSNiRcF7", "pdf": "https://openreview.net/pdf?id=H1xSNiRcF7", "email": ";;;;", "author_num": 5 }, { "title": "Sliced Wasserstein Auto-Encoders", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1081", "id": "H1xaJn05FQ", "author_site": "Soheil Kolouri, Phillip Pope, Charles Martin, Gustavo Rohde", "tldr": "In this paper we use the sliced-Wasserstein distance to shape the latent distribution of an auto-encoder into any samplable prior distribution. ", "abstract": "In this paper we use the geometric properties of the optimal transport (OT) problem and the Wasserstein distances to define a prior distribution for the latent space of an auto-encoder. 
We introduce Sliced-Wasserstein Auto-Encoders (SWAE), that enable one to shape the distribution of the latent space into any samplable probability distribution without the need for training an adversarial network or having a likelihood function specified. In short, we regularize the auto-encoder loss with the sliced-Wasserstein distance between the distribution of the encoded training samples and a samplable prior distribution. We show that the proposed formulation has an efficient numerical solution that provides similar capabilities to Wasserstein Auto-Encoders (WAE) and Variational Auto-Encoders (VAE), while benefiting from an embarrassingly simple implementation. We provide extensive error analysis for our algorithm, and show its merits on three benchmark datasets.", "keywords": "optimal transport;Wasserstein distances;auto-encoders;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Soheil Kolouri;Phillip E. Pope;Charles E. Martin;Gustavo K. Rohde", "authorids": "skolouri@hrl.com;pepope@hrl.com;cemartin@hrl.com;gustavo@virginia.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nkolouri2018sliced,\ntitle={Sliced Wasserstein Auto-Encoders},\nauthor={Soheil Kolouri and Phillip E. Pope and Charles E. Martin and Gustavo K. Rohde},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xaJn05FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "wc_review": "138;258;828", "wc_reply_reviewers": "0;0;154", "wc_reply_authors": "641;419;1112", "reply_reviewers": "0;0;1", "reply_authors": "2;2;2", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 408.0, 300.9983388658482 ], "wc_reply_reviewers_avg": [ 51.333333333333336, 72.59629620181887 ], "wc_reply_authors_avg": [ 724.0, 288.9394400216073 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=H1xaJn05FQ", "pdf": "https://openreview.net/pdf?id=H1xaJn05FQ", "email": ";;;", "author_num": 4 }, { "title": "Learning Two-layer Neural Networks with Symmetric Inputs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/872", "id": "H1xipsA5K7", "author_site": "Rong Ge, Rohith Kuditipudi, Zhize Li, Xiang Wang", "tldr": "We give an algorithm for learning a two-layer neural network with symmetric input distribution. ", "abstract": "We give a new algorithm for learning a two-layer neural network under a very general class of input distributions. Assuming there is a ground-truth two-layer network \ny = A \\sigma(Wx) + \\xi,\nwhere A, W are weight matrices, \\xi represents noise, and the number of neurons in the hidden layer is no larger than the input or output, our algorithm is guaranteed to recover the parameters A, W of the ground-truth network. The only requirement on the input x is that it is symmetric, which still allows highly complicated and structured input. 
\n\nOur algorithm is based on the method-of-moments framework and extends several results in tensor decompositions. We use spectral algorithms to avoid the complicated non-convex optimization in learning neural networks. Experiments show that our algorithm can robustly learn the ground-truth neural network with a small number of samples for many symmetric input distributions.", "keywords": "Neural Network;Optimization;Symmetric Inputs;Moment-of-moments", "primary_area": "", "supplementary_material": "", "author": "Rong Ge;Rohith Kuditipudi;Zhize Li;Xiang Wang", "authorids": "rongge@cs.duke.edu;rohith.kuditipudi@duke.edu;zz-li14@mails.tsinghua.edu.cn;xwang@cs.duke.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nge2018learning,\ntitle={Learning Two-layer Neural Networks with Symmetric Inputs},\nauthor={Rong Ge and Rohith Kuditipudi and Zhize Li and Xiang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xipsA5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;5;4", "wc_review": "540;405;207", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "384;79;41", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 384.0, 136.75525584049777 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 168.0, 153.5208997715512 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=205914550108929480&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=H1xipsA5K7", "pdf": "https://openreview.net/pdf?id=H1xipsA5K7", "email": ";;;", "author_num": 4 }, { "id": "H1xk8jAqKQ", "title": "Backplay: 'Man muss immer umkehren'", "track": "main", "status": "Reject", "tldr": "Learn by working backwards from a single demonstration, even an inefficient one, and progressively have the agent do more of the solving itself.", "abstract": "Model-free reinforcement learning (RL) requires a large number of trials to learn a good policy, especially in environments with sparse rewards. We explore a method to improve the sample efficiency when we have access to demonstrations. Our approach, Backplay, uses a single demonstration to construct a curriculum for a given task. Rather than starting each training episode in the environment's fixed initial state, we start the agent near the end of the demonstration and move the starting point backwards during the course of training until we reach the initial state. Our contributions are that we analytically characterize the types of environments where Backplay can improve training speed, demonstrate the effectiveness of Backplay both in large grid worlds and a complex four player zero-sum game (Pommerman), and show that Backplay compares favorably to other competitive methods known to improve sample efficiency. 
This includes reward shaping, behavioral cloning, and reverse curriculum generation.", "keywords": "Exploration;Games;Pommerman;Bomberman;AI;Reinforcement Learning;Machine Learning", "primary_area": "", "supplementary_material": "", "author": "Cinjon Resnick;Roberta Raileanu;Sanyam Kapoor;Alexander Peysakhovich;Kyunghyun Cho;Joan Bruna", "authorids": "cinjon.resnick@gmail.com;raileanu.roberta@gmail.com;sanyam@nyu.edu;alexpeys@fb.com;kyunghyun.cho@nyu.edu;bruna@cims.nyu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nresnick2019backplay,\ntitle={Backplay: 'Man muss immer umkehren'},\nauthor={Cinjon Resnick and Roberta Raileanu and Sanyam Kapoor and Alexander Peysakhovich and Kyunghyun Cho and Joan Bruna},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xk8jAqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=H1xk8jAqKQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;4", "wc_review": "754;252;593", "wc_reply_reviewers": "149;0;55", "wc_reply_authors": "1227;214;810", "reply_reviewers": "1;0;1", "reply_authors": "2;1;2", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 533.0, 209.28608808677816 ], "wc_reply_reviewers_avg": [ 68.0, 61.51964455900787 ], "wc_reply_authors_avg": [ 750.3333333333334, 415.7020834951663 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10499586927241131199&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "H1xmqiAqFm", "title": "Investigating CNNs' Learning Representation under label noise", "track": "main", "status": "Reject", "tldr": "Are CNNs robust or fragile to label noise? Practically, robust.", "abstract": "Deep convolutional neural networks (CNNs) are known to be robust against label noise on extensive datasets. However, at the same time, CNNs are capable of memorizing all labels even if they are random, which means they can memorize corrupted labels. Are CNNs robust or fragile to label noise? Much of researches focusing on such memorization uses class-independent label noise to simulate label corruption, but this setting is simple and unrealistic. In this paper, we investigate the behavior of CNNs under class-dependently simulated label noise, which is generated based on the conceptual distance between classes of a large dataset (i.e., ImageNet-1k). Contrary to previous knowledge, we reveal CNNs are more robust to such class-dependent label noise than class-independent label noise. 
We also demonstrate the networks under class-dependent noise situations learn similar representation to the no noise situation, compared to class-independent noise situations.", "keywords": "learning with noisy labels;deep learning;convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Ryuichiro Hataya;Hideki Nakayama", "authorids": "hataya@nlab.ci.i.u-tokyo.ac.jp;nakayama@nlab.ci.i.u-tokyo.aco.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhataya2019investigating,\ntitle={Investigating {CNN}s' Learning Representation under label noise},\nauthor={Ryuichiro Hataya and Hideki Nakayama},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xmqiAqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1xmqiAqFm", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;5;4", "wc_review": "161;305;231", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "474;240;185", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 232.33333333333334, 58.79531349426491 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 299.6666666666667, 125.3005276214838 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7986643660847057765&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "H1xpe2C5Km", "title": "Trace-back along capsules and its application on semantic segmentation", "track": "main", "status": "Reject", "tldr": "A capsule-based semantic segmentation, in which the probabilities of the class labels are traced back through capsule pipeline. ", "abstract": "In this paper, we propose a capsule-based neural network model to solve the semantic segmentation problem. By taking advantage of the extractable part-whole dependencies available in capsule layers, we derive the probabilities of the class labels for individual capsules through a recursive, layer-by-layer procedure. We model this procedure as a traceback pipeline and take it as a central piece to build an end-to-end segmentation network. Under the proposed framework, image-level class labels and object boundaries are jointly sought in an explicit manner, which poses a significant advantage over the state-of-the-art fully convolutional network (FCN) solutions. Experiments conducted on modified MNIST and neuroimages demonstrate that our model considerably enhance the segmentation performance compared to the leading FCN variant.\n", "keywords": "capsule;capsule network;semantic segmentation;FCN", "primary_area": "", "supplementary_material": "", "author": "Tao Sun;Zhewei Wang;C. D. Smith;Jundong Liu", "authorids": "zw340113@ohio.edu;ts202115@ohio.edu;cdsmith.uk@gmail.com;liuj1@ohio.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsun2019traceback,\ntitle={Trace-back along capsules and its application on semantic segmentation \t\t},\nauthor={Tao Sun and Zhewei Wang and C. D. 
Smith and Jundong Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xpe2C5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=H1xpe2C5Km", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;4", "wc_review": "490;168;460", "wc_reply_reviewers": "257;0;181", "wc_reply_authors": "2217;375;898", "reply_reviewers": "1;0;1", "reply_authors": "4;1;2", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 372.6666666666667, 145.2385011703929 ], "wc_reply_reviewers_avg": [ 146.0, 107.79919603905526 ], "wc_reply_authors_avg": [ 1163.3333333333333, 775.0450882941514 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12150000832328588179&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Learning to Understand Goal Specifications by Modelling Reward", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/734", "id": "H1xsSjC9Ym", "author_site": "Dzmitry Bahdanau, Felix Hill, Jan Leike, Edward Hughes, Arian Hosseini, Pushmeet Kohli, Edward Grefenstette", "tldr": "We propose AGILE, a framework for training agents to perform instructions from examples of respective goal-states.", "abstract": "Recent work has shown that deep reinforcement-learning agents can learn to follow language-like instructions from infrequent environment rewards. However, this places on environment designers the onus of designing language-conditional reward functions which may not be easily or tractably implemented as the complexity of the environment and the language scales. To overcome this limitation, we present a framework within which instruction-conditional RL agents are trained using rewards obtained not from the environment, but from reward models which are jointly trained from expert examples. As reward models improve, they learn to accurately reward agents for completing tasks for environment configurations---and for instructions---not present amongst the expert data. This framework effectively separates the representation of what instructions require from how they can be executed.\nIn a simple grid world, it enables an agent to learn a range of commands requiring interaction with blocks and understanding of spatial relations and underspecified abstract arrangements. 
We further show the method allows our agent to adapt to changes in the environment without requiring new expert examples.", "keywords": "instruction following;reward modelling;language understanding", "primary_area": "", "supplementary_material": "", "author": "Dzmitry Bahdanau;Felix Hill;Jan Leike;Edward Hughes;Arian Hosseini;Pushmeet Kohli;Edward Grefenstette", "authorids": "dimabgv@gmail.com;felixhill@google.com;leike@google.com;edwardhughes@google.com;seyedarian.hosseini@umontreal.ca;pushmeet@google.com;etg@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nbahdanau2018learning,\ntitle={Learning to Understand Goal Specifications by Modelling Reward},\nauthor={Dzmitry Bahdanau and Felix Hill and Jan Leike and Edward Hughes and Pushmeet Kohli and Edward Grefenstette},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xsSjC9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;5;4", "wc_review": "500;1024;806", "wc_reply_reviewers": "0;500;1466", "wc_reply_authors": "865;1488;3284", "reply_reviewers": "0;1;6", "reply_authors": "2;4;8", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 776.6666666666666, 214.92531002394503 ], "wc_reply_reviewers_avg": [ 655.3333333333334, 608.4873777564238 ], "wc_reply_authors_avg": [ 1879.0, 1025.5245812103515 ], "reply_reviewers_avg": [ 2.3333333333333335, 2.6246692913372702 ], "reply_authors_avg": [ 4.666666666666667, 2.494438257849294 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 175, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12539431874776998736&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=H1xsSjC9Ym", "pdf": "https://openreview.net/pdf?id=H1xsSjC9Ym", "email": ";;;;;;", "author_num": 7 }, { "title": "Do Deep Generative Models Know What They Don't Know?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1015", "id": "H1xwNhCcYm", "author_site": "Eric Nalisnick, Akihiro Matsukawa, Yee Whye Teh, Dilan Gorur, Balaji Lakshminarayanan", "tldr": "", "abstract": "A neural network deployed in the wild may be asked to make predictions for inputs that were drawn from a different distribution than that of the training data. A plethora of work has demonstrated that it is easy to find or synthesize inputs for which a neural network is highly confident yet wrong. Generative models are widely viewed to be robust to such mistaken confidence as modeling the density of the input features can be used to detect novel, out-of-distribution inputs. In this paper we challenge this assumption. We find that the density learned by flow-based models, VAEs, and PixelCNNs cannot distinguish images of common objects such as dogs, trucks, and horses (i.e. CIFAR-10) from those of house numbers (i.e. SVHN), assigning a higher likelihood to the latter when the model is trained on the former. Moreover, we find evidence of this phenomenon when pairing several popular image data sets: FashionMNIST vs MNIST, CelebA vs SVHN, ImageNet vs CIFAR-10 / CIFAR-100 / SVHN. 
To investigate this curious behavior, we focus analysis on flow-based generative models in particular since they are trained and evaluated via the exact marginal likelihood. We find such behavior persists even when we restrict the flows to constant-volume transformations. These transformations admit some theoretical analysis, and we show that the difference in likelihoods can be explained by the location and variances of the data and the model curvature. \n Our results caution against using the density estimates from deep generative models to identify inputs similar to the training distribution until their behavior for out-of-distribution inputs is better understood.", "keywords": "deep generative models;out-of-distribution inputs;flow-based models;uncertainty;density", "primary_area": "", "supplementary_material": "", "author": "Eric Nalisnick;Akihiro Matsukawa;Yee Whye Teh;Dilan Gorur;Balaji Lakshminarayanan", "authorids": "e.nalisnick@eng.cam.ac.uk;amatsukawa@google.com;ywteh@google.com;dilang@google.com;balajiln@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nnalisnick2018do,\ntitle={Do Deep Generative Models Know What They Don't Know? },\nauthor={Eric Nalisnick and Akihiro Matsukawa and Yee Whye Teh and Dilan Gorur and Balaji Lakshminarayanan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1xwNhCcYm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=H1xwNhCcYm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;4", "wc_review": "322;635;306", "wc_reply_reviewers": "149;629;0", "wc_reply_authors": "543;1044;305", "reply_reviewers": "1;2;0", "reply_authors": "2;3;2", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 421.0, 151.46176635265635 ], "wc_reply_reviewers_avg": [ 259.3333333333333, 268.3782571090454 ], "wc_reply_authors_avg": [ 630.6666666666666, 307.9981962429145 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 903, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8498584058191576508&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=H1xwNhCcYm", "pdf": "https://openreview.net/pdf?id=H1xwNhCcYm", "email": ";;;;", "author_num": 5 }, { "title": "Identifying and Controlling Important Neurons in Neural Machine Translation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/654", "id": "H1z-PsR5KX", "author_site": "David A Bau, Yonatan Belinkov, Hassan Sajjad, Nadir Durrani, Fahim Dalvi, James R Glass", "tldr": "Unsupervised methods for finding, analyzing, and controlling important neurons in NMT", "abstract": "Neural machine translation (NMT) models learn representations containing substantial linguistic information. However, it is not clear if such information is fully distributed or if some of it can be attributed to individual neurons. 
We develop unsupervised methods for discovering important neurons in NMT models. Our methods rely on the intuition that different models learn similar properties, and do not require any costly external supervision. We show experimentally that translation quality depends on the discovered neurons, and find that many of them capture common linguistic phenomena. Finally, we show how to control NMT translations in predictable ways, by modifying activations of individual neurons.", "keywords": "neural machine translation;individual neurons;unsupervised;analysis;correlation;translation control;distributivity;localization", "primary_area": "", "supplementary_material": "", "author": "Anthony Bau;Yonatan Belinkov;Hassan Sajjad;Nadir Durrani;Fahim Dalvi;James Glass", "authorids": "abau@mit.edu;belinkov@mit.edu;hsajjad@hbku.edu.qa;ndurrani@qf.org.qa;faimaduddin@qf.org.qa;glass@mit.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nbau2018identifying,\ntitle={Identifying and Controlling Important Neurons in Neural Machine Translation},\nauthor={Anthony Bau and Yonatan Belinkov and Hassan Sajjad and Nadir Durrani and Fahim Dalvi and James Glass},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1z-PsR5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;10", "confidence": "4;3;3", "wc_review": "474;208;67", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "871;520;27", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.666666666666667, 1.699673171197595 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 249.66666666666666, 168.74899176653537 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 472.6666666666667, 346.1833168841169 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.6933752452815364, "gs_citation": 218, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10670221460130643181&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13, "openreview": "https://openreview.net/forum?id=H1z-PsR5KX", "pdf": "https://openreview.net/pdf?id=H1z-PsR5KX", "email": ";;;;;", "author_num": 6 }, { "id": "H1zW13R5tm", "title": "Bamboo: Ball-Shape Data Augmentation Against Adversarial Attacks from All Directions", "track": "main", "status": "Withdraw", "tldr": "The first data augmentation method specially designed for improving the general robustness of DNN without any hypothesis on the attacking algorithms.", "abstract": "Deep neural networks (DNNs) are widely adopted in real-world cognitive applications because of their high accuracy. The robustness of DNN models, however, has been recently challenged by adversarial attacks where small disturbance on input samples may result in misclassification. State-of-the-art defending algorithms, such as adversarial training or robust optimization, improve DNNs' resilience to adversarial attacks by paying high computational costs. Moreover, these approaches are usually designed to defend one or a few known attacking techniques only. 
The effectiveness to defend other types of attacking methods, especially those that have not yet been discovered or explored, cannot be guaranteed. This work aims for a general approach of enhancing the robustness of DNN models under adversarial attacks. In particular, we propose Bamboo -- the first data augmentation method designed for improving the general robustness of DNN without any hypothesis on the attacking algorithms. Bamboo augments the training data set with a small amount of data uniformly sampled on a fixed radius ball around each training data and hence, effectively increase the distance between natural data points and decision boundary. Our experiments show that Bamboo substantially improve the general robustness against arbitrary types of attacks and noises, achieving better results comparing to previous adversarial training methods, robust optimization methods and other data augmentation methods with the same amount of data points.", "keywords": "DNN robustness;Adversarial attack;Data augmentation", "primary_area": "", "supplementary_material": "", "author": "Huanrui Yang;Jingchi Zhang;Hsin-Pai Cheng;Wenhan Wang;Yiran Chen;Hai Li", "authorids": "huanrui.yang@duke.edu;jingchi.zhang@duke.edu;hc218@duke.edu;wenhanw@microsoft.com;yiran.chen@duke.edu;hai.li@duke.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1zW13R5tm", "pdf_size": 0, "rating": "3;4", "confidence": "5;3", "wc_review": "422;12", "wc_reply_reviewers": "0;0", "wc_reply_authors": "0;0", "reply_reviewers": "0;0", "reply_authors": "0;0", "rating_avg": [ 3.5, 0.5 ], "confidence_avg": [ 4.0, 1.0 ], "wc_review_avg": [ 217.0, 205.0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9999999999999999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14485900684708131659&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "H1z_Z2A5tX", "title": "DON\u2019T JUDGE A BOOK BY ITS COVER - ON THE DYNAMICS OF RECURRENT NEURAL NETWORKS", "track": "main", "status": "Reject", "tldr": "", "abstract": "To be effective in sequential data processing, Recurrent Neural Networks (RNNs) are required to keep track of past events by creating memories. Consequently RNNs are harder to train than their feedforward counterparts, prompting the developments of both dedicated units such as LSTM and GRU and of a handful of training tricks. In this paper, we investigate the effect of different training protocols on the representation of memories in RNN. While reaching similar performance for different protocols, RNNs are shown to exhibit substantial differences in their ability to generalize for unforeseen tasks or conditions. We analyze the dynamics of the network\u2019s hidden state, and uncover the reasons for this difference. Each memory is found to be associated with a nearly steady state of the dynamics whose speed predicts performance on unforeseen tasks and which we refer to as a \u2019slow point\u2019. By tracing the formation of the slow points we are able to understand the origin of differences between training protocols. 
Our results show that multiple solutions to the same task exist but may rely on different dynamical mechanisms, and that training protocols can bias the choice of such solutions in an interpretable way.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Doron Haviv;Alexander Rivkind;Omri Barak", "authorids": "doron.haviv12@gmail.com;sashkarivkind@gmail.com;omri.barak@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhaviv2019dont,\ntitle={{DON}\u2019T {JUDGE} A {BOOK} {BY} {ITS} {COVER} - {ON} {THE} {DYNAMICS} {OF} {RECURRENT} {NEURAL} {NETWORKS}},\nauthor={Doron Haviv and Alexander Rivkind and Omri Barak},\nyear={2019},\nurl={https://openreview.net/forum?id=H1z_Z2A5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=H1z_Z2A5tX", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "wc_review": "299;670;429", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "217;501;972", "reply_reviewers": "0;0;0", "reply_authors": "1;2;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 466.0, 153.70317715215475 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 563.3333333333334, 311.362953623081 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Tr71r8FKrvsJ:scholar.google.com/&scioq=DON%E2%80%99T+JUDGE+A+BOOK+BY+ITS+COVER+-+ON+THE+DYNAMICS+OF+RECURRENT+NEURAL+NETWORKS&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Representing Formal Languages: A Comparison Between Finite Automata and Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1023", "id": "H1zeHnA9KX", "author_site": "Joshua Michalenko, Ameesh Shah, Abhinav Verma, Richard Baraniuk, Swarat Chaudhuri, Ankit B Patel", "tldr": "Finite Automata Can be Linearly decoded from Language-Recognizing RNNs using low coarseness abstraction functions and high accuracy decoders. ", "abstract": "We investigate the internal representations that a recurrent neural network (RNN) uses while learning to recognize a regular formal language. Specifically, we train a RNN on positive and negative examples from a regular language, and ask if there is a simple decoding function that maps states of this RNN to states of the minimal deterministic finite automaton (MDFA) for the language. Our experiments show that such a decoding function indeed exists, and that it maps states of the RNN not to MDFA states, but to states of an {\\em abstraction} obtained by clustering small sets of MDFA states into ``''superstates''. A qualitative analysis reveals that the abstraction often has a simple interpretation. Overall, the results suggest a strong structural relationship between internal representations used by RNNs and finite automata, and explain the well-known ability of RNNs to recognize formal grammatical structure. \n", "keywords": "Language recognition;Recurrent Neural Networks;Representation Learning;deterministic finite automaton;automaton", "primary_area": "", "supplementary_material": "", "author": "Joshua J. 
Michalenko;Ameesh Shah;Abhinav Verma;Richard G. Baraniuk;Swarat Chaudhuri;Ankit B. Patel", "authorids": "jjm7@rice.edu;ameesh@rice.edu;averma@rice.edu;richb@rice.edu;swarat@rice.edu;abp4@rice.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nmichalenko2018finite,\ntitle={Finite Automata Can be Linearly Decoded from Language-Recognizing {RNN}s},\nauthor={Joshua J. Michalenko and Ameesh Shah and Abhinav Verma and Swarat Chaudhuri and Ankit B. Patel},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1zeHnA9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;3;3", "wc_review": "581;464;266", "wc_reply_reviewers": "0;171;0", "wc_reply_authors": "393;1306;422", "reply_reviewers": "0;1;0", "reply_authors": "2;3;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 437.0, 130.0076920801227 ], "wc_reply_reviewers_avg": [ 57.0, 80.61017305526642 ], "wc_reply_authors_avg": [ 707.0, 423.72239339768987 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1743679466435781138&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=H1zeHnA9KX", "pdf": "https://openreview.net/pdf?id=H1zeHnA9KX", "email": ";;;;;", "author_num": 6 }, { "title": "Visual Explanation by Interpretation: Improving Visual Feedback Capabilities of Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/879", "id": "H1ziPjC5Fm", "author_site": "Jos\u00e9 Antonio Oramas Mogrovejo, Kaili Wang, Tinne Tuytelaars", "tldr": "Interpretation by Identifying model-learned features that serve as indicators for the task of interest. Explain model decisions by highlighting the response of these features in test data. Evaluate explanations objectively with a controlled dataset.", "abstract": "Visual Interpretation and explanation of deep models is critical towards wide adoption of systems that rely on them. In this paper, we propose a novel scheme for both interpretation as well as explanation in which, given a pretrained model, we automatically identify internal features relevant for the set of classes considered by the model, without relying on additional annotations. We interpret the model through average visualizations of this reduced set of features. Then, at test time, we explain the network prediction by accompanying the predicted class label with supporting visualizations derived from the identified features. In addition, we propose a method to address the artifacts introduced by strided operations in deconvNet-based visualizations. Moreover, we introduce an8Flower , a dataset specifically designed for objective quantitative evaluation of methods for visual explanation. 
Experiments on the MNIST , ILSVRC 12, Fashion 144k and an8Flower datasets show that our method produces detailed explanations with good coverage of relevant features of the classes of interest.", "keywords": "model explanation;model interpretation;explainable ai;evaluation", "primary_area": "", "supplementary_material": "", "author": "Jose Oramas;Kaili Wang;Tinne Tuytelaars", "authorids": "jose.oramas@esat.kuleuven.be;kaili.wang@esat.kuleuven.be;tinne.tuytelaars@esat.kuleuven.be", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\noramas2018visual,\ntitle={Visual Explanation by Interpretation: Improving Visual Feedback Capabilities of Deep Neural Networks},\nauthor={Jose Oramas and Kaili Wang and Tinne Tuytelaars},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=H1ziPjC5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "4;5;8", "confidence": "5;3;4", "wc_review": "392;977;458", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "995;935;280", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 5.666666666666667, 1.699673171197595 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 609.0, 261.60657484092405 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 736.6666666666666, 323.8398095080687 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.2401922307076307, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5644310437973500116&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=H1ziPjC5Fm", "pdf": "https://openreview.net/pdf?id=H1ziPjC5Fm", "email": ";;", "author_num": 3 }, { "id": "H1zxjsCqKQ", "title": "Gradient-based learning for F-measure and other performance metrics", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many important classification performance metrics, e.g. $F$-measure, are non-differentiable and non-decomposable, and are thus unfriendly to gradient descent algorithm.\nConsequently, despite their popularity as evaluation metrics, these metrics are rarely optimized as training objectives in neural network community.\nIn this paper, we propose an empirical utility maximization scheme with provable learning guarantees to address the non-differentiability of these metrics. \nWe then derive a strongly consistent gradient estimator to handle non-decomposability.\nThese innovations enable end-to-end optimization of these metrics with the same computational complexity as optimizing a decomposable and differentiable metric, e.g. 
cross-entropy loss.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yu Gai;Zheng Zhang;Kyunghyun Cho", "authorids": "yg1246@nyu.edu;zz@nyu.edu;kyunghyun.cho@nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngai2019gradientbased,\ntitle={Gradient-based learning for F-measure and other performance metrics},\nauthor={Yu Gai and Zheng Zhang and Kyunghyun Cho},\nyear={2019},\nurl={https://openreview.net/forum?id=H1zxjsCqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=H1zxjsCqKQ", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;5;3", "wc_review": "490;377;339", "wc_reply_reviewers": "19;0;0", "wc_reply_authors": "214;173;196", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 402.0, 64.13007614736371 ], "wc_reply_reviewers_avg": [ 6.333333333333333, 8.956685895029603 ], "wc_reply_authors_avg": [ 194.33333333333334, 16.779617264870957 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jcs5OPvGuE0J:scholar.google.com/&scioq=Gradient-based+learning+for+F-measure+and+other+performance+metrics&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Don't let your Discriminator be fooled", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1086", "id": "HJE6X305Fm", "author_site": "Brady Zhou, Philipp Kr\u00e4henb\u00fchl", "tldr": "A discriminator that is not easily fooled by adversarial example makes GAN training more robust and leads to a smoother objective.", "abstract": "Generative Adversarial Networks are one of the leading tools in generative modeling, image editing and content creation. \nHowever, they are hard to train as they require a delicate balancing act between two deep networks fighting a never ending duel. Some of the most promising adversarial models today minimize a Wasserstein objective. It is smoother and more stable to optimize. In this paper, we show that the Wasserstein distance is just one out of a large family of objective functions that yield these properties. By making the discriminator of a GAN robust to adversarial attacks we can turn any GAN objective into a smooth and stable loss. We experimentally show that any GAN objective, including Wasserstein GANs, benefit from adversarial robustness both quantitatively and qualitatively. 
The training additionally becomes more robust to suboptimal choices of hyperparameters, model architectures, or objective functions.", "keywords": "GAN;generative models;computer vision", "primary_area": "", "supplementary_material": "", "author": "Brady Zhou;Philipp Kr\u00e4henb\u00fchl", "authorids": "brady.zhou@utexas.edu;philkr@cs.utexas.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nzhou2018dont,\ntitle={Don't let your Discriminator be fooled},\nauthor={Brady Zhou and Philipp Kr\u00e4henb\u00fchl},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJE6X305Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;3", "wc_review": "355;111;146", "wc_reply_reviewers": "51;0;0", "wc_reply_authors": "513;14;23", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 204.0, 107.72495842035246 ], "wc_reply_reviewers_avg": [ 17.0, 24.041630560342615 ], "wc_reply_authors_avg": [ 183.33333333333334, 233.13849007736914 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8029169282646543117&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=HJE6X305Fm", "pdf": "https://openreview.net/pdf?id=HJE6X305Fm", "email": ";", "author_num": 2 }, { "id": "HJEhIjA9tQ", "title": "Encoder Discriminator Networks for Unsupervised Representation Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Learning representations of data samples in an unsupervised way is needed whenever computers have to reason about unlabeled data. Applications range from compressing and denoising data to super-resolution, generating new samples from a given sample distribution and much more.\nIn this work, we use information entropy and a little game to motivate a new encoder discriminator architecture in order to learn unsupervised latent representations. Inspired by the game \"Taboo\", we train an encoder network to generate a meaningful representation of one particular sample of a dataset. Using this description, a discriminator network then has to retrieve the same sample from the whole dataset. 
We show that learning in this manner on many different samples repeatedly minimizes the information entropy given the latent description and, thus, forces the encoder network to make precise descriptions that can be interpreted by the discriminator.\nWe provide first results of this method on the MNIST and the Fashion MNIST dataset.", "keywords": "representation learning;unsupervised;encoder discriminator", "primary_area": "", "supplementary_material": "", "author": "Nils Wandel", "authorids": "nils.wandel@ais.uni-bonn.de", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJEhIjA9tQ", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;4", "wc_review": "272;170;256", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "106;248;318", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 232.66666666666666, 44.790872085975536 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 224.0, 88.19674975114823 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:roi2BUht9_sJ:scholar.google.com/&scioq=Encoder+Discriminator+Networks+for+Unsupervised+Representation+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJG0ojCcFm", "title": "Negotiating Team Formation Using Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "Reinforcement learning can be used to train agents to negotiate team formation across many negotiation protocols", "abstract": "When autonomous agents interact in the same environment, they must often cooperate to achieve their goals. One way for agents to cooperate effectively is to form a team, make a binding agreement on a joint plan, and execute it. However, when agents are self-interested, the gains from team formation must be allocated appropriately to incentivize agreement. Various approaches for multi-agent negotiation have been proposed, but typically only work for particular negotiation protocols. More general methods usually require human input or domain-specific data, and so do not scale. To address this, we propose a framework for training agents to negotiate and form teams using deep reinforcement learning. Importantly, our method makes no assumptions about the specific negotiation protocol, and is instead completely experience driven. We evaluate our approach on both non-spatial and spatially extended team-formation negotiation environments, demonstrating that our agents beat hand-crafted bots and reach negotiation outcomes consistent with fair solutions predicted by cooperative game theory. 
Additionally, we investigate how the physical location of agents influences negotiation outcomes.", "keywords": "Reinforcement Learning;Negotiation;Team Formation;Cooperative Game Theory;Shapley Value", "primary_area": "", "supplementary_material": "", "author": "Yoram Bachrach;Richard Everett;Edward Hughes;Angeliki Lazaridou;Joel Leibo;Marc Lanctot;Mike Johanson;Wojtek Czarnecki;Thore Graepel", "authorids": "yorambac@google.com;reverett@google.com;edwardhughes@google.com;jzl@google.com;angeliki@google.com;lanctot@google.com;mjohanson@google.com;lejlot@google.com;thore@google.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@misc{\nbachrach2019negotiating,\ntitle={Negotiating Team Formation Using Deep Reinforcement Learning},\nauthor={Yoram Bachrach and Richard Everett and Edward Hughes and Angeliki Lazaridou and Joel Leibo and Marc Lanctot and Mike Johanson and Wojtek Czarnecki and Thore Graepel},\nyear={2019},\nurl={https://openreview.net/forum?id=HJG0ojCcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJG0ojCcFm", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;2", "wc_review": "324;1283;228", "wc_reply_reviewers": "89;87;0", "wc_reply_authors": "1113;1147;657", "reply_reviewers": "1;1;0", "reply_authors": "2;2;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 611.6666666666666, 476.31945396154276 ], "wc_reply_reviewers_avg": [ 58.666666666666664, 41.49163235588057 ], "wc_reply_authors_avg": [ 972.3333333333334, 223.40595834091405 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12027803980037835218&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "HJG1Uo09Fm", "title": "Learning to Reinforcement Learn by Imitation", "track": "main", "status": "Reject", "tldr": "", "abstract": "Meta-reinforcement learning aims to learn fast reinforcement learning (RL) procedures that can be applied to new tasks or environments. While learning fast RL procedures holds promise for allowing agents to autonomously learn a diverse range of skills, existing methods for learning efficient RL are impractical for real world settings, as they rely on slow reinforcement learning algorithms for meta-training, even when the learned procedures are fast. In this paper, we propose to learn a fast reinforcement learning procedure through supervised imitation of an expert, such that, after meta-learning, an agent can quickly learn new tasks through trial-and-error. Through our proposed method, we show that it is possible to learn fast RL using demonstrations, rather than relying on slow RL, where expert agents can be trained quickly by using privileged information or off-policy RL methods. 
Our experimental evaluation on a number of complex simulated robotic domains demonstrates that our method can effectively learn to learn from spare rewards and is significantly more efficient than prior meta reinforcement learning algorithms.", "keywords": "meta-learning;reinforcement learning;imitation learning", "primary_area": "", "supplementary_material": "", "author": "Rosen Kralev;Russell Mendonca;Alvin Zhang;Tianhe Yu;Abhishek Gupta;Pieter Abbeel;Sergey Levine;Chelsea Finn", "authorids": "rdkralev@gmail.com;russellm@berkeley.edu;alvinz@berkeley.edu;tianheyu927@gmail.com;abhigupta@berkeley.edu;pabbeel@cs.berkeley.edu;svlevine@eecs.berkeley.edu;cbfinn@eecs.berkeley.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nkralev2019learning,\ntitle={Learning to Reinforcement Learn by Imitation},\nauthor={Rosen Kralev and Russell Mendonca and Alvin Zhang and Tianhe Yu and Abhishek Gupta and Pieter Abbeel and Sergey Levine and Chelsea Finn},\nyear={2019},\nurl={https://openreview.net/forum?id=HJG1Uo09Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer5;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=HJG1Uo09Fm", "pdf_size": 0, "rating": "2;3;4;5", "confidence": "5;2;3;2", "wc_review": "490;135;264;73", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 3.5, 1.118033988749895 ], "confidence_avg": [ 3.0, 1.224744871391589 ], "wc_review_avg": [ 240.5, 159.6785834105501 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.7302967433402213, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HJG7m2AcF7", "title": "Context Mover's Distance & Barycenters: Optimal transport of contexts for building representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a unified framework for building unsupervised representations of entities and their compositions, by viewing each entity as a histogram (or distribution) over its contexts. This enables us to take advantage of optimal transport and construct representations that effectively harness the geometry of the underlying space containing the contexts. Our method captures uncertainty via modelling the entities as distributions and simultaneously provides interpretability with the optimal transport map, hence giving a novel perspective for building rich and powerful feature representations. As a guiding example, we formulate unsupervised representations for text, and demonstrate it on tasks such as sentence similarity and word entailment detection. Empirical results show strong advantages gained through the proposed framework. This approach can potentially be used for any unsupervised or supervised problem (on text or other modalities) with a co-occurrence structure, such as any sequence data. 
The key tools at the core of this framework are Wasserstein distances and Wasserstein barycenters.", "keywords": "representation learning;wasserstein distance;wasserstein barycenter;entailment", "primary_area": "", "supplementary_material": "", "author": "Sidak Pal Singh;Andreas Hug;Aymeric Dieuleveut;Martin Jaggi", "authorids": "sidak.singh@epfl.ch;andreas.hug@epfl.ch;aymeric.dieuleveut@epfl.ch;martin.jaggi@epfl.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsingh2019context,\ntitle={Context Mover's Distance & Barycenters: Optimal transport of contexts for building representations},\nauthor={Sidak Pal Singh and Andreas Hug and Aymeric Dieuleveut and Martin Jaggi},\nyear={2019},\nurl={https://openreview.net/forum?id=HJG7m2AcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJG7m2AcF7", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;4", "wc_review": "417;423;906", "wc_reply_reviewers": "0;0;475", "wc_reply_authors": "1104;876;1508", "reply_reviewers": "0;0;2", "reply_authors": "3;2;4", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 582.0, 229.11569130026865 ], "wc_reply_reviewers_avg": [ 158.33333333333334, 223.91714737574003 ], "wc_reply_authors_avg": [ 1162.6666666666667, 261.3265305237024 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1480733432414277760&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "title": "Latent Convolutional Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/773", "id": "HJGciiR5Y7", "author_site": "ShahRukh Athar, Evgeny Burnaev, Victor Lempitsky", "tldr": "We present a new deep latent model of natural images that can be trained from unlabeled datasets and can be utilized to solve various image restoration tasks.", "abstract": "We present a new latent model of natural images that can be learned on large-scale datasets. The learning process provides a latent embedding for every image in the training dataset, as well as a deep convolutional network that maps the latent space to the image space. After training, the new model provides a strong and universal image prior for a variety of image restoration tasks such as large-hole inpainting, superresolution, and colorization. To model high-resolution natural images, our approach uses latent spaces of very high dimensionality (one to two orders of magnitude higher than previous latent image models). To tackle this high dimensionality, we use latent spaces with a special manifold structure (convolutional manifolds) parameterized by a ConvNet of a certain architecture. In the experiments, we compare the learned latent models with latent models learned by autoencoders, advanced variants of generative adversarial networks, and a strong baseline system using simpler parameterization of the latent space. 
Our model outperforms the competing approaches over a range of restoration tasks.", "keywords": "latent models;convolutional networks;unsupervised learning;deep learning;modeling natural images;image restoration", "primary_area": "", "supplementary_material": "", "author": "ShahRukh Athar;Evgeny Burnaev;Victor Lempitsky", "authorids": "sathar@cs.stonybrook.edu;e.burnaev@skoltech.ru;lempitsky@skoltech.ru", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nathar2018latent,\ntitle={Latent Convolutional Models},\nauthor={ShahRukh Athar and Evgeny Burnaev and Victor Lempitsky},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJGciiR5Y7},\n}", "github": "[![github](/images/github_icon.svg) srxdev0619/Latent_Convolutional_Models](https://github.com/srxdev0619/Latent_Convolutional_Models)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;2", "wc_review": "114;264;366", "wc_reply_reviewers": "0;83;0", "wc_reply_authors": "240;326;412", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 248.0, 103.49879226348489 ], "wc_reply_reviewers_avg": [ 27.666666666666668, 39.12657522565563 ], "wc_reply_authors_avg": [ 326.0, 70.21870595978444 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1201013501878383620&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HJGciiR5Y7", "pdf": "https://openreview.net/pdf?id=HJGciiR5Y7", "email": ";;", "author_num": 3 }, { "title": "A Universal Music Translation Network", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/993", "id": "HJGkisCcKm", "author_site": "Noam Mor, Lior Wolf, Adam Polyak, Yaniv Taigman", "tldr": "An automatic method for converting music between instruments and styles", "abstract": "We present a method for translating music across musical instruments and styles. This method is based on unsupervised training of a multi-domain wavenet autoencoder, with a shared encoder and a domain-independent latent space that is trained end-to-end on waveforms. Employing a diverse training dataset and large net capacity, the single encoder allows us to translate also from musical domains that were not seen during training. We evaluate our method on a dataset collected from professional musicians, and achieve convincing translations. We also study the properties of the obtained translation and demonstrate translating even from a whistle, potentially enabling the creation of instrumental music by untrained humans. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Noam Mor;Lior Wolf;Adam Polyak;Yaniv Taigman", "authorids": "noam.mor@gmail.com;wolf@fb.com;adampolyak@fb.com;yaniv@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmor2018autoencoderbased,\ntitle={Autoencoder-based Music Translation},\nauthor={Noam Mor and Lior Wolf and Adam Polyak and Yaniv Taigman},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJGkisCcKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "wc_review": "307;739;570", "wc_reply_reviewers": "32;0;84", "wc_reply_authors": "451;724;433", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 538.6666666666666, 177.74951164177327 ], "wc_reply_reviewers_avg": [ 38.666666666666664, 34.61534662865912 ], "wc_reply_authors_avg": [ 536.0, 133.13902508280583 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 160, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6168332349111008894&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=HJGkisCcKm", "pdf": "https://openreview.net/pdf?id=HJGkisCcKm", "email": ";;;", "author_num": 4 }, { "id": "HJGtFoC5Fm", "title": "On the Margin Theory of Feedforward Neural Networks", "track": "main", "status": "Reject", "tldr": "We show that training feedforward relu networks with a weak regularizer results in a maximum margin and analyze the implications of this result.", "abstract": "Past works have shown that, somewhat surprisingly, over-parametrization can help generalization in neural networks. Towards explaining this phenomenon, we adopt a margin-based perspective. We establish: 1) for multi-layer feedforward relu networks, the global minimizer of a weakly-regularized cross-entropy loss has the maximum normalized margin among all networks, 2) as a result, increasing the over-parametrization improves the normalized margin and generalization error bounds for deep networks. In the case of two-layer networks, an infinite-width neural network enjoys the best generalization guarantees. The typical infinite feature methods are kernel methods; we compare the neural net margin with that of kernel methods and construct natural instances where kernel methods have much weaker generalization guarantees. We validate this gap between the two approaches empirically. Finally, this infinite-neuron viewpoint is also fruitful for analyzing optimization. 
We show that a perturbed gradient flow on infinite-size networks finds a global optimizer in polynomial time.", "keywords": "generalization theory;implicit regularization;generalization;over-parametrization;theory;deep learning theory;margin", "primary_area": "", "supplementary_material": "", "author": "Colin Wei;Jason Lee;Qiang Liu;Tengyu Ma", "authorids": "colinwei@stanford.edu;jasonlee@marshall.usc.edu;lqiang@cs.texas.edu;tengyuma@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwei2019on,\ntitle={On the Margin Theory of Feedforward Neural Networks},\nauthor={Colin Wei and Jason Lee and Qiang Liu and Tengyu Ma},\nyear={2019},\nurl={https://openreview.net/forum?id=HJGtFoC5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJGtFoC5Fm", "pdf_size": 0, "rating": "5;5;6;7", "confidence": "4;4;4;3", "wc_review": "496;891;250;408", "wc_reply_reviewers": "0;0;206;0", "wc_reply_authors": "773;823;872;846", "reply_reviewers": "0;0;1;0", "reply_authors": "1;2;2;2", "rating_avg": [ 5.75, 0.82915619758885 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "wc_review_avg": [ 511.25, 236.30211065498335 ], "wc_reply_reviewers_avg": [ 51.5, 89.20061658979718 ], "wc_reply_authors_avg": [ 828.5, 36.43144246389374 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.75, 0.4330127018922193 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8703882797784891, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13486893892059059668&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "How to train your MAML", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1106", "id": "HJGven05Y7", "author_site": "Antreas Antoniou, Harrison Edwards, Amos Storkey", "tldr": "MAML is great, but it has many problems, we solve many of those problems and as a result we learn most hyper parameters end to end, speed-up training and inference and set a new SOTA in few-shot learning", "abstract": "The field of few-shot learning has recently seen substantial advancements. Most of these advancements came from casting few-shot learning as a meta-learning problem.Model Agnostic Meta Learning or MAML is currently one of the best approaches for few-shot learning via meta-learning. MAML is simple, elegant and very powerful, however, it has a variety of issues, such as being very sensitive to neural network architectures, often leading to instability during training, requiring arduous hyperparameter searches to stabilize training and achieve high generalization and being very computationally expensive at both training and inference times. 
In this paper, we propose various modifications to MAML that not only stabilize the system, but also substantially improve the generalization performance, convergence speed and computational overhead of MAML, which we call MAML++.", "keywords": "meta-learning;deep-learning;few-shot learning;supervised learning;neural-networks;stochastic optimization", "primary_area": "", "supplementary_material": "", "author": "Antreas Antoniou;Harrison Edwards;Amos Storkey", "authorids": "a.antoniou@sms.ed.ac.uk;h.l.edwards@sms.ac.uk;a.storkey@sms.ed.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nantoniou2018how,\ntitle={How to train your {MAML}},\nauthor={Antreas Antoniou and Harrison Edwards and Amos Storkey},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJGven05Y7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 9 community implementations](https://paperswithcode.com/paper/?openreview=HJGven05Y7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;5;4", "wc_review": "278;303;596", "wc_reply_reviewers": "0;0;257", "wc_reply_authors": "509;717;361", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 392.3333333333333, 144.37528258750604 ], "wc_reply_reviewers_avg": [ 85.66666666666667, 121.15096184329514 ], "wc_reply_authors_avg": [ 529.0, 146.0228292653812 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 1039, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12854985256703612425&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=HJGven05Y7", "pdf": "https://openreview.net/pdf?id=HJGven05Y7", "email": ";;", "author_num": 3 }, { "id": "HJM4SjR5KQ", "title": "SpaMHMM: Sparse Mixture of Hidden Markov Models for Graph Connected Entities", "track": "main", "status": "Withdraw", "tldr": "A method to model the generative distribution of sequences coming from graph connected entities.", "abstract": "We propose a framework to model the distribution of sequential data coming from\na set of entities connected in a graph with a known topology. The method is\nbased on a mixture of shared hidden Markov models (HMMs), which are trained\nin order to exploit the knowledge of the graph structure and in such a way that the\nobtained mixtures tend to be sparse. Experiments in different application domains\ndemonstrate the effectiveness and versatility of the method.", "keywords": "multi-entity sequential data;hidden markov models", "primary_area": "", "supplementary_material": "", "author": "Diogo Pernes;Jaime S. 
Cardoso", "authorids": "dpc@inesctec.pt;jaime.cardoso@inesctec.pt", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJM4SjR5KQ", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "wc_review": "197;404;59", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "520;577;565", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 220.0, 141.78152206828645 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 554.0, 24.535688292770594 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1037368174342631197&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "HJM4rsRqFX", "title": "Neural Variational Inference For Embedding Knowledge Graphs", "track": "main", "status": "Reject", "tldr": "Working toward generative knowledge graph models to better estimate predictive uncertainty in knowledge inference. ", "abstract": "Recent advances in Neural Variational Inference allowed for a renaissance in latent variable models in a variety of domains involving high-dimensional data. In this paper, we introduce two generic Variational Inference frameworks for generative models of Knowledge Graphs; Latent Fact Model and Latent Information Model. While traditional variational methods derive an analytical approximation for the intractable distribution over the latent variables, here we construct an inference network conditioned on the symbolic representation of entities and relation types in the Knowledge Graph, to provide the variational distributions. The new framework can create models able to discover underlying probabilistic semantics for the symbolic representation by utilising parameterisable distributions which permit training by back-propagation in the context of neural variational inference, resulting in a highly-scalable method. Under a Bernoulli sampling framework, we provide an alternative justification for commonly used techniques in large-scale stochastic variational inference, which drastically reduces training time at a cost of an additional approximation to the variational lower bound. The generative frameworks are flexible enough to allow training under any prior distribution that permits a re-parametrisation trick, as well as under any scoring function that permits maximum likelihood estimation of the parameters. Experiment results display the potential and efficiency of this framework by improving upon multiple benchmarks with Gaussian prior representations. Code publicly available on Github.", "keywords": "Statistical Relational Learning;Knowledge Graphs;Knowledge Extraction;Latent Feature Models;Variational Inference.", "primary_area": "", "supplementary_material": "", "author": "Alexander I. 
Cowen-Rivers;Pasquale Minervini", "authorids": "mc_rivers@icloud.com;p.minervini@ucl.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ncowen-rivers2019neural,\ntitle={Neural Variational Inference For Embedding Knowledge Graphs},\nauthor={Alexander I. Cowen-Rivers and Pasquale Minervini},\nyear={2019},\nurl={https://openreview.net/forum?id=HJM4rsRqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJM4rsRqFX", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;5;3", "wc_review": "648;213;105", "wc_reply_reviewers": "74;0;0", "wc_reply_authors": "687;278;214", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 322.0, 234.6955474652214 ], "wc_reply_reviewers_avg": [ 24.666666666666668, 34.883934538536344 ], "wc_reply_authors_avg": [ 393.0, 209.52485930472943 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lUwdQjQ_mJAJ:scholar.google.com/&scioq=Neural+Variational+Inference+For+Embedding+Knowledge+Graphs&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "title": "Learning a SAT Solver from Single-Bit Supervision", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/726", "id": "HJMC_iA5tm", "author_site": "Daniel Selsam, Matthew Lamm, Benedikt B\\\"{u}nz, Percy Liang, Leonardo Moura, David L Dill", "tldr": "We train a graph network to predict boolean satisfiability and show that it learns to search for solutions, and that the solutions it finds can be decoded from its activations.", "abstract": "We present NeuroSAT, a message passing neural network that learns to solve SAT problems after only being trained as a classifier to predict satisfiability. Although it is not competitive with state-of-the-art SAT solvers, NeuroSAT can solve problems that are substantially larger and more difficult than it ever saw during training by simply running for more iterations. Moreover, NeuroSAT generalizes to novel distributions; after training only on random SAT problems, at test time it can solve SAT problems encoding graph coloring, clique detection, dominating set, and vertex cover problems, all on a range of distributions over small random graphs.", "keywords": "sat;search;graph neural network;theorem proving;proof", "primary_area": "", "supplementary_material": "", "author": "Daniel Selsam;Matthew Lamm;Benedikt B\\\"{u}nz;Percy Liang;Leonardo de Moura;David L. Dill", "authorids": "dselsam@cs.stanford.edu;mlamm@cs.stanford.edu;buenz@cs.stanford.edu;pliang@cs.stanford.edu;leonardo@microsoft.com;dill@cs.stanford.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nselsam2018learning,\ntitle={Learning a {SAT} Solver from Single-Bit Supervision},\nauthor={Daniel Selsam and Matthew Lamm and Benedikt B\\\"{u}nz and Percy Liang and Leonardo de Moura and David L. 
Dill},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJMC_iA5tm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=HJMC_iA5tm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;3", "wc_review": "443;839;253", "wc_reply_reviewers": "0;543;0", "wc_reply_authors": "443;1202;166", "reply_reviewers": "0;2;0", "reply_authors": "1;3;2", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 511.6666666666667, 244.11108582410225 ], "wc_reply_reviewers_avg": [ 181.0, 255.9726547895302 ], "wc_reply_authors_avg": [ 603.6666666666666, 437.9378443975304 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 541, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6266294675244210264&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=HJMC_iA5tm", "pdf": "https://openreview.net/pdf?id=HJMC_iA5tm", "email": ";;;;;", "author_num": 6 }, { "title": "Learning Representations of Sets through Optimized Permutations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/692", "id": "HJMCcjAcYX", "author_site": "Yan Zhang, Jonathon Hare, Adam Prugel-Bennett", "tldr": "Learn how to permute a set, then encode permuted set with RNN to obtain a set representation.", "abstract": "Representations of sets are challenging to learn because operations on sets should be permutation-invariant. To this end, we propose a Permutation-Optimisation module that learns how to permute a set end-to-end. The permuted set can be further processed to learn a permutation-invariant representation of that set, avoiding a bottleneck in traditional set models. 
We demonstrate our model's ability to learn permutations and set representations with either explicit or implicit supervision on four datasets, on which we achieve state-of-the-art results: number sorting, image mosaics, classification from image mosaics, and visual question answering.\n", "keywords": "sets;representation learning;permutation invariance", "primary_area": "", "supplementary_material": "", "author": "Yan Zhang;Jonathon Hare;Adam Pr\u00fcgel-Bennett", "authorids": "yz5n12@ecs.soton.ac.uk;jsh2@ecs.soton.ac.uk;apb@ecs.soton.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhang2018learning,\ntitle={Learning Representations of Sets through Optimized Permutations},\nauthor={Yan Zhang and Jonathon Hare and Adam Pr\u00fcgel-Bennett},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJMCcjAcYX},\n}", "github": "[![github](/images/github_icon.svg) Cyanogenoid/perm-optim](https://github.com/Cyanogenoid/perm-optim) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=HJMCcjAcYX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "3;6;6", "confidence": "2;4;4", "wc_review": "552;511;529", "wc_reply_reviewers": "426;209;51", "wc_reply_authors": "1357;1534;711", "reply_reviewers": "5;1;1", "reply_authors": "7;3;3", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 530.6666666666666, 16.779617264870957 ], "wc_reply_reviewers_avg": [ 228.66666666666666, 153.72341685276479 ], "wc_reply_authors_avg": [ 1200.6666666666667, 353.70640681534485 ], "reply_reviewers_avg": [ 2.3333333333333335, 1.8856180831641267 ], "reply_authors_avg": [ 4.333333333333333, 1.8856180831641267 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18380743779170392260&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=HJMCcjAcYX", "pdf": "https://openreview.net/pdf?id=HJMCcjAcYX", "email": ";;", "author_num": 3 }, { "id": "HJMCdsC5tX", "title": "A fully automated periodicity detection in time series", "track": "main", "status": "Reject", "tldr": "This paper presents a method to autonomously find multiple periodicities in a signal, using FFT and ACF and add three news steps (clustering/filtering/detrending)", "abstract": "This paper presents a method to autonomously find periodicities in a signal. It is based on the same idea of using Fourier Transform and autocorrelation function presented in Vlachos et al. 2005. While showing interesting results this method does not perform well on noisy signals or signals with multiple periodicities. Thus, our method adds several new extra steps (hints clustering, filtering and detrending) to fix these issues. Experimental results show that the proposed method outperforms the state of the art algorithms. 
", "keywords": "Time series;feature engineering;period detection;machine learning", "primary_area": "", "supplementary_material": "", "author": "Tom Puech;Matthieu Boussard", "authorids": "tom.puech@craft.ai;matthieu.boussard@craft.ai", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npuech2019a,\ntitle={A fully automated periodicity detection in time series},\nauthor={Tom Puech and Matthieu Boussard},\nyear={2019},\nurl={https://openreview.net/forum?id=HJMCdsC5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJMCdsC5tX", "pdf_size": 0, "rating": "3;3;5", "confidence": "2;3;2", "wc_review": "84;94;157", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 111.66666666666667, 32.31442746239244 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9223203625929846122&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Big-Little Net: An Efficient Multi-Scale Feature Representation for Visual and Speech Recognition", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/856", "id": "HJMHpjC9Ym", "author_site": "Chun-Fu (Richard) Chen, Quanfu Fan, Neil Mallinar, Tom Sercu, Rogerio Feris", "tldr": "", "abstract": "In this paper, we propose a novel Convolutional Neural Network (CNN) architecture for learning multi-scale feature representations with good tradeoffs between speed and accuracy. This is achieved by using a multi-branch network, which has different computational complexity at different branches with different resolutions. Through frequent merging of features from branches at distinct scales, our model obtains multi-scale features while using less computation. The proposed approach demonstrates improvement of model efficiency and performance on both object recognition and speech recognition tasks, using popular architectures including ResNet, ResNeXt and SEResNeXt. For object recognition, our approach reduces computation by 1/3 while improving accuracy significantly over 1% point than the baselines, and the computational savings can be higher up to 1/2 without compromising the accuracy. Our model also surpasses state-of-the-art CNN acceleration approaches by a large margin in terms of accuracy and FLOPs. 
On the task of speech recognition, our proposed multi-scale CNNs save 30% FLOPs with slightly better word error rates, showing good generalization across domains.", "keywords": "CNN;multi-scale;efficiency;object recognition;speech recognition", "primary_area": "", "supplementary_material": "", "author": "Chun-Fu (Richard) Chen;Quanfu Fan;Neil Mallinar;Tom Sercu;Rogerio Feris", "authorids": "chenrich@us.ibm.com;qfan@us.ibm.com;neil.r.mallinar@ibm.com;tom.sercu1@ibm.com;rsferis@us.ibm.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nchen2018biglittle,\ntitle={Big-Little Net: An Efficient Multi-Scale Feature Representation for Visual and Speech Recognition},\nauthor={Chun-Fu (Richard) Chen and Quanfu Fan and Neil Mallinar and Tom Sercu and Rogerio Feris},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJMHpjC9Ym},\n}", "github": "[![github](/images/github_icon.svg) IBM/BigLittleNet](https://github.com/IBM/BigLittleNet) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HJMHpjC9Ym)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;4", "wc_review": "222;76;326", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "242;54;98", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 208.0, 102.5410486910811 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 131.33333333333334, 80.28836915906452 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=555905086227832192&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=HJMHpjC9Ym", "pdf": "https://openreview.net/pdf?id=HJMHpjC9Ym", "email": ";;;;", "author_num": 5 }, { "id": "HJMINj05tQ", "title": "Nesterov's method is the discretization of a differential equation with Hessian damping", "track": "main", "status": "Withdraw", "tldr": "We derive Nesterov's method arises as a straightforward discretization of an ODE different from the one in Su-Boyd-Candes and prove acceleration the stochastic case", "abstract": "Su-Boyd-Candes (2014) made a connection between Nesterov's method and an ordinary differential equation (ODE). We show if a Hessian damping term is added to the ODE from Su-Boyd-Candes (2014), then Nesterov's method arises as a straightforward discretization of the modified ODE. Analogously, in the strongly convex case, a Hessian damping term is added to Polyak's ODE, which is then discretized to yield Nesterov's method for strongly convex functions. Despite the Hessian term, both second order ODEs can be represented as first order systems.\n\nEstablished Liapunov analysis is used to recover the accelerated rates of convergence in both continuous and discrete time. Moreover, the Liapunov analysis can be extended to the case of stochastic gradients which allows the full gradient case to be considered as a special case of the stochastic case. 
The result is a unified approach to convex acceleration in both continuous and discrete time and in both the stochastic and full gradient cases. \n", "keywords": "Nesterov's method;convex optimization;first-order methods;stochastic gradient descent;differential equations;Liapunov's method", "primary_area": "", "supplementary_material": "", "author": "Adam M. Oberman;Maxime Laborde", "authorids": "adam.oberman@mcgill.ca;maxime.laborde@mcgill.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJMINj05tQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;5", "wc_review": "283;252;174", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 236.33333333333334, 45.85726647859518 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:d2bsu__QirkJ:scholar.google.com/&scioq=Nesterov%27s+method+is+the+discretization+of+a+differential+equation+with+Hessian+damping&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJMRvsAcK7", "title": "Dynamic Pricing on E-commerce Platform with Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "This paper describes a methodology for pre-training, evaluating and online dynamic pricing on E-commerce platform using deep reinforcement learning.", "abstract": "In this paper we develop an approach based on deep reinforcement learning (DRL) to address dynamic pricing problem on E-commerce platform. We models real-world E-commerce dynamic pricing problem as Markov Decision Process. Environment state are defined with four groups of different business data. We make several main improvements on the state-of-the-art DRL-based dynamic pricing approaches: 1. We first extend the application of dynamic pricing to a continuous pricing action space. 2. We solve the unknown demand function problem by designing different reward functions. 3. The cold-start problem is addressed by introducing pre-training and evaluation using the historical sales data. Field experiments are designed and conducted on real-world E-commerce platform, pricing thousands of SKUs of products lasting for months. The experiment results shows that, on E-commerce platform, the difference of the revenue conversion rates (DRCR) is a more suitable reward function than the revenue only, which is different from the conclusion from previous researches. 
Meanwhile, the proposed continuous action model performs better than the discrete one.", "keywords": "reinforcement learning;dynamic pricing;e-commerce;revenue management;field experiment", "primary_area": "", "supplementary_material": "", "author": "Jiaxi Liu;Yidong Zhang;Xiaoqing Wang;Yuming Deng;Xingyu Wu;Miaolan Xie", "authorids": "galiliu.ljx@alibaba-inc.com;tanfu.zyd@alibaba-inc.com;robin.wxq@alibaba-inc.com;yuming.dym@alibaba-inc.com;zhuyang.wxy@alibaba-inc.com;miaolan.xml@alibaba-inc.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nliu2019dynamic,\ntitle={Dynamic Pricing on E-commerce Platform with Deep Reinforcement Learning},\nauthor={Jiaxi Liu and Yidong Zhang and Xiaoqing Wang and Yuming Deng and Xingyu Wu and Miaolan Xie},\nyear={2019},\nurl={https://openreview.net/forum?id=HJMRvsAcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJMRvsAcK7", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;5;4", "wc_review": "515;308;668", "wc_reply_reviewers": "9;0;0", "wc_reply_authors": "716;742;619", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 497.0, 147.51949023773096 ], "wc_reply_reviewers_avg": [ 3.0, 4.242640687119285 ], "wc_reply_authors_avg": [ 692.3333333333334, 52.92972279877872 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16480710405651723010&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HJMXTsCqYQ", "title": "Constrained Bayesian Optimization for Automatic Chemical Design", "track": "main", "status": "Reject", "tldr": "", "abstract": "Automatic Chemical Design provides a framework for generating novel molecules with optimized molecular properties. The current model suffers from the pathology that it tends to produce invalid molecular structures. By reformulating the search procedure as a constrained Bayesian optimization problem, we showcase improvements in both the validity and quality of the generated molecules. We demonstrate that the model consistently produces novel molecules ranking above the 90th percentile of the distribution over training set scores across a range of objective functions. 
Importantly, our method suffers no degradation in the complexity or the diversity of the generated molecules.", "keywords": "Bayesian Optimization;Generative Models", "primary_area": "", "supplementary_material": "", "author": "Ryan-Rhys Griffiths;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", "authorids": "rrg27@cam.ac.uk;jmh233@cam.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngriffiths2019constrained,\ntitle={Constrained Bayesian Optimization for Automatic Chemical Design},\nauthor={Ryan-Rhys Griffiths and Jos\u00e9 Miguel Hern\u00e1ndez-Lobato},\nyear={2019},\nurl={https://openreview.net/forum?id=HJMXTsCqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJMXTsCqYQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;4", "wc_review": "332;231;435", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 332.6666666666667, 83.28398538868215 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3262310868406021998&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "HJMXus0ct7", "title": "iRDA Method for Sparse Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "A sparse optimization algorithm for deep CNN models.", "abstract": "We propose a new approach, known as the iterative regularized dual averaging (iRDA), to improve the efficiency of convolutional neural networks (CNN) by significantly reducing the redundancy of the model without reducing its accuracy. The method has been tested for various data sets, and proven to be significantly more efficient than most existing compressing techniques in the deep learning literature. For many popular data sets such as MNIST and CIFAR-10, more than 95% of the weights can be zeroed out without losing accuracy. 
In particular, we are able to make ResNet18 with 95% sparsity to have an accuracy that is comparable to that of a much larger model ResNet50 with the best 60% sparsity as reported in the literature.", "keywords": "sparse convolutional neural networks;regularized dual averaging", "primary_area": "", "supplementary_material": "", "author": "Xiaodong Jia;Liang Zhao;Lian Zhang;Juncai He;Jinchao Xu", "authorids": "jiaxiaodong1994@gmail.com;zhaoliang14@lsec.cc.ac.cn;lzhangay@ust.hk;juncaihe@pku.edu.cn;xu@math.psu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\njia2019irda,\ntitle={i{RDA} Method for Sparse Convolutional Neural Networks},\nauthor={Xiaodong Jia and Liang Zhao and Lian Zhang and Juncai He and Jinchao Xu},\nyear={2019},\nurl={https://openreview.net/forum?id=HJMXus0ct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJMXus0ct7", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;5", "wc_review": "35;200;231", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 155.33333333333334, 86.02454430115989 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CpDKhLHE-toJ:scholar.google.com/&scioq=iRDA+Method+for+Sparse+Convolutional+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJMghjA9YX", "title": "Model Comparison for Semantic Grouping", "track": "main", "status": "Reject", "tldr": "Competitive alternative to sentence embeddings in the task of semantic similarity using model comparison", "abstract": "We introduce a probabilistic framework for quantifying the semantic similarity between two groups of embeddings. We formulate the task of semantic similarity as a model comparison task in which we contrast a generative model which jointly models two sentences versus one that does not. We illustrate how this framework can be used for the Semantic Textual Similarity tasks using clear assumptions about how the embeddings of words are generated. We apply information criteria based model comparison to overcome the shortcomings of Bayesian model comparison, whilst still penalising model complexity. 
We achieve competitive results by applying the proposed framework with an appropriate choice of likelihood on the STS datasets.", "keywords": "model comparison;semantic similarity;STS;von Mises-Fisher;Information Theoretic Criteria", "primary_area": "", "supplementary_material": "", "author": "Francisco Vargas;Kamen Brestnichki;Nils Hammerla", "authorids": "francisco.vargas@babylonhealth.com;kamen.brestnichki@babylonhealth.com;nils.hammerla@babylonhealth.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nvargas2019model,\ntitle={Model Comparison for Semantic Grouping},\nauthor={Francisco Vargas and Kamen Brestnichki and Nils Hammerla},\nyear={2019},\nurl={https://openreview.net/forum?id=HJMghjA9YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJMghjA9YX", "pdf_size": 0, "rating": "5;5;5", "confidence": "1;3;3", "wc_review": "285;392;199", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "689;576;1122", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 2.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 292.0, 78.94723976597705 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 795.6666666666666, 235.31869642867073 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18345833118099808380&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "HJMjW3RqtX", "title": "One-Shot High-Fidelity Imitation: Training Large-Scale Deep Nets with RL", "track": "main", "status": "Reject", "tldr": "We present MetaMimic, an algorithm that takes as input a demonstration dataset and outputs (i) a one-shot high-fidelity imitation policy (ii) an unconditional task policy.", "abstract": "Humans are experts at high-fidelity imitation -- closely mimicking a demonstration, often in one attempt. Humans use this ability to quickly solve a task instance, and to bootstrap learning of new tasks. Achieving these abilities in autonomous agents is an open problem. In this paper, we introduce an off-policy RL algorithm (MetaMimic) to narrow this gap. MetaMimic can learn both (i) policies for high-fidelity one-shot imitation of diverse novel skills, and (ii) policies that enable the agent to solve tasks more efficiently than the demonstrators. MetaMimic relies on the principle of storing all experiences in a memory and replaying these to learn massive deep neural network policies by off-policy RL. This paper introduces, to the best of our knowledge, the largest existing neural networks for deep RL and shows that larger networks with normalization are needed to achieve one-shot high-fidelity imitation on a challenging manipulation task.\nThe results also show that both types of policy can be learned from vision, in spite of the task rewards being sparse, and without access to demonstrator actions. 
", "keywords": "Imitation Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Tom Le Paine;Sergio Gomez;Ziyu Wang;Scott Reed;Yusuf Aytar;Tobias Pfaff;Matt Hoffman;Gabriel Barth-Maron;Serkan Cabi;David Budden;Nando de Freitas", "authorids": "tpaine@google.com;sergomez@google.com;ziyu@google.com;reedscot@google.com;yusufaytar@google.com;tpfaff@google.com;mwhoffman@google.com;gabrielbm@google.com;cabi@google.com;budden@google.com;nandodefreitas@google.com", "gender": ";;;;;;;;;;", "homepage": ";;;;;;;;;;", "dblp": ";;;;;;;;;;", "google_scholar": ";;;;;;;;;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": ";;;;;;;;;;", "aff": ";;;;;;;;;;", "aff_domain": ";;;;;;;;;;", "position": ";;;;;;;;;;", "bibtex": "@misc{\npaine2019oneshot,\ntitle={One-Shot High-Fidelity Imitation: Training Large-Scale Deep Nets with {RL}},\nauthor={Tom Le Paine and Sergio Gomez and Ziyu Wang and Scott Reed and Yusuf Aytar and Tobias Pfaff and Matt Hoffman and Gabriel Barth-Maron and Serkan Cabi and David Budden and Nando de Freitas},\nyear={2019},\nurl={https://openreview.net/forum?id=HJMjW3RqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJMjW3RqtX", "pdf_size": 0, "rating": "4;4;5;5", "confidence": "4;4;3;3", "wc_review": "300;827;887;380", "wc_reply_reviewers": "0;0;154;0", "wc_reply_authors": "673;2746;1398;773", "reply_reviewers": "0;0;1;0", "reply_authors": "1;4;2;1", "rating_avg": [ 4.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "wc_review_avg": [ 598.5, 260.90659248091066 ], "wc_reply_reviewers_avg": [ 38.5, 66.68395609140178 ], "wc_reply_authors_avg": [ 1397.5, 826.64275839083 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 11, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16129560776930901154&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HJMsiiRctX", "title": "Probabilistic Program Induction for Intuitive Physics Game Play", "track": "main", "status": "Reject", "tldr": "The paper describes a method imitating human cognition about the physical world to play games in environments of physical interactions.", "abstract": "Recent findings suggest that humans deploy cognitive mechanism of physics simulation engines to simulate the physics of objects. We propose a framework for bots to deploy similar tools for interacting with intuitive physics environments. The framework employs a physics simulation in a probabilistic way to infer about moves performed by an agent in a setting governed by Newtonian laws of motion. However, methods of probabilistic programs can be slow in such setting due to their need to generate many samples. We complement the model with a model-free approach to aid the sampling procedures in becoming more efficient through learning from experience during game playing. We present an approach where a myriad of model-free approaches (a convolutional neural network in our model) and model-based approaches (probabilistic physics simulation) is able to achieve what neither could alone. This way the model outperforms an all model-free or all model-based approach. We discuss a case study showing empirical results of the performance of the model on the game of Flappy Bird. 
", "keywords": "intuitive physics;probabilistic programming;computational cognitive science;probabilistic models", "primary_area": "", "supplementary_material": "", "author": "Fahad Alhasoun", "authorids": "fha@mit.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nalhasoun2019probabilistic,\ntitle={Probabilistic Program Induction for Intuitive Physics Game Play},\nauthor={Fahad Alhasoun},\nyear={2019},\nurl={https://openreview.net/forum?id=HJMsiiRctX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJMsiiRctX", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;2", "wc_review": "182;442;657", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 427.0, 194.20779249728025 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MCfrgWpOimIJ:scholar.google.com/&scioq=Probabilistic+Program+Induction+for+Intuitive+Physics+Game+Play&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HJN6DiAcKQ", "title": "Engaging Image Captioning Via Personality", "track": "main", "status": "Withdraw", "tldr": "We develop engaging image captioning models conditioned on personality that are also state of the art on regular captioning tasks.", "abstract": "Standard image captioning tasks such as COCO and Flickr30k are factual, neutral in tone and (to a human) state the obvious (e.g., \u201ca man playing a guitar\u201d). While such tasks are useful to verify that a machine understands the content of an image, they are not engaging to humans as captions. With this in mind we define a new task, Personality-Captions, where the goal is to be as engaging to humans as possible by incorporating controllable style and personality traits.We collect and release a large dataset of 201,858 of such captions conditioned over 215 possible traits. We build models that combine existing work from (i) sentence representations (Mazar\u00e9 et al., 2018) with Transformers trained on 1.7 billion dialogue examples; and (ii) image representations (Mahajan et al., 2018) with ResNets trained on 3.5 billion social media images. We obtain state-of-the-art performance on Flickr30k and COCO, and strong performance on our new task. 
Finally, online evaluations validate that our task and models are engaging to humans, with our best model close to human performance.", "keywords": "image;captioning;captions;vision;language", "primary_area": "", "supplementary_material": "", "author": "Kurt Shuster;Samuel Humeau;Hexiang Hu;Antoine Bordes;Jason Weston", "authorids": "kshuster@fb.com;samuelhumeau@fb.com;hexianghu@fb.com;abordes@fb.com;jaseweston@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJN6DiAcKQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;5;5", "wc_review": "412;358;586", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 452.0, 97.28309205612247 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 200, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1421029866164165717&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HJNJws0cF7", "title": "Convolutional Neural Networks combined with Runge-Kutta Methods", "track": "main", "status": "Reject", "tldr": "", "abstract": "A convolutional neural network for image classification can be constructed mathematically since it can be regarded as a multi-period dynamical system. In this paper, a novel approach is proposed to construct network models from the dynamical systems view. Since a pre-activation residual network can be deemed an approximation of a time-dependent dynamical system using the forward Euler method, higher order Runge-Kutta methods (RK methods) can be utilized to build network models in order to achieve higher accuracy. The model constructed in such a way is referred to as the Runge-Kutta Convolutional Neural Network (RKNet). RK methods also provide an interpretation of Dense Convolutional Networks (DenseNets) and Convolutional Neural Networks with Alternately Updated Clique (CliqueNets) from the dynamical systems view. The proposed methods are evaluated on benchmark datasets: CIFAR-10/100, SVHN and ImageNet. The experimental results are consistent with the theoretical properties of RK methods and support the dynamical systems interpretation. 
Moreover, the experimental results show that the RKNets are superior to the state-of-the-art network models on CIFAR-10 and on par on CIFAR-100, SVHN and ImageNet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mai Zhu;Bo Chang;Chong Fu", "authorids": "zhumai@stumail.neu.edu.cn;bchang@stat.ubc.ca;fuchong@mail.neu.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhu2019convolutional,\ntitle={Convolutional Neural Networks combined with Runge-Kutta Methods},\nauthor={Mai Zhu and Bo Chang and Chong Fu},\nyear={2019},\nurl={https://openreview.net/forum?id=HJNJws0cF7},\n}", "github": "[![github](/images/github_icon.svg) ZhuMai/RKCNN](https://github.com/ZhuMai/RKCNN)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJNJws0cF7", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;3", "wc_review": "133;305;250", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "82;265;370", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 229.33333333333334, 71.72323348972927 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 239.0, 119.00420160649792 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7308518194485985884&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7 }, { "id": "HJe3TsR5K7", "title": "Learning Joint Wasserstein Auto-Encoders for Joint Distribution Matching", "track": "main", "status": "Reject", "tldr": "We propose a novel Joint Wasserstein Auto-Encoders (JWAE) for Joint Distribution Matching problem, and apply it to image-to-image translation and video-to-video synthesis tasks.", "abstract": "We study the joint distribution matching problem which aims at learning bidirectional mappings to match the joint distribution of two domains. This problem occurs in unsupervised image-to-image translation and video-to-video synthesis tasks, which, however, has two critical challenges: (i) it is difficult to exploit sufficient information from the joint distribution; (ii) how to theoretically and experimentally evaluate the generalization performance remains an open question. To address the above challenges, we propose a new optimization problem and design a novel Joint Wasserstein Auto-Encoders (JWAE) to minimize the Wasserstein distance of the joint distributions in two domains. We theoretically prove that the generalization ability of the proposed method can be guaranteed by minimizing the Wasserstein distance of joint distributions. To verify the generalization ability, we apply our method to unsupervised video-to-video synthesis by performing video frame interpolation and producing visually smooth videos in two domains, simultaneously. 
Both qualitative and quantitative comparisons demonstrate the superiority of our method over several state-of-the-arts.", "keywords": "joint distribution matching;image-to-image translation;video-to-video synthesis;Wasserstein distance", "primary_area": "", "supplementary_material": "", "author": "Jiezhang Cao;Yong Guo;Langyuan Mo;Peilin Zhao;Junzhou Huang;Mingkui Tan", "authorids": "secaojiezhang@mail.scut.edu.cn;guoyongcs@gmail.com;selangyuanmo@mail.scut.edu.cn;peilinzhao@hotmail.com;jzhuang@uta.edu;mingkuitan@scut.edu.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ncao2019learning,\ntitle={Learning Joint Wasserstein Auto-Encoders for Joint Distribution Matching},\nauthor={Jiezhang Cao and Yong Guo and Langyuan Mo and Peilin Zhao and Junzhou Huang and Mingkui Tan},\nyear={2019},\nurl={https://openreview.net/forum?id=HJe3TsR5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJe3TsR5K7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "wc_review": "218;252;606", "wc_reply_reviewers": "0;0;6", "wc_reply_authors": "428;287;317", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 358.6666666666667, 175.44103156204807 ], "wc_reply_reviewers_avg": [ 2.0, 2.8284271247461903 ], "wc_reply_authors_avg": [ 344.0, 60.64651680022522 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Bb_ysVsq_V4J:scholar.google.com/&scioq=Learning+Joint+Wasserstein+Auto-Encoders+for+Joint+Distribution+Matching&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Unsupervised Hyper-alignment for Multilingual Word Embeddings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/999", "id": "HJe62s09tX", "author_site": "Jean Alaux-Lorain, Edouard Grave, marco cuturi, Armand Joulin", "tldr": "", "abstract": "We consider the problem of aligning continuous word representations, learned in multiple languages, to a common space. It was recently shown that, in the case of two languages, it is possible to learn such a mapping without supervision. This paper extends this line of work to the problem of aligning multiple languages to a common space. A solution is to independently map all languages to a pivot language. Unfortunately, this degrades the quality of indirect word translation. We thus propose a novel formulation that ensures composable mappings, leading to better alignments. 
We evaluate our method by jointly aligning word vectors in eleven languages, showing consistent improvement with indirect mappings while maintaining competitive performance on direct word translation.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jean Alaux;Edouard Grave;Marco Cuturi;Armand Joulin", "authorids": "jean.alaux--lorain@ens.fr;egrave@fb.com;marco.cuturi.cameto@gmail.com;ajoulin@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nalaux2018unsupervised,\ntitle={Unsupervised Hyper-alignment for Multilingual Word Embeddings},\nauthor={Jean Alaux and Edouard Grave and Marco Cuturi and Armand Joulin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJe62s09tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;4;3", "wc_review": "387;1154;189", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 576.6666666666666, 416.16209448830017 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9547479920673238095&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HJe62s09tX", "pdf": "https://openreview.net/pdf?id=HJe62s09tX", "email": ";;;", "author_num": 4 }, { "id": "HJeABnCqKQ", "title": "Generative Adversarial Self-Imitation Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper explores a simple regularizer for reinforcement learning by proposing Generative Adversarial Self-Imitation Learning (GASIL), which encourages the agent to imitate past good trajectories via generative adversarial imitation learning framework. Instead of directly maximizing rewards, GASIL focuses on reproducing past good trajectories, which can potentially make long-term credit assignment easier when rewards are sparse and delayed. GASIL can be easily combined with any policy gradient objective by using GASIL as a learned reward shaping function. 
Our experimental results show that GASIL improves the performance of proximal policy optimization on 2D Point Mass and MuJoCo environments with delayed reward and stochastic dynamics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Junhyuk Oh;Yijie Guo;Satinder Singh;Honglak Lee", "authorids": "junhyuk@umich.edu;guoyijie@umich.edu;baveja@umich.edu;honglak@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\noh2019generative,\ntitle={Generative Adversarial Self-Imitation Learning},\nauthor={Junhyuk Oh and Yijie Guo and Satinder Singh and Honglak Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeABnCqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJeABnCqKQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;5", "wc_review": "221;505;279", "wc_reply_reviewers": "38;68;5", "wc_reply_authors": "128;164;155", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 335.0, 122.51802588462918 ], "wc_reply_reviewers_avg": [ 37.0, 25.729360660537214 ], "wc_reply_authors_avg": [ 149.0, 15.297058540778355 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2461932419210740155&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HJeB0sC9Fm", "title": "Detecting Memorization in ReLU Networks", "track": "main", "status": "Reject", "tldr": "We use the non-negative rank of ReLU activation matrices as a complexity measure and show it (negatively) correlates with good generalization.", "abstract": "We propose a new notion of 'non-linearity' of a network layer with respect to an input batch that is based on its proximity to a linear system, which is reflected in the non-negative rank of the activation matrix.\nWe measure this non-linearity by applying non-negative factorization to the activation matrix.\nConsidering batches of similar samples, we find that high non-linearity in deep layers is indicative of memorization. Furthermore, by applying our approach layer-by-layer, we find that the mechanism for memorization consists of distinct phases. We perform experiments on fully-connected and convolutional neural networks trained on several image and audio datasets. 
Our results demonstrate that as an indicator for memorization, our technique can be used to perform early stopping.", "keywords": "Memorization;Generalization;ReLU;Non-negative matrix factorization", "primary_area": "", "supplementary_material": "", "author": "Edo Collins;Siavash Arjomand Bigdeli;Sabine S\u00fcsstrunk", "authorids": "edo.collins@epfl.ch;siavash.bigdeli@epfl.ch;sabine.susstrunk@epfl.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncollins2019detecting,\ntitle={Detecting Memorization in Re{LU} Networks},\nauthor={Edo Collins and Siavash Arjomand Bigdeli and Sabine S\u00fcsstrunk},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeB0sC9Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJeB0sC9Fm", "pdf_size": 0, "rating": "5;6;9", "confidence": "4;4;5", "wc_review": "792;790;322", "wc_reply_reviewers": "712;156;0", "wc_reply_authors": "1660;952;162", "reply_reviewers": "2;1;0", "reply_authors": "3;3;1", "rating_avg": [ 6.666666666666667, 1.699673171197595 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 634.6666666666666, 221.09022793621816 ], "wc_reply_reviewers_avg": [ 289.3333333333333, 305.58068583527256 ], "wc_reply_authors_avg": [ 924.6666666666666, 611.8612769429212 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9707253433941508, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11657214855810265822&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HJeEWnR9F7", "title": "Scaling up Deep Learning for PDE-based Models", "track": "main", "status": "Withdraw", "tldr": "We present RNNs for training surrogate models of PDEs, wherein consistency constraints ensure the solutions are physically meaningful, even when the training uses much smaller domains than the trained model is applied to.", "abstract": "Across numerous applications, forecasting relies on numerical solvers for partial differential equations (PDEs). Although the use of deep-learning techniques has been proposed, the uses have been restricted by the fact the training data are obtained using PDE solvers. Thereby, the uses were limited to domains, where the PDE solver was applicable, but no further. \n\nWe present methods for training on small domains, while applying the trained models on larger domains, with consistency constraints ensuring the solutions are physically meaningful even at the boundary of the small domains. 
We demonstrate the results on an air-pollution forecasting model for Dublin, Ireland.", "keywords": "recurrent neural networks;partial differential equation;domain decomposition;consistency constraints;advection;diffusion", "primary_area": "", "supplementary_material": "", "author": "Philipp Haehnel;Julien Monteil;Jakub Marecek;Fearghal O'Donncha", "authorids": "haehnel@maths.tcd.ie;julien.monteil@ie.ibm.com;jakub.marecek@ie.ibm.com;feardonn@ie.ibm.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HJeEWnR9F7", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9310012868052068997&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJeKCi0qYX", "title": "MILE: A Multi-Level Framework for Scalable Graph Embedding", "track": "main", "status": "Reject", "tldr": "A generic framework to scale existing graph embedding techniques to large graphs.", "abstract": "Recently there has been a surge of interest in designing graph embedding methods. Few, if any, can scale to a large-sized graph with millions of nodes due to both computational complexity and memory requirements. In this paper, we relax this limitation by introducing the MultI-Level Embedding (MILE) framework \u2013 a generic methodology allowing contemporary graph embedding methods to scale to large graphs. MILE repeatedly coarsens the graph into smaller ones using a hybrid matching technique to maintain the backbone structure of the graph. It then applies existing embedding methods on the coarsest graph and refines the embeddings to the original graph through a novel graph convolution neural network that it learns. The proposed MILE framework is agnostic to the underlying graph embedding techniques and can be applied to many existing graph embedding methods without modifying them. We employ our framework on several popular graph embedding techniques and conduct embedding for real-world graphs. Experimental results on five large-scale datasets demonstrate that MILE significantly boosts the speed (order of magnitude) of graph embedding while also often generating embeddings of better quality for the task of node classification. 
MILE can comfortably scale to a graph with 9 million nodes and 40 million edges, on which existing methods run out of memory or take too long to compute on a modern workstation.", "keywords": "Network Embedding;Graph Convolutional Networks;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Jiongqian Liang;Saket Gurukar;Srinivasan Parthasarathy", "authorids": "liang.albert@outlook.com;gurukar.1@osu.edu;srini@cse.ohio-state.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nliang2019mile,\ntitle={{MILE}: A Multi-Level Framework for Scalable Graph Embedding},\nauthor={Jiongqian Liang and Saket Gurukar and Srinivasan Parthasarathy},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeKCi0qYX},\n}", "github": "[![github](/images/github_icon.svg) jiongqian/MILE](https://github.com/jiongqian/MILE)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJeKCi0qYX", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;5;3", "wc_review": "507;316;188", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1387;689;499", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 337.0, 131.07504211964485 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 858.3333333333334, 381.7864440873836 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.3273268353539886, "gs_citation": 124, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=183387025337298186&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "HJeNIjA5Y7", "title": "Image Score: how to select useful samples", "track": "main", "status": "Reject", "tldr": "", "abstract": "There has long been debates on how we could interpret neural networks and understand the decisions our models make. Specifically, why deep neural networks tend to be error-prone when dealing with samples that output low softmax scores. We present an efficient approach to measure the confidence of decision-making steps by statistically investigating each unit's contribution to that decision. Instead of focusing on how the models react on datasets, we study the datasets themselves given a pre-trained model. Our approach is capable of assigning a score to each sample within a dataset that measures the frequency of occurrence of that sample's chain of activation. 
We demonstrate with experiments that our method could select useful samples to improve deep neural networks in a semi-supervised leaning setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Simiao Zuo;Jialin Wu", "authorids": "zsmx1996@utexas.edu;jialinwu@utexas.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzuo2019image,\ntitle={Image Score: how to select useful samples},\nauthor={Simiao Zuo and Jialin Wu},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeNIjA5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJeNIjA5Y7", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;4;3", "wc_review": "544;319;131", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 331.3333333333333, 168.8319348411971 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JLpiSVBp2ecJ:scholar.google.com/&scioq=Image+Score:+how+to+select+useful+samples&hl=en&as_sdt=0,33", "gs_version_total": 3 }, { "id": "HJeOMhA5K7", "title": "Human-Guided Column Networks: Augmenting Deep Learning with Advice", "track": "main", "status": "Reject", "tldr": "Guiding relation-aware deep models towards better learning with human knowledge.", "abstract": "While extremely successful in several applications, especially with low-level representations; sparse, noisy samples and structured domains (with multiple objects and interactions) are some of the open challenges in most deep models. Column Networks, a deep architecture, can succinctly capture such domain structure and interactions, but may still be prone to sub-optimal learning from sparse and noisy samples. Inspired by the success of human-advice guided learning in AI, especially in data-scarce domains, we propose Knowledge-augmented Column Networks that leverage human advice/knowledge for better learning with noisy/sparse samples. 
Our experiments demonstrate how our approach leads to either superior overall performance or faster convergence.", "keywords": "Knowledge-guided learning;Human advice;Column Networks;Knowledge-based relational deep model;Collective classification", "primary_area": "", "supplementary_material": "", "author": "Mayukh Das;Yang Yu;Devendra Singh Dhami;Gautam Kunapuli;Sriraam Natarajan", "authorids": "mayukh.das1@utdallas.edu;yangyu@hlt.utdallas.edu;devendra.dhami@utdallas.edu;gautam.kunapuli@utdallas.edu;sriraam.natarajan@utdallas.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ndas2019humanguided,\ntitle={Human-Guided Column Networks: Augmenting Deep Learning with Advice},\nauthor={Mayukh Das and Yang Yu and Devendra Singh Dhami and Gautam Kunapuli and Sriraam Natarajan},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeOMhA5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJeOMhA5K7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;3", "wc_review": "239;362;84", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "390;215;84", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 228.33333333333334, 113.74337587550123 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 229.66666666666666, 125.35372174592806 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Kj8Lj8f0eDsJ:scholar.google.com/&scioq=Human-Guided+Column+Networks:+Augmenting+Deep+Learning+with+Advice&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HJePRoAct7", "title": "Graph U-Net", "track": "main", "status": "Reject", "tldr": "We propose the graph U-Net based on our novel graph pooling and unpooling layer for network embedding.", "abstract": "We consider the problem of representation learning for graph data. Convolutional neural networks can naturally operate on images, but have significant challenges in dealing with graph data. Given images are special cases of graphs with nodes lie on 2D lattices, graph embedding tasks have a natural correspondence with image pixel-wise prediction tasks such as segmentation. While encoder-decoder architectures like U-Net have been successfully applied on many image pixel-wise prediction tasks, similar methods are lacking for graph data. This is due to the fact that pooling and up-sampling operations are not natural on graph data. To address these challenges, we propose novel graph pooling (gPool) and unpooling (gUnpool) operations in this work. The gPool layer adaptively selects some nodes to form a smaller graph based on their scalar projection values on a trainable projection vector. We further propose the gUnpool layer as the inverse operation of the gPool layer. The gUnpool layer restores the graph into its original structure using the position information of nodes selected in the corresponding gPool layer. Based on our proposed gPool and gUnpool layers, we develop an encoder-decoder model on graph, known as the graph U-Net. 
Our experimental results on node classification tasks demonstrate that our methods achieve consistently better performance than previous models.", "keywords": "graph;pooling;unpooling;U-Net", "primary_area": "", "supplementary_material": "", "author": "Hongyang Gao;Shuiwang Ji", "authorids": "hongyang.gao@tamu.edu;sji@tamu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngao2019graph,\ntitle={Graph U-Net},\nauthor={Hongyang Gao and Shuiwang Ji},\nyear={2019},\nurl={https://openreview.net/forum?id=HJePRoAct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJePRoAct7", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;4;5", "wc_review": "591;230;600", "wc_reply_reviewers": "0;11;53", "wc_reply_authors": "1019;578;636", "reply_reviewers": "0;1;1", "reply_authors": "2;1;2", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 473.6666666666667, 172.33752412699394 ], "wc_reply_reviewers_avg": [ 21.333333333333332, 22.83759084394752 ], "wc_reply_authors_avg": [ 744.3333333333334, 195.65672887880845 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 1561, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2250116536319373587&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 18 }, { "id": "HJePno0cYm", "title": "Transformer-XL: Language Modeling with Longer-Term Dependency", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a novel neural architecture, Transformer-XL, for modeling longer-term dependency. To address the limitation of fixed-length contexts, we introduce a notion of recurrence by reusing the representations from the history. Empirically, we show state-of-the-art (SoTA) results on both word-level and character-level language modeling datasets, including WikiText-103, One Billion Word, Penn Treebank, and enwiki8. Notably, we improve the SoTA results from 1.06 to 0.99 in bpc on enwiki8, from 33.0 to 18.9 in perplexity on WikiText-103, and from 28.0 to 23.5 in perplexity on One Billion Word. Performance improves when the attention length increases during evaluation, and our best model attends to up to 1,600 words and 3,800 characters. To quantify the effective length of dependency, we devise a new metric and show that on WikiText-103 Transformer-XL manages to model dependency that is about 80% longer than recurrent networks and 450% longer than Transformer. Moreover, Transformer-XL is up to 1,800+ times faster than vanilla Transformer during evaluation.", "keywords": "Language Modeling;Self-Attention", "primary_area": "", "supplementary_material": "", "author": "Zihang Dai*;Zhilin Yang*;Yiming Yang;William W. Cohen;Jaime Carbonell;Quoc V. 
Le;Ruslan Salakhutdinov", "authorids": "zander.dai@gmail.com;zhiliny@cs.cmu.edu;yiming@cs.cmu.edu;wcohen@google.com;jgc@cs.cmu.edu;qvl@google.com;rsalakhu@cs.cmu.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\ndai*2019transformerxl,\ntitle={Transformer-{XL}: Language Modeling with Longer-Term Dependency},\nauthor={Zihang Dai* and Zhilin Yang* and Yiming Yang and William W. Cohen and Jaime Carbonell and Quoc V. Le and Ruslan Salakhutdinov},\nyear={2019},\nurl={https://openreview.net/forum?id=HJePno0cYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJePno0cYm", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "wc_review": "152;339;128", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "154;259;80", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 206.33333333333334, 94.31978701323611 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 164.33333333333334, 73.44083030273796 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8644497229750296718&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HJePy3RcF7", "title": "Rethinking learning rate schedules for stochastic optimization", "track": "main", "status": "Reject", "tldr": "This paper presents a rigorous study of why practically used learning rate schedules (for a given computational budget) offer significant advantages even though these schemes are not advocated by the classical theory of Stochastic Approximation.", "abstract": "There is a stark disparity between the learning rate schedules used in the practice of large scale machine learning and what are considered admissible learning rate schedules prescribed in the theory of stochastic approximation. Recent results, such as in the 'super-convergence' methods which use oscillating learning rates, serve to emphasize this point even more.\nOne plausible explanation is that non-convex neural network training procedures are better suited to the use of fundamentally different learning rate schedules, such as the ``cut the learning rate every constant number of epochs'' method (which more closely resembles an exponentially decaying learning rate schedule); note that this widely used schedule is in stark contrast to the polynomial decay schemes prescribed in the stochastic approximation literature, which are indeed shown to be (worst case) optimal for classes of convex optimization problems.\n\nThe main contribution of this work shows that the picture is far more nuanced, where we do not even need to move to non-convex optimization to show other learning rate schemes can be far more effective. 
In fact, even for the simple case of stochastic linear regression with a fixed time horizon, the rate achieved by any polynomial decay scheme is sub-optimal compared to the statistical minimax rate (by a factor of condition number); in contrast the ```''cut the learning rate every constant number of epochs'' provides an exponential improvement (depending only logarithmically on the condition number) compared to any polynomial decay scheme. Finally, it is important to ask if our theoretical insights are somehow fundamentally tied to quadratic loss minimization (where we have circumvented minimax lower bounds for more general convex optimization problems)? Here, we conjecture that recent results which make the gradient norm small at a near optimal rate, for both convex and non-convex optimization, may also provide more insights into learning rate schedules used in practice.\n", "keywords": "SGD;learning rate;step size schedules;stochastic approximation;stochastic optimization;deep learning;non-convex optimization;stochastic gradient descent", "primary_area": "", "supplementary_material": "", "author": "Rong Ge;Sham M. Kakade;Rahul Kidambi;Praneeth Netrapalli", "authorids": "rongge@cs.duke.edu;sham@cs.washington.edu;rkidambi@uw.edu;praneeth@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nge2019rethinking,\ntitle={Rethinking learning rate schedules for stochastic optimization},\nauthor={Rong Ge and Sham M. Kakade and Rahul Kidambi and Praneeth Netrapalli},\nyear={2019},\nurl={https://openreview.net/forum?id=HJePy3RcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=HJePy3RcF7", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "wc_review": "1231;265;685", "wc_reply_reviewers": "135;0;10", "wc_reply_authors": "1152;440;325", "reply_reviewers": "1;0;1", "reply_authors": "3;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 727.0, 395.48451297111495 ], "wc_reply_reviewers_avg": [ 48.333333333333336, 61.41841924229426 ], "wc_reply_authors_avg": [ 639.0, 365.77133111640484 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5104334016414995189&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJeQToAqKQ", "title": "TherML: The Thermodynamics of Machine Learning", "track": "main", "status": "Reject", "tldr": "We offer a framework for representation learning that connects with a wide class of existing objectives and is analogous to thermodynamics.", "abstract": "In this work we offer an information-theoretic framework for representation learning that connects with a wide class of existing objectives in machine learning. We develop a formal correspondence between this work and thermodynamics and discuss its implications.", "keywords": "representation learning;information theory;information bottleneck;thermodynamics;predictive information", "primary_area": "", "supplementary_material": "", "author": "Alexander A. 
Alemi;Ian Fischer", "authorids": "alemi@google.com;iansf@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nalemi2019therml,\ntitle={Ther{ML}: The Thermodynamics of Machine Learning},\nauthor={Alexander A. Alemi and Ian Fischer},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeQToAqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJeQToAqKQ", "pdf_size": 0, "rating": "3;5;7", "confidence": "4;3;3", "wc_review": "701;217;410", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "311;77;111", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.632993161855452 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 442.6666666666667, 198.93773453576426 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 166.33333333333334, 103.23220858282986 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17225156190166198331&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "HJeQbnA5tm", "title": "Noisy Information Bottlenecks for Generalization", "track": "main", "status": "Reject", "tldr": "We limit mutual information between parameters and data using noise to improve generalization in deep models.", "abstract": "We propose Noisy Information Bottlenecks (NIB) to limit mutual information between learned parameters and the data through noise. We show why this benefits generalization and allows mitigation of model overfitting both for supervised and unsupervised learning, even for arbitrarily complex architectures. 
We reinterpret methods including the Variational Autoencoder, beta-VAE, network weight uncertainty and a variant of dropout combined with weight decay as special cases of our approach, explaining and quantifying regularizing properties and vulnerabilities within information theory.", "keywords": "information theory;deep learning;generalization;information bottleneck;variational inference;approximate inference", "primary_area": "", "supplementary_material": "", "author": "Julius Kunze;Louis Kirsch;Hippolyt Ritter;David Barber", "authorids": "juliuskunze@gmail.com;mail@louiskirsch.com;j.ritter@cs.ucl.ac.uk;d.barber@cs.ucl.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkunze2019noisy,\ntitle={Noisy Information Bottlenecks for Generalization},\nauthor={Julius Kunze and Louis Kirsch and Hippolyt Ritter and David Barber},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeQbnA5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJeQbnA5tm", "pdf_size": 0, "rating": "3;5;7", "confidence": "4;3;2", "wc_review": "236;326;144", "wc_reply_reviewers": "283;0;0", "wc_reply_authors": "1091;671;580", "reply_reviewers": "2;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.0, 1.632993161855452 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 235.33333333333334, 74.3026842643581 ], "wc_reply_reviewers_avg": [ 94.33333333333333, 133.40747938386198 ], "wc_reply_authors_avg": [ 780.6666666666666, 222.56135234032786 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:umX7bZO-2QYJ:scholar.google.com/&scioq=Noisy+Information+Bottlenecks+for+Generalization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Visual Semantic Navigation using Scene Priors", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/820", "id": "HJeRkh05Km", "author_site": "Wei Yang, Xiaolong Wang, Ali Farhadi, Abhinav Gupta, Roozbeh Mottaghi", "tldr": "", "abstract": "How do humans navigate to target objects in novel scenes? Do we use the semantic/functional priors we have built over years to efficiently search and navigate? For example, to search for mugs, we search cabinets near the coffee machine and for fruits we try the fridge. In this work, we focus on incorporating semantic priors in the task of semantic navigation. We propose to use Graph Convolutional Networks for incorporating the prior knowledge into a deep reinforcement learning framework. The agent uses the features from the knowledge graph to predict the actions. For evaluation, we use the AI2-THOR framework. Our experiments show how semantic knowledge improves the performance significantly. 
More importantly, we show improvement in generalization to unseen scenes and/or objects.", "keywords": "Visual Navigation;Scene Prior;Knowledge Graph;Graph Convolution Networks;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Wei Yang;Xiaolong Wang;Ali Farhadi;Abhinav Gupta;Roozbeh Mottaghi", "authorids": "wyang@ee.cuhk.edu.hk;xiaolonw@cs.cmu.edu;ali@cs.washington.edu;abhinavg@cs.cmu.edu;roozbehm@allenai.org", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyang2019visual,\ntitle={Visual Semantic Navigation using Scene Priors},\nauthor={Wei Yang and Xiaolong Wang and Ali Farhadi and Abhinav Gupta and Roozbeh Mottaghi},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeRkh05Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;1", "wc_review": "197;305;427", "wc_reply_reviewers": "68;53;0", "wc_reply_authors": "186;313;471", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 2.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 309.6666666666667, 93.95507200548332 ], "wc_reply_reviewers_avg": [ 40.333333333333336, 29.169999809545576 ], "wc_reply_authors_avg": [ 323.3333333333333, 116.57996778558865 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 391, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10385662033870004027&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HJeRkh05Km", "pdf": "https://openreview.net/pdf?id=HJeRkh05Km", "email": ";;;;", "author_num": 5 }, { "id": "HJeRm3Aqt7", "title": "GenEval: A Benchmark Suite for Evaluating Generative Models", "track": "main", "status": "Reject", "tldr": "We introduce battery of synthetic distributions and metrics for measuring the success of generative models ", "abstract": "Generative models are important for several practical applications, from low level image processing tasks, to model-based planning in robotics. More generally,\nthe study of generative models is motivated by the long-standing endeavor to model uncertainty and to discover structure by leveraging unlabeled data.\nUnfortunately, the lack of an ultimate task of interest has hindered progress in the field, as there is no established way to\ncompare models and, often times, evaluation is based on mere visual inspection of samples drawn from such models.\n\nIn this work, we aim at addressing this problem by introducing a new benchmark evaluation suite, dubbed \\textit{GenEval}.\nGenEval hosts a large array of distributions capturing many important\nproperties of real datasets, yet in a controlled setting, such as lower intrinsic dimensionality, multi-modality, compositionality,\nindependence and causal structure. 
Any model can be easily plugged in for evaluation, provided it can generate samples.\n\nOur extensive evaluation suggests that different models have different strengths, and that GenEval is a great tool to gain insights about how models and metrics work.\nWe offer GenEval to the community~\\footnote{Available at: \\it{coming soon}.} and believe that this benchmark will facilitate comparison and development of\nnew generative models.", "keywords": "generative models;GAN;VAE;Real NVP", "primary_area": "", "supplementary_material": "", "author": "Anton Bakhtin;Arthur Szlam;Marc'Aurelio Ranzato", "authorids": "yolo@fb.com;aszlam@fb.com;ranzato@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbakhtin2019geneval,\ntitle={GenEval: A Benchmark Suite for Evaluating Generative Models},\nauthor={Anton Bakhtin and Arthur Szlam and Marc'Aurelio Ranzato},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeRm3Aqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJeRm3Aqt7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;4", "wc_review": "298;293;1348", "wc_reply_reviewers": "41;153;74", "wc_reply_authors": "947;763;547", "reply_reviewers": "1;2;1", "reply_authors": "3;3;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 646.3333333333334, 496.15745708617766 ], "wc_reply_reviewers_avg": [ 89.33333333333333, 46.99172503986444 ], "wc_reply_authors_avg": [ 752.3333333333334, 163.47340932260374 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:NThZtjAC-ZsJ:scholar.google.com/&scioq=GenEval:+A+Benchmark+Suite+for+Evaluating+Generative+Models&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJedho0qFX", "title": "Using Word Embeddings to Explore the Learned Representations of Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "A simple technique using word embeddings provides multiple insights into the function and performance of CNNs, both during and after training, and for misclassified and adversarial examples.", "abstract": "As deep neural net architectures minimize loss, they build up information in a hierarchy of learned representations that ultimately serve their final goal. Different architectures tackle this problem in slightly different ways, but all models aim to create representational spaces that accumulate information through the depth of the network. Here we build on previous work that indicated that two very different model classes trained on two very different tasks actually build knowledge representations that have similar underlying representations. Namely, we compare word embeddings from SkipGram (trained to predict co-occurring words) to several CNN architectures (trained for image classification) in order to understand how this accumulation of knowledge behaves in CNNs. We improve upon previous work by including 5 times more ImageNet classes in our experiments, and further expand the scope of the analyses to include a network trained on CIFAR-100. 
We characterize network behavior in pretrained models, and also during training, misclassification, and adversarial attack. Our work illustrates the power of using one model to explore another, gives new insights for CNN models, and provides a framework for others to perform similar analyses when developing new architectures.", "keywords": "Distributional Semantics;word embeddings;cnns;interpretability", "primary_area": "", "supplementary_material": "", "author": "Dhanush Dharmaretnam;Chris Foster;Alona Fyshe", "authorids": "dhanush987@gmail.com;chris.james.foster@gmail.com;alona@ualberta.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndharmaretnam2019using,\ntitle={Using Word Embeddings to Explore the Learned Representations of Convolutional Neural Networks},\nauthor={Dhanush Dharmaretnam and Chris Foster and Alona Fyshe},\nyear={2019},\nurl={https://openreview.net/forum?id=HJedho0qFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJedho0qFX", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;2;4", "wc_review": "259;229;184", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 224.0, 30.822070014844883 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Zg9fMI4OHRkJ:scholar.google.com/&scioq=Using+Word+Embeddings+to+Explore+the+Learned+Representations+of+Convolutional+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HJehSnCcFX", "title": "Inference of unobserved event streams with neural Hawkes particle smoothing", "track": "main", "status": "Reject", "tldr": "", "abstract": "Events that we observe in the world may be caused by other, unobserved events. We consider sequences of discrete events in continuous time. When only some of the events are observed, we propose particle smoothing to infer the missing events. Particle smoothing is an extension of particle filtering in which proposed events are conditioned on the future as well as the past. For our setting, we develop a novel proposal distribution that is a type of continuous-time bidirectional LSTM. We use the sampled particles in an approximate minimum Bayes risk decoder that outputs a single low-risk prediction of the missing events. We experiment in multiple synthetic and real domains, modeling the complete sequences in each domain with a neural Hawkes process (Mei & Eisner, 2017). On held-out incomplete sequences, our method is effective at inferring the ground-truth unobserved events. 
In particular, particle smoothing consistently improves upon particle filtering, showing the benefit of training a bidirectional proposal distribution.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hongyuan Mei;Guanghui Qin;Jason Eisner", "authorids": "hmei@cs.jhu.edu;ghq@pku.edu.cn;jason@cs.jhu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmei2019inference,\ntitle={Inference of unobserved event streams with neural Hawkes particle smoothing},\nauthor={Hongyuan Mei and Guanghui Qin and Jason Eisner},\nyear={2019},\nurl={https://openreview.net/forum?id=HJehSnCcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJehSnCcFX", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;3;4", "wc_review": "687;314;149", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "2194;2052;366", "reply_reviewers": "0;0;0", "reply_authors": "5;5;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 383.3333333333333, 225.04271199535037 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1537.3333333333333, 830.2840210166371 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.6666666666666665, 1.8856180831641267 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:u__32LoTK8YJ:scholar.google.com/&scioq=Inference+of+unobserved+event+streams+with+neural+Hawkes+particle+smoothing&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJei-2RcK7", "title": "Graph Transformer", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph neural networks (GNN) have gained increasing research interests as a mean to the challenging goal of robust and universal graph learning. Previous GNNs have assumed single pre-fixed graph structure and permitted only local context encoding. This paper proposes a novel Graph Transformer (GTR) architecture that captures long-range dependency with global attention, and enables dynamic graph structures. In particular, GTR propagates features within the same graph structure via an intra-graph message passing, and transforms dynamic semantics across multi-domain graph-structured data (e.g. images, sequences, knowledge graphs) for multi-modal learning via an inter-graph message passing. Furthermore, GTR enables effective incorporation of any prior graph structure by weighted averaging of the prior and learned edges, which can be crucially useful for scenarios where prior knowledge is desired. The proposed GTR achieves new state-of-the-arts across three benchmark tasks, including few-shot learning, medical abnormality and disease classification, and graph classification. Experiments show that GTR is superior in learning robust graph representations, transforming high-level semantics across domains, and bridging between prior graph structure with automatic structure learning. ", "keywords": "Graph neural networks;transformer;attention", "primary_area": "", "supplementary_material": "", "author": "Yuan Li;Xiaodan Liang;Zhiting Hu;Yinbo Chen;Eric P. 
Xing", "authorids": "liyuanchristy@gmail.com;xiaodan1@cs.cmu.edu;zhitingh@cs.cmu.edu;cyb15@mails.tsinghua.edu.cn;epxing@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019graph,\ntitle={Graph Transformer },\nauthor={Yuan Li and Xiaodan Liang and Zhiting Hu and Yinbo Chen and Eric P. Xing},\nyear={2019},\nurl={https://openreview.net/forum?id=HJei-2RcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJei-2RcK7", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;5;5", "wc_review": "137;130;141", "wc_reply_reviewers": "147;0;32", "wc_reply_authors": "602;363;498", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 136.0, 4.546060565661952 ], "wc_reply_reviewers_avg": [ 59.666666666666664, 63.12069567283161 ], "wc_reply_authors_avg": [ 487.6666666666667, 97.8445479091991 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HJej3s09Km", "title": "On the effect of the activation function on the distribution of hidden nodes in a deep network", "track": "main", "status": "Reject", "tldr": "We prove that, for activation functions satisfying some conditions, as a deep network gets wide, the lengths of the vectors of hidden variables converge to a length map.", "abstract": "We analyze the joint probability distribution on the lengths of the\nvectors of hidden variables in different layers of a fully connected\ndeep network, when the weights and biases are chosen randomly according to\nGaussian distributions, and the input is binary-valued. We show\nthat, if the activation function satisfies a minimal set of\nassumptions, satisfied by all activation functions that we know that\nare used in practice, then, as the width of the network gets large,\nthe ``length process'' converges in probability to a length map\nthat is determined as a simple function of the variances of the\nrandom weights and biases, and the activation function.\n\nWe also show that this convergence may fail for activation functions \nthat violate our assumptions.", "keywords": "theory;length map;initialization", "primary_area": "", "supplementary_material": "", "author": "Philip M. Long and Hanie Sedghi;Philip M. Long and Hanie Sedghi", "authorids": "plong@google.com;hsedghi@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsedghi2019on,\ntitle={On the effect of the activation function on the distribution of hidden nodes in a deep network},\nauthor={Philip M. 
Long and Hanie Sedghi},\nyear={2019},\nurl={https://openreview.net/forum?id=HJej3s09Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJej3s09Km", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;3;3", "wc_review": "468;191;192", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "260;78;88", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 283.6666666666667, 130.34398933420582 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 142.0, 83.53841431740649 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15928001590395036075&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12 }, { "id": "HJej6jR5Fm", "title": "Meta-Learning to Guide Segmentation", "track": "main", "status": "Reject", "tldr": "We propose a meta-learning approach for guiding visual segmentation tasks from varying amounts of supervision.", "abstract": "There are myriad kinds of segmentation, and ultimately the `\"right\" segmentation of a given scene is in the eye of the annotator. Standard approaches require large amounts of labeled data to learn just one particular kind of segmentation. As a first step towards relieving this annotation burden, we propose the problem of guided segmentation: given varying amounts of pixel-wise labels, segment unannotated pixels by propagating supervision locally (within an image) and non-locally (across images). We propose guided networks, which extract a latent task representation---guidance---from variable amounts and classes (categories, instances, etc.) of pixel supervision and optimize our architecture end-to-end for fast, accurate, and data-efficient segmentation by meta-learning. To span the few-shot and many-shot learning regimes, we examine guidance from as little as one pixel per concept to as much as 1000+ images, and compare to full gradient optimization at both extremes. To explore generalization, we analyze guidance as a bridge between different levels of supervision to segment classes as the union of instances. Our segmentor concentrates different amounts of supervision of different types of classes into an efficient latent representation, non-locally propagates this supervision across images, and can be updated quickly and cumulatively when given more supervision.", "keywords": "meta-learning;few-shot learning;visual segmentation", "primary_area": "", "supplementary_material": "", "author": "Kate Rakelly*;Evan Shelhamer*;Trevor Darrell;Alexei A. Efros;Sergey Levine", "authorids": "rakelly@eecs.berkeley.edu;shelhamer@cs.berkeley.edu;trevor@eecs.berkeley.edu;efros@eecs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nrakelly*2019metalearning,\ntitle={Meta-Learning to Guide Segmentation},\nauthor={Kate Rakelly* and Evan Shelhamer* and Trevor Darrell and Alexei A. 
Efros and Sergey Levine},\nyear={2019},\nurl={https://openreview.net/forum?id=HJej6jR5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJej6jR5Fm", "pdf_size": 0, "rating": "3;3;7", "confidence": "5;4;4", "wc_review": "511;422;159", "wc_reply_reviewers": "280;0;0", "wc_reply_authors": "763;1504;390", "reply_reviewers": "1;0;0", "reply_authors": "1;2;1", "rating_avg": [ 4.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 364.0, 149.44118129440315 ], "wc_reply_reviewers_avg": [ 93.33333333333333, 131.99326582148888 ], "wc_reply_authors_avg": [ 885.6666666666666, 462.9862008982797 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13890355884700065612&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJepJh0qKX", "title": "Empirical Study of Easy and Hard Examples in CNN Training", "track": "main", "status": "Reject", "tldr": "Unknown properties of easy and hard examples are shown, and they come from biases in a dataset and SGD.", "abstract": "Deep Neural Networks (DNNs) generalize well despite their massive size and capability of memorizing all examples.\nThere is a hypothesis that DNNs start learning from simple patterns based on the observations that are consistently well-classified at early epochs (i.e., easy examples) and examples misclassified (i.e., hard examples).\nHowever, despite the importance of understanding the learning dynamics of DNNs, properties of easy and hard examples are not fully investigated.\nIn this paper, we study the similarities of easy and hard examples respectively among different CNNs, assessing those examples\u2019 contributions to generalization.\nOur results show that most easy examples are identical among different CNNs, as they share similar dataset-dependent patterns (e.g., colors, structures, and superficial cues in high-frequency).\nMoreover, while hard examples tend to contribute more to generalization than easy examples, removing a large number of easy examples leads to poor generalization, and we find that most misclassified examples in validation dataset are hard examples.\nBy analyzing intriguing properties of easy and hard examples, we discover that the reason why easy and hard examples have such properties can be explained by biases in a dataset and Stochastic Gradient Descent (SGD).", "keywords": "easy examples;hard example;CNN", "primary_area": "", "supplementary_material": "", "author": "Ikki Kishida;Hideki Nakayama", "authorids": "kishida@nlab.ci.i.u-tokyo.ac.jp;nakayama@nlab.ci.i.u-tokyo.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkishida2019empirical,\ntitle={Empirical Study of Easy and Hard Examples in {CNN} Training},\nauthor={Ikki Kishida and Hideki Nakayama},\nyear={2019},\nurl={https://openreview.net/forum?id=HJepJh0qKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJepJh0qKX", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;5", "wc_review": "356;722;686", 
"wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 588.0, 164.7057983193063 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18355698356208331501&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "HJeq43AqF7", "title": "Unsupervised Latent Tree Induction with Deep Inside-Outside Recursive Auto-Encoders", "track": "main", "status": "Withdraw", "tldr": "In this work we propose deep inside-outside recursive auto-encoders(DIORA) a fully unsupervised method of discovering syntax while simultaneously learning representations for discovered constituents. ", "abstract": "Syntax is a powerful abstraction for language understanding. Many downstream tasks require segmenting input text into meaningful constituent chunks (e.g., noun phrases or entities); more generally, models for learning semantic representations of text benefit from integrating syntax in the form of parse trees (e.g., tree-LSTMs). Supervised parsers have traditionally been used to obtain these trees, but lately interest has increased in unsupervised methods that induce syntactic representations directly from unlabeled text. To this end, we propose the deep inside-outside recursive autoencoder (DIORA), a fully-unsupervised method for discovering syntax that simultaneously learns representations for constituents within the induced tree. Unlike many prior approaches, DIORA does not rely on supervision from auxiliary downstream tasks and is thus not constrained to particular domains. Furthermore, competing approaches do not learn explicit phrase representations along with tree structures, which limits their applicability to phrase-based tasks. Extensive experiments on unsupervised parsing, segmentation, and phrase clustering demonstrate the efficacy of our method. 
DIORA achieves the state of the art in unsupervised parsing (46.9 F1) on the benchmark WSJ dataset.", "keywords": "latent-tree-learning;unsupervised-parsing", "primary_area": "", "supplementary_material": "", "author": "Andrew Drozdov;Patrick Verga;Mohit Yadev;Mohit Iyyer;Andrew McCallum", "authorids": "adrozdov@cs.umass.edu;pat@cs.umass.edu;ymohit@cs.umass.edu;miyyer@cs.umass.edu;mccallum@cs.umass.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJeq43AqF7", "pdf_size": 0, "rating": "2;5;6", "confidence": "4;4;3", "wc_review": "287;317;288", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "210;501;355", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 1.699673171197595 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 297.3333333333333, 13.912424503139471 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 355.3333333333333, 118.80048634393529 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.6933752452815364, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10522796287850523862&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "HJerDj05tQ", "title": "Optimization on Multiple Manifolds", "track": "main", "status": "Reject", "tldr": "This paper introduces an algorithm to handle optimization problems with multiple constraints from a manifold perspective.", "abstract": "Optimization on manifolds has been widely used in machine learning to handle optimization problems with constraints. Most previous works focus on the case with a single manifold. However, in practice it is quite common that the optimization problem involves more than one constraint (each constraint corresponding to one manifold). It is not clear in general how to optimize on multiple manifolds effectively and provably, especially when the intersection of multiple manifolds is not a manifold or cannot be easily calculated. We propose a unified algorithm framework to handle the optimization on multiple manifolds. Specifically, we integrate information from multiple manifolds and move along an ensemble direction by viewing the information from each manifold as a drift and adding them together. We prove the convergence properties of the proposed algorithms. 
We also apply the algorithms to training neural networks with batch normalization layers and achieve favorable empirical results.", "keywords": "Optimization;Multiple constraints;Manifold", "primary_area": "", "supplementary_material": "", "author": "Mingyang Yi;Huishuai Zhang;Wei Chen;Zhi-ming Ma;Tie-yan Liu", "authorids": "yimingyang17@mails.ucas.edu.cn;huishuai.zhang@microsoft.com;wche@microsoft.com;mazm@amt.ac.cn;tie-yan.liu@mircosoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyi2019optimization,\ntitle={Optimization on Multiple Manifolds},\nauthor={Mingyang Yi and Huishuai Zhang and Wei Chen and Zhi-ming Ma and Tie-yan Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=HJerDj05tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJerDj05tQ", "pdf_size": 0, "rating": "1;3;7", "confidence": "5;4;3", "wc_review": "1046;381;74", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 2.494438257849294 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 500.3333333333333, 405.68981035049705 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9819805060619656, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "NOODL: Provable Online Dictionary Learning and Sparse Coding", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/847", "id": "HJeu43ActQ", "author_site": "Sirisha Rambhatla, Xingguo Li, Jarvis Haupt", "tldr": "We present a provable algorithm for exactly recovering both factors of the dictionary learning model. ", "abstract": "We consider the dictionary learning problem, where the aim is to model the given data as a linear combination of a few columns of a matrix known as a dictionary, where the sparse weights forming the linear combination are known as coefficients. Since the dictionary and coefficients parameterizing the linear model are unknown, the corresponding optimization is inherently non-convex. This was a major challenge until recently, when provable algorithms for dictionary learning were proposed. Yet, these provide guarantees only on the recovery of the dictionary, without explicit recovery guarantees on the coefficients. Moreover, any estimation error in the dictionary adversely impacts the ability to successfully localize and estimate the coefficients. This potentially limits the utility of existing provable dictionary learning methods in applications where coefficient recovery is of interest. To this end, we develop NOODL: a simple Neurally plausible alternating Optimization-based Online Dictionary Learning algorithm, which recovers both the dictionary and coefficients exactly at a geometric rate, when initialized appropriately. Our algorithm, NOODL, is also scalable and amenable to large-scale distributed implementations in neural architectures, by which we mean that it only involves simple linear and non-linear operations. 
Finally, we corroborate these theoretical results via experimental evaluation of the proposed algorithm with the current state-of-the-art techniques.", "keywords": "dictionary learning;provable dictionary learning;online dictionary learning;sparse coding;support recovery;iterative hard thresholding;matrix factorization;neural architectures;neural networks;noodl", "primary_area": "", "supplementary_material": "", "author": "Sirisha Rambhatla;Xingguo Li;Jarvis Haupt", "authorids": "rambh002@umn.edu;lixx1661@umn.edu;jdhaupt@umn.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nrambhatla2019NOODL,\ntitle={NOODL: Provable Online Dictionary Learning and Sparse Coding},\nauthor={Sirisha Rambhatla and Xingguo Li and Jarvis Haupt},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeu43ActQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;2;2", "wc_review": "203;176;338", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "389;787;401", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.0, 0.0 ], "wc_review_avg": [ 239.0, 70.8660708661063 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 525.6666666666666, 184.85549912176145 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12647512351246426060&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HJeu43ActQ", "pdf": "https://openreview.net/pdf?id=HJeu43ActQ", "email": ";;", "author_num": 3 }, { "id": "HJeuOiRqKQ", "title": "Pooling Is Neither Necessary nor Sufficient for Appropriate Deformation Stability in CNNs", "track": "main", "status": "Reject", "tldr": "We find that pooling alone does not determine deformation stability in CNNs and that filter smoothness plays an important role in determining stability. ", "abstract": "Many of our core assumptions about how neural networks operate remain empirically untested. One common assumption is that convolutional neural networks need to be stable to small translations and deformations to solve image recognition tasks. For many years, this stability was baked into CNN architectures by incorporating interleaved pooling layers. Recently, however, interleaved pooling has largely been abandoned. This raises a number of questions: Are our intuitions about deformation stability right at all? Is it important? Is pooling necessary for deformation invariance? If not, how is deformation invariance achieved in its absence? In this work, we rigorously test these questions, and find that deformation stability in convolutional networks is more nuanced than it first appears: (1) Deformation invariance is not a binary property, but rather that different tasks require different degrees of deformation stability at different layers. (2) Deformation stability is not a fixed property of a network and is heavily adjusted over the course of training, largely through the smoothness of the convolutional filters. 
(3) Interleaved pooling layers are neither necessary nor sufficient for achieving the optimal form of deformation stability for natural image classification. (4) Pooling confers \\emph{too much} deformation stability for image classification at initialization, and during training, networks have to learn to \\emph{counteract} this inductive bias. Together, these findings provide new insights into the role of interleaved pooling and deformation invariance in CNNs, and demonstrate the importance of rigorous empirical testing of even our most basic assumptions about the working of neural networks.", "keywords": "Convolutional Neural Networks;Deformation Stability;Pooling;Transformation Invariance", "primary_area": "", "supplementary_material": "", "author": "Avraham Ruderman;Neil C. Rabinowitz;Ari S. Morcos;Daniel Zoran", "authorids": "aruderman@google.com;ncr@google.com;arimorcos@gmail.com;danielzoran@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nruderman2019pooling,\ntitle={Pooling Is Neither Necessary nor Sufficient for Appropriate Deformation Stability in {CNN}s},\nauthor={Avraham Ruderman and Neil C. Rabinowitz and Ari S. Morcos and Daniel Zoran},\nyear={2019},\nurl={https://openreview.net/forum?id=HJeuOiRqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer1;AnonReviewer6", "site": "https://openreview.net/forum?id=HJeuOiRqKQ", "pdf_size": 0, "rating": "4;5;5;5", "confidence": "4;2;2;5", "wc_review": "371;252;251;313", "wc_reply_reviewers": "0;0;310;0", "wc_reply_authors": "292;0;347;177", "reply_reviewers": "0;0;1;0", "reply_authors": "1;0;1;1", "rating_avg": [ 4.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 1.299038105676658 ], "wc_review_avg": [ 296.75, 49.68085647409875 ], "wc_reply_reviewers_avg": [ 77.5, 134.23393758658798 ], "wc_reply_authors_avg": [ 204.0, 132.7949547234382 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0.75, 0.4330127018922193 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.3333333333333333, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2543638633806982422&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "HJex0o05F7", "title": "UaiNets: From Unsupervised to Active Deep Anomaly Detection", "track": "main", "status": "Reject", "tldr": "A method for active anomaly detection. We present a new layer that can be attached to any deep learning model designed for unsupervised anomaly detection to transform it into an active method.", "abstract": "This work presents a method for active anomaly detection which can be built upon existing deep learning solutions for unsupervised anomaly detection. We show that a prior needs to be assumed on what the anomalies are, in order to have performance guarantees in unsupervised anomaly detection. We argue that active anomaly detection has, in practice, the same cost of unsupervised anomaly detection but with the possibility of much better results. 
To solve this problem, we present a new layer that can be attached to any deep learning model designed for unsupervised anomaly detection to transform it into an active method, presenting results on both synthetic and real anomaly detection datasets.", "keywords": "Anomaly Detection;Active Learning;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Tiago Pimentel;Marianne Monteiro;Juliano Viana;Adriano Veloso;Nivio Ziviani", "authorids": "tiago.pimentel@kunumi.com;marianne@kunumi.com;juliano@kunumi.com;adrianov@dcc.ufmg.br;nivio@dcc.ufmg.br", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\npimentel2019uainets,\ntitle={UaiNets: From Unsupervised to Active Deep Anomaly Detection},\nauthor={Tiago Pimentel and Marianne Monteiro and Juliano Viana and Adriano Veloso and Nivio Ziviani},\nyear={2019},\nurl={https://openreview.net/forum?id=HJex0o05F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJex0o05F7", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;2", "wc_review": "583;98;180", "wc_reply_reviewers": "163;0;0", "wc_reply_authors": "611;451;377", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 287.0, 211.96383339302642 ], "wc_reply_reviewers_avg": [ 54.333333333333336, 76.83893688893816 ], "wc_reply_authors_avg": [ 479.6666666666667, 97.65699610826093 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:E1ppgIHvDCYJ:scholar.google.com/&scioq=UaiNets:+From+Unsupervised+to+Active+Deep+Anomaly+Detection&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJf7ts0cFm", "title": "State-Regularized Recurrent Networks", "track": "main", "status": "Reject", "tldr": "We introduce stochastic state transition mechanism to RNNs, simplifies finite state automata (DFA) extraction, forces RNNs to operate more like automata with external memory, better extrapolation behavior and interpretability.", "abstract": "Recurrent networks are a widely used class of neural architectures. They have, however, two shortcomings. First, it is difficult to understand what exactly they learn. Second, they tend to work poorly on sequences requiring long-term memorization, despite having this capacity in principle. We aim to address both shortcomings with a class of recurrent networks that use a stochastic state transition mechanism between cell applications. This mechanism, which we term state-regularization, makes RNNs transition between a finite set of learnable states. 
We show that state-regularization (a) simplifies the extraction of finite state automata modeling an RNN's state transition dynamics, and (b) forces RNNs to operate more like automata with external memory and less like finite state machines.", "keywords": "recurrent network;finite state machines;state-regularized;interpretability and explainability", "primary_area": "", "supplementary_material": "", "author": "Cheng Wang;Mathias Niepert", "authorids": "dr.rer.nat.chengwang@gmail.com;mathias.niepert@neclab.eu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwang2019stateregularized,\ntitle={State-Regularized Recurrent Networks},\nauthor={Cheng Wang and Mathias Niepert},\nyear={2019},\nurl={https://openreview.net/forum?id=HJf7ts0cFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=HJf7ts0cFm", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;5;4", "wc_review": "329;297;271", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "780;622;596", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 299.0, 23.72059583287626 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 666.0, 81.30600633819537 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mlDtrLgXRKoJ:scholar.google.com/&scioq=State-Regularized+Recurrent+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Stochastic Gradient/Mirror Descent: Minimax Optimality and Implicit Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/688", "id": "HJf9ZhC9FX", "author_site": "Navid Azizan, Babak Hassibi", "tldr": "", "abstract": "Stochastic descent methods (of the gradient and mirror varieties) have become increasingly popular in optimization. In fact, it is now widely recognized that the success of deep learning is not only due to the special deep architecture of the models, but also due to the behavior of the stochastic descent methods used, which play a key role in reaching \"good\" solutions that generalize well to unseen data. In an attempt to shed some light on why this is the case, we revisit some minimax properties of stochastic gradient descent (SGD) for the square loss of linear models---originally developed in the 1990's---and extend them to \\emph{general} stochastic mirror descent (SMD) algorithms for \\emph{general} loss functions and \\emph{nonlinear} models. \nIn particular, we show that there is a fundamental identity which holds for SMD (and SGD) under very general conditions, and which implies the minimax optimality of SMD (and SGD) for sufficiently small step size, and for a general class of loss functions and general nonlinear models.\nWe further show that this identity can be used to naturally establish other properties of SMD (and SGD), namely convergence and \\emph{implicit regularization} for over-parameterized linear models (in what is now being called the \"interpolating regime\"), some of which have been shown in certain cases in prior literature. 
We also argue how this identity can be used in the so-called \"highly over-parameterized\" nonlinear setting (where the number of parameters far exceeds the number of data points) to provide insights into why SMD (and SGD) may have similar convergence and implicit regularization properties for deep learning. ", "keywords": "optimization;stochastic gradient descent;mirror descent;implicit regularization;deep learning theory", "primary_area": "", "supplementary_material": "", "author": "Navid Azizan;Babak Hassibi", "authorids": "azizan@caltech.edu;hassibi@caltech.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nazizan2018stochastic,\ntitle={Stochastic Gradient/Mirror Descent: Minimax Optimality and Implicit Regularization},\nauthor={Navid Azizan and Babak Hassibi},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJf9ZhC9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;3;4", "wc_review": "283;171;349", "wc_reply_reviewers": "0;119;0", "wc_reply_authors": "701;1246;360", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 267.6666666666667, 73.47259504211229 ], "wc_reply_reviewers_avg": [ 39.666666666666664, 56.09713797413277 ], "wc_reply_authors_avg": [ 769.0, 364.88993774378963 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11983430360306499226&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=HJf9ZhC9FX", "pdf": "https://openreview.net/pdf?id=HJf9ZhC9FX", "email": ";", "author_num": 2 }, { "id": "HJfQrs0qt7", "title": "Convergence Properties of Deep Neural Networks on Separable Data", "track": "main", "status": "Reject", "tldr": "This paper analyzes the learning dynamics of neural networks on classification tasks solved by gradient descent using the cross-entropy and hinge losses.", "abstract": "While a lot of progress has been made in recent years, the dynamics of learning in deep nonlinear neural networks remain to this day largely misunderstood. In this work, we study the case of binary classification and prove various properties of learning in such networks under strong assumptions such as linear separability of the data. Extending existing results from the linear case, we confirm empirical observations by proving that the classification error also follows a sigmoidal shape in nonlinear architectures. We show that given proper initialization, learning expounds parallel independent modes and that certain regions of parameter space might lead to failed training. We also demonstrate that input norm and features' frequency in the dataset lead to distinct convergence speeds which might shed some light on the generalization capabilities of deep neural networks. 
We provide a comparison between the dynamics of learning with cross-entropy and hinge losses, which could prove useful to understand recent progress in the training of generative adversarial networks. Finally, we identify a phenomenon that we baptize gradient starvation where the most frequent features in a dataset prevent the learning of other less frequent but equally informative features.", "keywords": "learning dynamics;gradient descent;classification;optimization;cross-entropy;hinge loss;implicit regularization;gradient starvation", "primary_area": "", "supplementary_material": "", "author": "Remi Tachet des Combes;Mohammad Pezeshki;Samira Shabanian;Aaron Courville;Yoshua Bengio", "authorids": "remi.tachet@microsoft.com;mohammad.pezeshki@umontreal.ca;s.shabanian@gmail.com;aaron.courville@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ncombes2019convergence,\ntitle={Convergence Properties of Deep Neural Networks on Separable Data},\nauthor={Remi Tachet des Combes and Mohammad Pezeshki and Samira Shabanian and Aaron Courville and Yoshua Bengio},\nyear={2019},\nurl={https://openreview.net/forum?id=HJfQrs0qt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJfQrs0qt7", "pdf_size": 0, "rating": "5;5;5", "confidence": "3;4;4", "wc_review": "377;628;155", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "371;346;179", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 386.6666666666667, 193.22238195635848 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 298.6666666666667, 85.23040667638647 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4610646655832289226&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Active Learning with Partial Feedback", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/794", "id": "HJfSEnRqKQ", "author_site": "Peiyun Hu, Zachary Lipton, Anima Anandkumar, Deva Ramanan", "tldr": "We provide a new perspective on training a machine learning model from scratch in hierarchical label setting, i.e. thinking of it as two-way communication between human and algorithms, and study how we can both measure and improve the efficiency. ", "abstract": "While many active learning papers assume that the learner can simply ask for a label and receive it, real annotation often presents a mismatch between the form of a label (say, one among many classes), and the form of an annotation (typically yes/no binary feedback). To annotate examples corpora for multiclass classification, we might need to ask multiple yes/no questions, exploiting a label hierarchy if one is available. To address this more realistic setting, we propose active learning with partial feedback (ALPF), where the learner must actively choose both which example to label and which binary question to ask. At each step, the learner selects an example, asking if it belongs to a chosen (possibly composite) class. 
Each answer eliminates some classes, leaving the learner with a partial label. The learner may then either ask more questions about the same example (until an exact label is uncovered) or move on immediately, leaving the first example partially labeled. Active learning with partial labels requires (i) a sampling strategy to choose (example, class) pairs, and (ii) learning from partial labels between rounds. Experiments on Tiny ImageNet demonstrate that our most effective method improves 26% (relative) in top-1 classification accuracy compared to i.i.d. baselines and standard active learners given 30% of the annotation budget that would be required (naively) to annotate the dataset. Moreover, ALPF-learners fully annotate TinyImageNet at 42% lower cost. Surprisingly, we observe that accounting for per-example annotation costs can alter the conventional wisdom that active learners should solicit labels for hard examples.", "keywords": "Active Learning;Learning from Partial Feedback", "primary_area": "", "supplementary_material": "", "author": "Peiyun Hu;Zachary C. Lipton;Anima Anandkumar;Deva Ramanan", "authorids": "peiyunh@cs.cmu.edu;zlipton@cmu.edu;anima@caltech.edu;deva@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nhu2018active,\ntitle={Active Learning with Partial Feedback},\nauthor={Peiyun Hu and Zack Lipton and Anima Anandkumar and Deva Ramanan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJfSEnRqKQ},\n}", "github": "[![github](/images/github_icon.svg) peiyunh/alpf](https://github.com/peiyunh/alpf)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "wc_review": "215;426;200", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "329;393;162", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 280.3333333333333, 103.18376271918412 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 294.6666666666667, 97.38012573871985 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2828167692054854631&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14, "openreview": "https://openreview.net/forum?id=HJfSEnRqKQ", "pdf": "https://openreview.net/pdf?id=HJfSEnRqKQ", "email": ";;;", "author_num": 4 }, { "title": "Gradient descent aligns the layers of deep linear networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/951", "id": "HJflg30qKX", "author_site": "Ziwei Ji, Matus Telgarsky", "tldr": "", "abstract": "This paper establishes risk convergence and asymptotic weight matrix alignment --- a form of implicit regularization --- of gradient flow and gradient descent when applied to deep linear networks on linearly separable data. 
In more detail, for gradient flow applied to strictly decreasing loss functions (with similar results for gradient descent with particular decreasing step sizes):\n(i) the risk converges to 0;\n(ii) the normalized i-th weight matrix asymptotically equals its rank-1 approximation u_iv_i^T;\n(iii) these rank-1 matrices are aligned across layers, meaning |v_{i+1}^T u_i| -> 1.\nIn the case of the logistic loss (binary cross entropy), more can be said: the linear function induced by the network --- the product of its weight matrices --- converges to the same direction as the maximum margin solution. This last property was identified in prior work, but only under assumptions on gradient descent which here are implied by the alignment phenomenon.", "keywords": "implicit regularization;alignment of layers;deep linear networks;gradient descent;separable data", "primary_area": "", "supplementary_material": "", "author": "Ziwei Ji;Matus Telgarsky", "authorids": "ziweiji2@illinois.edu;mjt@illinois.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nji2018gradient,\ntitle={Gradient descent aligns the layers of deep linear networks},\nauthor={Ziwei Ji and Matus Telgarsky},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJflg30qKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;9", "confidence": "5;4;4", "wc_review": "227;180;222", "wc_reply_reviewers": "18;0;0", "wc_reply_authors": "352;87;183", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 209.66666666666666, 21.076579946049648 ], "wc_reply_reviewers_avg": [ 6.0, 8.48528137423857 ], "wc_reply_authors_avg": [ 207.33333333333334, 109.54552579736983 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 284, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6734207111249111403&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HJflg30qKX", "pdf": "https://openreview.net/pdf?id=HJflg30qKX", "email": ";", "author_num": 2 }, { "title": "Data-Dependent Coresets for Compressing Neural Networks with Applications to Generalization Bounds", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/868", "id": "HJfwJ2A5KX", "author_site": "Cenk Baykal, Lucas Liebenwein, Igor Gilitschenski, Dan Feldman, Daniela Rus", "tldr": "", "abstract": "We present an efficient coresets-based neural network compression algorithm that sparsifies the parameters of a trained fully-connected neural network in a manner that provably approximates the network's output. Our approach is based on an importance sampling scheme that judiciously defines a sampling distribution over the neural network parameters, and as a result, retains parameters of high importance while discarding redundant ones. We leverage a novel, empirical notion of sensitivity and extend traditional coreset constructions to the application of compressing parameters. 
Our theoretical analysis establishes guarantees on the size and accuracy of the resulting compressed network and gives rise to generalization bounds that may provide new insights into the generalization properties of neural networks. We demonstrate the practical effectiveness of our algorithm on a variety of neural network configurations and real-world data sets.", "keywords": "coresets;neural network compression;generalization bounds;matrix sparsification", "primary_area": "", "supplementary_material": "", "author": "Cenk Baykal;Lucas Liebenwein;Igor Gilitschenski;Dan Feldman;Daniela Rus", "authorids": "baykal@mit.edu;lucasl@mit.edu;igilitschenski@mit.edu;dannyf@gmail.com;rus@csail.mit.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nbaykal2018datadependent,\ntitle={Data-Dependent Coresets for Compressing Neural Networks with Applications to Generalization Bounds},\nauthor={Cenk Baykal and Lucas Liebenwein and Igor Gilitschenski and Dan Feldman and Daniela Rus},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJfwJ2A5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;4", "wc_review": "506;202;432", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "681;353;805", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 380.0, 129.4398187060947 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 613.0, 190.68997526526314 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15061912731430801795&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=HJfwJ2A5KX", "pdf": "https://openreview.net/pdf?id=HJfwJ2A5KX", "email": ";;;;", "author_num": 5 }, { "id": "HJfxbhR9KQ", "title": "Mimicking actions is a good strategy for beginners: Fast Reinforcement Learning with Expert Action Sequences", "track": "main", "status": "Reject", "tldr": "Appending most frequent action pairs from an expert player to a novice RL agent's action space improves the scores by huge margin.", "abstract": "Imitation Learning is the task of mimicking the behavior of an expert player in a Reinforcement Learning(RL) Environment to enhance the training of a fresh agent (called novice) beginning from scratch. Most of the Reinforcement Learning environments are stochastic in nature, i.e., the state sequences that an agent may encounter usually follow a Markov Decision Process (MDP). This makes the task of mimicking difficult as it is very unlikely that a new agent may encounter same or similar state sequences as an expert. Prior research in Imitation Learning proposes various ways to learn a mapping between the states encountered and the respective actions taken by the expert while mostly being agnostic to the order in which these were performed. Most of these methods need considerable number of states-action pairs to achieve good results. 
We propose a simple alternative to Imitation Learning by appending the novice\u2019s action space with the frequent short action sequences that the expert has taken. This simple modification, surprisingly improves the exploration and significantly outperforms alternative approaches like Dataset Aggregation. We experiment with several popular Atari games and show significant and consistent growth in the score that the new agents achieve using just a few expert action sequences.", "keywords": "Reinforcement Learning;Imitation Learning;Atari;A3C;GA3C", "primary_area": "", "supplementary_material": "", "author": "Tharun Medini;Anshumali Shrivastava", "authorids": "tharun.medini@rice.edu;anshumali@rice.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmedini2019mimicking,\ntitle={Mimicking actions is a good strategy for beginners: Fast Reinforcement Learning with Expert Action Sequences},\nauthor={Tharun Medini and Anshumali Shrivastava},\nyear={2019},\nurl={https://openreview.net/forum?id=HJfxbhR9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=HJfxbhR9KQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "2;3;4", "wc_review": "190;464;373", "wc_reply_reviewers": "0;207;0", "wc_reply_authors": "365;459;500", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 342.3333333333333, 113.9424806158304 ], "wc_reply_reviewers_avg": [ 69.0, 97.58073580374356 ], "wc_reply_authors_avg": [ 441.3333333333333, 56.51155240794183 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:eg94mKHVtrgJ:scholar.google.com/&scioq=Mimicking+actions+is+a+good+strategy+for+beginners:+Fast+Reinforcement+Learning+with+Expert+Action+Sequences&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJg3rjA5tQ", "title": "Spread Divergences", "track": "main", "status": "Reject", "tldr": "Using noise to define the divergence between distributions with different support.", "abstract": "For distributions $p$ and $q$ with different support, the divergence $\\div{p}{q}$ generally will not exist. We define a spread divergence $\\sdiv{p}{q}$ on modified $p$ and $q$ and describe sufficient conditions for the existence of such a divergence. 
We give examples of using a spread divergence to train implicit generative models, including linear models (Principal Components Analysis and Independent Components Analysis) and non-linear models (Deep Generative Networks).", "keywords": "Generative Adversarial Network;Divergence", "primary_area": "", "supplementary_material": "", "author": "David Barber;Mingtian Zhang;Raza Habib;Thomas Bird", "authorids": "d.barber@cs.ucl.ac.uk;mingtian.zhang.17@ucl.ac.uk;raza.habib.15@ucl.ac.uk;thomas.bird.17@ucl.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbarber2019spread,\ntitle={Spread Divergences},\nauthor={David Barber and Mingtian Zhang and Raza Habib and Thomas Bird},\nyear={2019},\nurl={https://openreview.net/forum?id=HJg3rjA5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJg3rjA5tQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "wc_review": "586;464;307", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "566;602;105", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 452.3333333333333, 114.19963027766578 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 424.3333333333333, 226.280553492537 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16017195806562829572&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7 }, { "id": "HJg6e2CcK7", "title": "Clean-Label Backdoor Attacks", "track": "main", "status": "Reject", "tldr": "We show how to successfully perform backdoor attacks without changing training labels.", "abstract": "Deep neural networks have been recently demonstrated to be vulnerable to backdoor attacks. Specifically, by altering a small set of training examples, an adversary is able to install a backdoor that can be used during inference to fully control the model\u2019s behavior. While the attack is very powerful, it crucially relies on the adversary being able to introduce arbitrary, often clearly mislabeled, inputs to the training set and can thus be detected even by fairly rudimentary data filtering. In this paper, we introduce a new approach to executing backdoor attacks, utilizing adversarial examples and GAN-generated data. 
The key feature is that the resulting poisoned inputs appear to be consistent with their label and thus seem benign even upon human inspection.", "keywords": "data poisoning;backdoor attacks;clean labels;adversarial examples;generative adversarial networks", "primary_area": "", "supplementary_material": "", "author": "Alexander Turner;Dimitris Tsipras;Aleksander Madry", "authorids": "turneram@mit.edu;tsipras@mit.edu;madry@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nturner2019cleanlabel,\ntitle={Clean-Label Backdoor Attacks},\nauthor={Alexander Turner and Dimitris Tsipras and Aleksander Madry},\nyear={2019},\nurl={https://openreview.net/forum?id=HJg6e2CcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJg6e2CcK7", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;2;2", "wc_review": "489;105;252", "wc_reply_reviewers": "164;0;7", "wc_reply_authors": "868;16;309", "reply_reviewers": "2;0;1", "reply_authors": "2;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 282.0, 158.19608086169518 ], "wc_reply_reviewers_avg": [ 57.0, 75.71437556149206 ], "wc_reply_authors_avg": [ 397.6666666666667, 353.43300480980673 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 178, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5708531697668375749&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HJgHwi0ctm", "title": "NA", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "NA", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Masoud Faraki;Mahsa Baktashmotlagh;Tom Drummond;Mathieu Salzmann", "authorids": "masoud.faraki@monash.edu;m.baktashmotlagh@qut.edu.au;tom.drummond@monash.edu;mathieu.salzmann@epfl.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJgHwi0ctm", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;4", "wc_review": "209;433;156", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 266.0, 120.05276617665528 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HJgJS30qtm", "title": "REVISTING NEGATIVE TRANSFER USING ADVERSARIAL LEARNING", "track": "main", "status": "Reject", "tldr": "We look at negative transfer from a domain adaptation point of view to derive an adversarial learning algorithm.", "abstract": "An unintended consequence of feature sharing is the model fitting to 
correlated tasks within the dataset, termed negative transfer. In this paper, we revisit the problem of negative transfer in multitask setting and find that its corrosive effects are applicable to a wide range of linear and non-linear models, including neural networks. We first study the effects of negative transfer in a principled way and show that previously proposed counter-measures are insufficient, particularly for trainable features. We propose an adversarial training approach to mitigate the effects of negative transfer by viewing the problem in a domain adaptation setting. Finally, empirical results on attribute prediction multi-task on AWA and CUB datasets further validate the need for correcting negative sharing in an end-to-end manner.", "keywords": "Negative Transfer;Adversarial Learning", "primary_area": "", "supplementary_material": "", "author": "Saneem Ahmed Chemmengath;Samarth Bharadwaj;Suranjana Samanta;Karthik Sankaranarayanan", "authorids": "saneem.cg@in.ibm.com;samarth.b@in.ibm.com;suransam@in.ibm.com;kartsank@in.ibm.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchemmengath2019revisting,\ntitle={{REVISTING} {NEGATIVE} {TRANSFER} {USING} {ADVERSARIAL} {LEARNING}},\nauthor={Saneem Ahmed Chemmengath and Samarth Bharadwaj and Suranjana Samanta and Karthik Sankaranarayanan},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgJS30qtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJgJS30qtm", "pdf_size": 0, "rating": "2;4;6", "confidence": "4;4;4", "wc_review": "531;311;537", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 459.6666666666667, 105.15174220567575 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FmeFtUAvN6YJ:scholar.google.com/&scioq=REVISTING+NEGATIVE+TRANSFER+USING+ADVERSARIAL+LEARNING&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HJgODj05KX", "title": "A preconditioned accelerated stochastic gradient descent algorithm", "track": "main", "status": "Reject", "tldr": "We propose a preconditioned accelerated gradient method that combines Nesterov\u2019s accelerated gradient descent with a class of diagonal preconditioners, in a stochastic setting.", "abstract": "We propose a preconditioned accelerated stochastic gradient method suitable for large scale optimization. We derive sufficient convergence conditions for the minimization of convex functions using a generic class of diagonal preconditioners and provide a formal convergence proof based on a framework originally used for on-line learning. Inspired by recent popular adaptive per-feature algorithms, we propose a specific preconditioner based on the second moment of the gradient. The sufficient convergence conditions motivate a critical adaptation of the per-feature updates in order to ensure convergence. We show empirical results for the minimization of convex and non-convex cost functions, in the context of neural network training. 
The method compares favorably with respect to current, first order, stochastic optimization methods.", "keywords": "stochastic optimization;neural network;preconditioned accelerated stochastic gradient descent", "primary_area": "", "supplementary_material": "", "author": "Alexandru Onose;Seyed Iman Mossavat;Henk-Jan H. Smilde", "authorids": "alexandru.onose@asml.com;iman.mossavat@asml.com;henk-jan.smilde@asml.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nonose2019a,\ntitle={A preconditioned accelerated stochastic gradient descent algorithm},\nauthor={Alexandru Onose and Seyed Iman Mossavat and Henk-Jan H. Smilde},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgODj05KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJgODj05KX", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;3;3", "wc_review": "507;248;274", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 343.0, 116.4502755113386 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3333774144313560095&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "HJgOl3AqY7", "title": "Modulated Variational Auto-Encoders for Many-to-Many Musical Timbre Transfer", "track": "main", "status": "Reject", "tldr": "The paper uses Variational Auto-Encoding and network conditioning for Musical Timbre Transfer, we develop and generalize our architecture for many-to-many instrument transfers together with visualizations and evaluations.", "abstract": "Generative models have been successfully applied to image style transfer and domain translation. However, there is still a wide gap in the quality of results when learning such tasks on musical audio. Furthermore, most translation models only enable one-to-one or one-to-many transfer by relying on separate encoders or decoders and complex, computationally-heavy models. In this paper, we introduce the Modulated Variational auto-Encoders (MoVE) to perform musical timbre transfer. First, we define timbre transfer as applying parts of the auditory properties of a musical instrument onto another. We show that we can achieve and improve this task by conditioning existing domain translation techniques with Feature-wise Linear Modulation (FiLM). Then, by replacing the usual adversarial translation criterion by a Maximum Mean Discrepancy (MMD) objective, we alleviate the need for an auxiliary pair of discriminative networks. This allows a faster and more stable training, along with a controllable latent space encoder. By further conditioning our system on several different instruments, we can generalize to many-to-many transfer within a single variational architecture able to perform multi-domain transfers. Our models map inputs to 3-dimensional representations, successfully translating timbre from one instrument to another and supporting sound synthesis on a reduced set of control parameters. 
We evaluate our method in reconstruction and generation tasks while analyzing the auditory descriptor distributions across transferred domains. We show that this architecture incorporates generative controls in multi-domain transfer, yet remaining rather light, fast to train and effective on small datasets.", "keywords": "Musical Timbre;Instrument Translation;Domain Translation;Style Transfer;Sound Synthesis;Musical Information;Deep Learning;Variational Auto-Encoder;Generative Models;Network Conditioning", "primary_area": "", "supplementary_material": "", "author": "Adrien Bitton;Philippe Esling;Axel Chemla-Romeu-Santos", "authorids": "bitton@ircam.fr;philippe.esling@ircam.fr;axel.chemla-romeu-santos@ircam.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbitton2019modulated,\ntitle={Modulated Variational Auto-Encoders for Many-to-Many Musical Timbre Transfer},\nauthor={Adrien Bitton and Philippe Esling and Axel Chemla-Romeu-Santos},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgOl3AqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJgOl3AqY7", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;3;3", "wc_review": "837;511;165", "wc_reply_reviewers": "193;222;0", "wc_reply_authors": "897;652;167", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 504.3333333333333, 274.3833490250861 ], "wc_reply_reviewers_avg": [ 138.33333333333334, 98.53031118504711 ], "wc_reply_authors_avg": [ 572.0, 303.3424907042643 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18240424078159890222&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HJgTHnActQ", "title": "Local Image-to-Image Translation via Pixel-wise Highway Adaptive Instance Normalization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, image-to-image translation has seen a significant success. Among many approaches, image translation based on an exemplar image, which contains the target style information, has been popular, owing to its capability to handle multimodality as well as its suitability for practical use. However, most of the existing methods extract the style information from an entire exemplar and apply it to the entire input image, which introduces excessive image translation in irrelevant image regions. In response, this paper proposes a novel approach that jointly extracts out the local masks of the input image and the exemplar as targeted regions to be involved for image translation. In particular, the main novelty of our model lies in (1) co-segmentation networks for local mask generation and (2) the local mask-based highway adaptive instance normalization technique. We demonstrate the quantitative and the qualitative evaluation results to show the advantages of our proposed approach. 
Finally, the code is available at https://github.com/AnonymousIclrAuthor/Highway-Adaptive-Instance-Normalization", "keywords": "image to image translation;image translation;exemplar;mutlimodal", "primary_area": "", "supplementary_material": "", "author": "Wonwoong Cho;Seunghwan Choi;Junwoo Park;David Keetae Park;Tao Qin;Jaegul Choo", "authorids": "tyflehd21@korea.ac.kr;shadow2496@korea.ac.kr;skp.1003874@sk.com;heykeetae@gmail.com;taotsin@msn.com;jchoo@korea.ac.kr", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ncho2019local,\ntitle={Local Image-to-Image Translation via Pixel-wise Highway Adaptive Instance Normalization},\nauthor={Wonwoong Cho and Seunghwan Choi and Junwoo Park and David Keetae Park and Tao Qin and Jaegul Choo},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgTHnActQ},\n}", "github": "[![github](/images/github_icon.svg) AnonymousIclrAuthor/Highway-Adaptive-Instance-Normalization](https://github.com/AnonymousIclrAuthor/Highway-Adaptive-Instance-Normalization)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJgTHnActQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;4;4", "wc_review": "328;405;256", "wc_reply_reviewers": "112;346;0", "wc_reply_authors": "953;1193;114", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 329.6666666666667, 60.8404105472305 ], "wc_reply_reviewers_avg": [ 152.66666666666666, 144.15115523489763 ], "wc_reply_authors_avg": [ 753.3333333333334, 462.5727570399662 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OMvoK9Aw-XgJ:scholar.google.com/&scioq=Local+Image-to-Image+Translation+via+Pixel-wise+Highway+Adaptive+Instance+Normalization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJgVisRqtX", "title": "SEGEN: SAMPLE-ENSEMBLE GENETIC EVOLUTIONARY NETWORK MODEL", "track": "main", "status": "Reject", "tldr": "We introduce a new representation learning model, namely \u201cSample-Ensemble Genetic Evolutionary Network\u201d (SEGEN), which can serve as an alternative approach to deep learning models.", "abstract": "Deep learning, a rebranding of deep neural network research works, has achieved a remarkable success in recent years. With multiple hidden layers, deep learning models aim at computing the hierarchical feature representations of the observational data. Meanwhile, due to its severe disadvantages in data consumption, computational resources, parameter tuning costs and the lack of result explainability, deep learning has also suffered from lots of criticism. In this paper, we will introduce a new representation learning model, namely \u201cSample-Ensemble Genetic Evolutionary Network\u201d (SEGEN), which can serve as an alternative approach to deep learning models. Instead of building one single deep model, based on a set of sampled sub-instances, SEGEN adopts a genetic-evolutionary learning strategy to build a group of unit models generations by generations. 
The unit models incorporated in SEGEN can be either traditional machine learning models or the recent deep learning models with a much \u201cnarrower\u201d and \u201cshallower\u201d architecture. The learning results of each instance at the final generation will be effectively combined from each unit model via diffusive propagation and ensemble learning strategies. From the computational perspective, SEGEN requires far less data, fewer computational resources and parameter tuning efforts, but has sound theoretic interpretability of the learning process and results. Extensive experiments have been done on several different real-world benchmark datasets, and the experimental results obtained by SEGEN have demonstrated its advantages over the state-of-the-art representation learning models.", "keywords": "Genetic Evolutionary Network;Deep Learning;Genetic Algorithm;Ensemble Learning;Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Jiawei Zhang;Limeng Cui;Fisher B. Gouza", "authorids": "jiawei@ifmlab.org;lmcui932@163.com;fisherbgouza@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2019segen,\ntitle={{SEGEN}: {SAMPLE}-{ENSEMBLE} {GENETIC} {EVOLUTIONARY} {NETWORK} {MODEL}},\nauthor={Jiawei Zhang and Limeng Cui and Fisher B. Gouza},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgVisRqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJgVisRqtX", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;5;2", "wc_review": "707;349;49", "wc_reply_reviewers": "295;209;0", "wc_reply_authors": "659;784;179", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 368.3333333333333, 268.97500916545306 ], "wc_reply_reviewers_avg": [ 168.0, 123.8735914820696 ], "wc_reply_authors_avg": [ 540.6666666666666, 260.7787482820553 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8548909076461463161&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "On the loss landscape of a class of deep neural networks with no bad local valleys", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/766", "id": "HJgXsjA5tQ", "author_site": "Quynh Nguyen, Mahesh Chandra Mukkamala, Matthias Hein", "tldr": "", "abstract": "We identify a class of over-parameterized deep neural networks with standard activation functions and cross-entropy loss which provably have no bad local valley, in the sense that from any point in parameter space there exists a continuous path on which the cross-entropy loss is non-increasing and gets arbitrarily close to zero. 
This implies that these networks have no sub-optimal strict local minima.", "keywords": "loss landscape;local minima;deep neural networks", "primary_area": "", "supplementary_material": "", "author": "Quynh Nguyen;Mahesh Chandra Mukkamala;Matthias Hein", "authorids": "quynh@cs.uni-saarland.de;mmahesh.chandra873@gmail.com;matthias.hein@uni-tuebingen.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nnguyen2018on,\ntitle={On the loss landscape of a class of deep neural networks with no bad local valleys},\nauthor={Quynh Nguyen and Mahesh Chandra Mukkamala and Matthias Hein},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgXsjA5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;4;4", "wc_review": "788;502;359", "wc_reply_reviewers": "422;479;0", "wc_reply_authors": "2133;589;362", "reply_reviewers": "2;1;0", "reply_authors": "4;2;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 549.6666666666666, 178.35233543622454 ], "wc_reply_reviewers_avg": [ 300.3333333333333, 213.63884374232032 ], "wc_reply_authors_avg": [ 1028.0, 786.8295029208466 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11262228996628138485&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=HJgXsjA5tQ", "pdf": "https://openreview.net/pdf?id=HJgXsjA5tQ", "email": ";;", "author_num": 3 }, { "id": "HJgZrsC5t7", "title": "Improving On-policy Learning with Statistical Reward Accumulation", "track": "main", "status": "Reject", "tldr": "Improving On-policy Learning with Statistical Reward Accumulation", "abstract": "Deep reinforcement learning has obtained significant breakthroughs in recent years. Most methods in deep-RL achieve good results via the maximization of the reward signal provided by the environment, typically in the form of discounted cumulative returns. Such reward signals represent the immediate feedback of a particular action performed by an agent. However, tasks with sparse reward signals are still challenging to on-policy methods. In this paper, we introduce an effective characterization of past reward statistics (which can be seen as long-term feedback signals) to supplement this immediate reward feedback. In particular, value functions are learned with multi-critics supervision, enabling complex value functions to be more easily approximated in on-policy learning, even when the reward signals are sparse. We also introduce a novel exploration mechanism called ``hot-wiring'' that can give a boost to seemingly trapped agents. We demonstrate the effectiveness of our advantage actor multi-critic (A2MC) method across the discrete domains in Atari games as well as continuous domains in the MuJoCo environments. 
A video demo is provided at https://youtu.be/zBmpf3Yz8tc and source codes will be made available upon paper acceptance.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yubin Deng;Ke Yu;Dahua Lin;Xiaoou Tang;Chen Change Loy", "authorids": "dy015@ie.cuhk.edu.hk;yk017@ie.cuhk.edu.hk;dhlin@ie.cuhk.edu.hk;xtang@ie.cuhk.edu.hk;ccloy@ieee.org", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ndeng2019improving,\ntitle={Improving On-policy Learning with Statistical Reward Accumulation},\nauthor={Yubin Deng and Ke Yu and Dahua Lin and Xiaoou Tang and Chen Change Loy},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgZrsC5t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=HJgZrsC5t7", "pdf_size": 0, "rating": "4;5", "confidence": "3;3", "wc_review": "660;229", "wc_reply_reviewers": "0;0", "wc_reply_authors": "2011;663", "reply_reviewers": "0;0", "reply_authors": "3;1", "rating_avg": [ 4.5, 0.5 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 444.5, 215.5 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1337.0, 674.0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bsNqjuPqNG0J:scholar.google.com/&scioq=Improving+On-policy+Learning+with+Statistical+Reward+Accumulation&hl=en&as_sdt=0,5", "gs_version_total": 7 }, { "title": "DOM-Q-NET: Grounded RL on Structured Language", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/844", "id": "HJgd1nAqFX", "author_site": "Sheng Jia, Jamie Kiros, Jimmy Ba", "tldr": "Graph-based Deep Q Network for Web Navigation ", "abstract": "Building agents to interact with the web would allow for significant improvements in knowledge understanding and representation learning. However, web navigation tasks are difficult for current deep reinforcement learning (RL) models due to the large discrete action space and the varying number of actions between the states. In this work, we introduce DOM-Q-NET, a novel architecture for RL-based web navigation to address both of these problems. It parametrizes Q functions with separate networks for different action categories: clicking a DOM element and typing a string input. Our model utilizes a graph neural network to represent the tree-structured HTML of a standard web page. We demonstrate the capabilities of our model on the MiniWoB environment where we can match or outperform existing work without the use of expert demonstrations. Furthermore, we show 2x improvements in sample efficiency when training in the multi-task setting, allowing our model to transfer learned behaviours across tasks. 
", "keywords": "Reinforcement Learning;Web Navigation;Graph Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Sheng Jia;Jamie Ryan Kiros;Jimmy Ba", "authorids": "sheng.jia@mail.utoronto.ca;kirosjamie@gmail.com;jba@cs.utoronto.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\njia2018domqnet,\ntitle={{DOM}-Q-{NET}: Grounded {RL} on Structured Language},\nauthor={Sheng Jia and Jamie Ryan Kiros and Jimmy Ba},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgd1nAqFX},\n}", "github": "[![github](/images/github_icon.svg) Sheng-J/DOM-Q-NET](https://github.com/Sheng-J/DOM-Q-NET)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;1;3", "wc_review": "495;53;758", "wc_reply_reviewers": "49;0;646", "wc_reply_authors": "1255;91;1928", "reply_reviewers": "1;0;3", "reply_authors": "2;1;3", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 435.3333333333333, 290.8909685011818 ], "wc_reply_reviewers_avg": [ 231.66666666666666, 293.66004532830516 ], "wc_reply_authors_avg": [ 1091.3333333333333, 758.8290687338985 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10126688324952353090&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HJgd1nAqFX", "pdf": "https://openreview.net/pdf?id=HJgd1nAqFX", "email": ";;", "author_num": 3 }, { "title": "Boosting Robustness Certification of Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/818", "id": "HJgeEh09KQ", "author_site": "Gagandeep Singh, Timon Gehr, Markus P\u00fcschel, Martin Vechev", "tldr": "We refine the over-approximation results from incomplete verifiers using MILP solvers to prove more robustness properties than state-of-the-art. ", "abstract": "We present a novel approach for the certification of neural networks against adversarial perturbations which combines scalable overapproximation methods with precise (mixed integer) linear programming. 
This results in significantly better precision than state-of-the-art verifiers on challenging feedforward and convolutional neural networks with piecewise linear activation functions.", "keywords": "Robustness certification;Adversarial Attacks;Abstract Interpretation;MILP Solvers;Verification of Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Gagandeep Singh;Timon Gehr;Markus P\u00fcschel;Martin Vechev", "authorids": "gsingh@inf.ethz.ch;timon.gehr@inf.ethz.ch;pueschel@inf.ethz.ch;martin.vechev@inf.ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsingh2018robustness,\ntitle={Robustness Certification with Refinement},\nauthor={Gagandeep Singh and Timon Gehr and Markus P\u00fcschel and Martin Vechev},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgeEh09KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;3", "wc_review": "198;411;416", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "158;203;329", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 341.6666666666667, 101.60817989818646 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 230.0, 72.37402849088892 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 241, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10096304269347816524&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HJgeEh09KQ", "pdf": "https://openreview.net/pdf?id=HJgeEh09KQ", "email": ";;;", "author_num": 4 }, { "title": "Learning To Simulate", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/924", "id": "HJgkx2Aqt7", "author_site": "Nataniel Ruiz, Samuel Schulter, Manmohan Chandraker", "tldr": "We propose an algorithm that automatically adjusts parameters of a simulation engine to generate training data for a neural network such that validation accuracy is maximized.", "abstract": "Simulation is a useful tool in situations where training data for machine learning models is costly to annotate or even hard to acquire. In this work, we propose a reinforcement learning-based method for automatically adjusting the parameters of any (non-differentiable) simulator, thereby controlling the distribution of synthesized data in order to maximize the accuracy of a model trained on that data. In contrast to prior art that hand-crafts these simulation parameters or adjusts only parts of the available parameters, our approach fully controls the simulator with the actual underlying goal of maximizing accuracy, rather than mimicking the real data distribution or randomly generating a large volume of data. 
We find that our approach (i) quickly converges to the optimal simulation parameters in controlled experiments and (ii) can indeed discover good sets of parameters for an image rendering simulator in actual computer vision applications.", "keywords": "Simulation in machine learning;reinforcement learning;policy gradients;image rendering", "primary_area": "", "supplementary_material": "", "author": "Nataniel Ruiz;Samuel Schulter;Manmohan Chandraker", "authorids": "nruiz9@bu.edu;samuel@nec-labs.com;manu@nec-labs.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nruiz2018learning,\ntitle={Learning To Simulate},\nauthor={Nataniel Ruiz and Samuel Schulter and Manmohan Chandraker},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgkx2Aqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;4;4", "wc_review": "224;735;533", "wc_reply_reviewers": "0;0;173", "wc_reply_authors": "592;1123;722", "reply_reviewers": "0;0;1", "reply_authors": "1;2;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 497.3333333333333, 210.13381979639124 ], "wc_reply_reviewers_avg": [ 57.666666666666664, 81.55298209684848 ], "wc_reply_authors_avg": [ 812.3333333333334, 225.9945918723622 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 153, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18422438747696856662&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=HJgkx2Aqt7", "pdf": "https://openreview.net/pdf?id=HJgkx2Aqt7", "email": ";;", "author_num": 3 }, { "id": "HJglg2A9FX", "title": "Iteratively Learning from the Best", "track": "main", "status": "Reject", "tldr": "We propose a simple framework that addresses the problem of spurious data in both supervised and unsupervised settings.", "abstract": "We study a simple generic framework to address the issue of bad training data; both bad labels in supervised problems, and bad samples in unsupervised ones. Our approach starts by fitting a model to the whole training dataset, but then iteratively improves it by alternating between (a) revisiting the training data to select samples with lowest current loss, and (b) re-training the model on only these selected samples. It can be applied to any existing model training setting which provides a loss measure for samples, and a way to refit on new ones. We show the merit of this approach in both theory and practice. We first prove statistical consistency, and linear convergence to the ground truth and global optimum, for two simpler model settings: mixed linear regression, and Gaussian mixture models. We then demonstrate its success empirically in (a) saving the accuracy of existing deep image classifiers when there are errors in the labels of training images, and (b) improving the quality of samples generated by existing DC-GAN models, when it is given training data that contains a fraction of the images from a different and unintended dataset.
The experimental results show significant improvement over the baseline methods that ignore the existence of bad labels/samples. ", "keywords": "noisy samples;deep learning;generative adversarial network", "primary_area": "", "supplementary_material": "", "author": "Yanyao Shen;Sujay Sanghavi", "authorids": "shenyanyao@utexas.edu;sanghavi@mail.utexas.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nshen2019iteratively,\ntitle={Iteratively Learning from the Best},\nauthor={Yanyao Shen and Sujay Sanghavi},\nyear={2019},\nurl={https://openreview.net/forum?id=HJglg2A9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJglg2A9FX", "pdf_size": 0, "rating": "3;6;6", "confidence": "5;4;3", "wc_review": "417;331;545", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "830;698;801", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 431.0, 87.92420978699022 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 776.3333333333334, 56.64117073491881 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18077898258564094423&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HJguLo0cKQ", "title": "Strength in Numbers: Trading-off Robustness and Computation via Adversarially-Trained Ensembles", "track": "main", "status": "Reject", "tldr": "Adversarial training of ensembles provides robustness to adversarial examples beyond that observed in adversarially trained models and independently-trained ensembles thereof.", "abstract": "While deep learning has led to remarkable results on a number of challenging problems, researchers have discovered a vulnerability of neural networks in adversarial settings, where small but carefully chosen perturbations to the input can make the models produce extremely inaccurate outputs. This makes these models particularly unsuitable for safety-critical application domains (e.g. self-driving cars) where robustness is extremely important. Recent work has shown that augmenting training with adversarially generated data provides some degree of robustness against test-time attacks. In this paper we investigate how this approach scales as we increase the computational budget given to the defender. We show that increasing the number of parameters in adversarially-trained models increases their robustness, and in particular that ensembling smaller models while adversarially training the entire ensemble as a single model is a more efficient way of spending said budget than simply using a larger single model. 
Crucially, we show that it is the adversarial training of the ensemble, rather than the ensembling of adversarially trained models, which provides robustness.", "keywords": "adversarial examples;adversarial robustness;visualisation;ensembles", "primary_area": "", "supplementary_material": "", "author": "Edward Grefenstette;Robert Stanforth;Brendan O'Donoghue;Jonathan Uesato;Grzegorz Swirszcz;Pushmeet Kohli", "authorids": "etg@google.com;stanforth@google.com;bodonoghue@google.com;juesato@google.com;swirszcz@google.com;pushmeet@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ngrefenstette2019strength,\ntitle={Strength in Numbers: Trading-off Robustness and Computation via Adversarially-Trained Ensembles},\nauthor={Edward Grefenstette and Robert Stanforth and Brendan O'Donoghue and Jonathan Uesato and Grzegorz Swirszcz and Pushmeet Kohli},\nyear={2019},\nurl={https://openreview.net/forum?id=HJguLo0cKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJguLo0cKQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;3", "wc_review": "650;292;187", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1605;177;321", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 376.3333333333333, 198.20247784077327 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 701.0, 641.9221136555431 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16110935085178891541&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "HJgyAoRqFQ", "title": "State-Denoised Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "We propose a mechanism for denoising the internal state of an RNN to improve generalization performance.", "abstract": "Recurrent neural networks (RNNs) are difficult to train on sequence processing tasks, not only because input noise may be amplified through feedback, but also because any inaccuracy in the weights has similar consequences as input noise. We describe a method for denoising the hidden state during training to achieve more robust representations thereby improving generalization performance. Attractor dynamics are incorporated into the hidden state to `clean up' representations at each step of a sequence. The attractor dynamics are trained through an auxiliary denoising loss to recover previously experienced hidden states from noisy versions of those states. This state-denoised recurrent neural network (SDRNN) performs multiple steps of internal processing for each external sequence step. On a range of tasks, we show that the SDRNN outperforms a generic RNN as well as a variant of the SDRNN with attractor dynamics on the hidden state but without the auxiliary loss.
We argue that attractor dynamics---and corresponding connectivity constraints---are an essential component of the deep learning arsenal and should be invoked not only for recurrent networks but also for improving deep feedforward nets and intertask transfer.", "keywords": "recurrent nets;attractor nets;denoising;sequence processing", "primary_area": "", "supplementary_material": "", "author": "Michael C. Mozer;Denis Kazakov;Robert V. Lindsey", "authorids": "mozer@colorado.edu;denis.kazakov@colorado.edu;rob@imagen.ai", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmozer2019statedenoised,\ntitle={State-Denoised Recurrent Neural Networks},\nauthor={Michael C. Mozer and Denis Kazakov and Robert V. Lindsey},\nyear={2019},\nurl={https://openreview.net/forum?id=HJgyAoRqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJgyAoRqFQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;4", "wc_review": "408;450;364", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "325;315;218", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 407.3333333333333, 35.11251755270318 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 286.0, 48.25626038833372 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=541048990722753765&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "HJl0jiRqtX", "title": "EDDI: Efficient Dynamic Discovery of High-Value Information with Partial VAE", "track": "main", "status": "Reject", "tldr": "", "abstract": "Making decisions requires information relevant to the task at hand. Many real-life decision-making situations allow acquiring further relevant information at a specific cost. For example, in assessing the health status of a patient we may decide to take additional measurements such as diagnostic tests or imaging scans before making a final assessment. More information that is relevant allows for better decisions but it may be costly to acquire all of this information. How can we trade off the desire to make good decisions with the option to acquire further information at a cost? To this end, we propose a principled framework, named EDDI (Efficient Dynamic Discovery of high-value Information), based on the theory of Bayesian experimental design. In EDDI we propose a novel partial variational autoencoder (Partial VAE), to efficiently handle missing data over varying subsets of known information. EDDI combines this Partial VAE with an acquisition function that maximizes expected information gain on a set of target variables. EDDI is efficient and demonstrates that dynamic discovery of high-value information is possible; we show cost reduction at the same decision quality and improved decision quality at the same cost in benchmarks and in two health-care applications.. 
We believe there is great potential for realizing these gains in real-world decision support systems.", "keywords": "active variable selection;missing data;amortized inference", "primary_area": "", "supplementary_material": "", "author": "Chao Ma;Sebastian Tschiatschek;Konstantina Palla;Jose Miguel Hernandez Lobato;Sebastian Nowozin;Cheng Zhang", "authorids": "cm905@cam.ac.uk;sebastian.tschiatschek@microsoft.com;konstantina.palla@microsoft.com;jmh233@cam.ac.uk;sebastian.nowozin@microsoft.com;cheng.zhang@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nma2019eddi,\ntitle={{EDDI}: Efficient Dynamic Discovery of High-Value Information with Partial {VAE}},\nauthor={Chao Ma and Sebastian Tschiatschek and Konstantina Palla and Jose Miguel Hernandez Lobato and Sebastian Nowozin and Cheng Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=HJl0jiRqtX},\n}", "github": "[![github](/images/github_icon.svg) microsoft/EDDI](https://github.com/microsoft/EDDI)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJl0jiRqtX", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;2", "wc_review": "384;256;219", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1402;840;421", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 286.3333333333333, 70.69339117311856 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 887.6666666666666, 401.90739674153247 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 163, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7932877212524867960&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "HJl1ujCct7", "title": "A Multi-modal one-class generative adversarial network for anomaly detection in manufacturing", "track": "main", "status": "Reject", "tldr": "", "abstract": "One-class anomaly detection on high-dimensional data is one of the critical issues in both fundamental machine learning research and manufacturing applications. A good anomaly detection method should accurately discriminate anomalies from normal data. Although most previous anomaly detection methods achieve good performance, they do not perform well on high-dimensional imbalanced datasets with 1) a limited amount of data; 2) a multi-modal distribution; 3) few anomaly data. In this paper, we develop a multi-modal one-class generative adversarial network based detector (MMOC-GAN) to distinguish anomalies from normal data (products). Apart from a domain-specific feature extractor, our model leverages a generative adversarial network (GAN). The generator takes in a modified noise vector using a pseudo latent prior and generates samples at the low-density area of the given normal data to simulate the anomalies. The discriminator is then trained to distinguish the generated samples from the normal samples. Since the generated samples simulate the low-density area for each mode, the discriminator could directly detect anomalies from normal data.
Experiments demonstrate that our model outperforms the state-of-the-art one-class classification models and other anomaly detection methods on both normal data and anomalies accuracy, as well as the F1 score. Also, the generated samples can fully capture the low density area of different types of products.\n", "keywords": "Anomaly detection;one-class model;GAN", "primary_area": "", "supplementary_material": "", "author": "Shuhui Qu;Janghwan Lee;Wei Xiong;Wonhyouk Jang;Jie Wang", "authorids": "shuhuiq@stanford.edu;jake.ee@samsung.com;w.xiong@samsung.com;damian.jang@samsung.com;jiewang@stanford.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nqu2019a,\ntitle={A Multi-modal one-class generative adversarial network for anomaly detection in manufacturing},\nauthor={Shuhui Qu and Janghwan Lee and Wei Xiong and Wonhyouk Jang and Jie Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=HJl1ujCct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJl1ujCct7", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;4", "wc_review": "488;439;476", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 467.6666666666667, 20.853989759489405 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RQA4A2VbGnkJ:scholar.google.com/&scioq=A+Multi-modal+one-class+generative+adversarial+network+for+anomaly+detection+in+manufacturing&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HJl2Ns0qKX", "title": "Generative adversarial interpolative autoencoding: adversarial training on latent space interpolations encourages convex latent distributions", "track": "main", "status": "Reject", "tldr": "We designed an autoencoder which is trained to learn a convex latent distribution by using an adversarial loss function to discriminate latent space interpolations from real data. ", "abstract": "We present a neural network architecture based upon the Autoencoder (AE) and Generative Adversarial Network (GAN) that promotes a convex latent distribution by training adversarially on latent space interpolations. By using an AE as both the generator and discriminator of a GAN, we pass a pixel-wise error function across the discriminator, yielding an AE which produces sharp samples that match both high- and low-level features of the original images. 
Samples generated from interpolations between data in latent space remain within the distribution of real data as trained by the discriminator, and therefore preserve realistic resemblances to the network inputs.", "keywords": "convex;GAN;autoencoder;interpolation;stimuli generation;adversarial;latent distribution", "primary_area": "", "supplementary_material": "", "author": "Tim Sainburg;Marvin Thielk;Brad Thielman;Benjamin Migliori;Timothy Gentner", "authorids": "tsainbur@ucsd.edu;marvin.thielk@gmail.com;;ben.migliori@lanl.gov;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nsainburg2019generative,\ntitle={Generative adversarial interpolative autoencoding: adversarial training on latent space interpolations encourages convex latent distributions},\nauthor={Tim Sainburg and Marvin Thielk and Brad Thielman and Benjamin Migliori and Timothy Gentner},\nyear={2019},\nurl={https://openreview.net/forum?id=HJl2Ns0qKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJl2Ns0qKX", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;4;4", "wc_review": "672;1307;994", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1409;1561;1632", "reply_reviewers": "0;0;0", "reply_authors": "3;3;3", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 991.0, 259.2463435936304 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1534.0, 93.01971117277598 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18245039238742670788&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "HJlEUoR9Km", "title": "Improved resistance of neural networks to adversarial images through generative pre-training", "track": "main", "status": "Reject", "tldr": "Generative pre-training with mean field Boltzmann machines increases robustness against adversarial images in neural networks.", "abstract": "We train a feed forward neural network with increased robustness against adversarial attacks compared to conventional training approaches. This is achieved using a novel pre-trained building block based on a mean field description of a Boltzmann machine. On the MNIST dataset the method achieves strong adversarial resistance without data augmentation or adversarial training. 
We show that the increased adversarial resistance is correlated with the generative performance of the underlying Boltzmann machine.", "keywords": "adversarial images;Boltzmann machine;mean field approximation", "primary_area": "", "supplementary_material": "", "author": "Joachim Wabnig", "authorids": "joachim.wabnig@nokia-bell-labs.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nwabnig2019improved,\ntitle={Improved resistance of neural networks to adversarial images through generative pre-training},\nauthor={Joachim Wabnig},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlEUoR9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJlEUoR9Km", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;4;4", "wc_review": "292;359;386", "wc_reply_reviewers": "0;119;0", "wc_reply_authors": "299;708;268", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 345.6666666666667, 39.51652256405611 ], "wc_reply_reviewers_avg": [ 39.666666666666664, 56.09713797413277 ], "wc_reply_authors_avg": [ 425.0, 200.5110138288335 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:myDDkQo3jvMJ:scholar.google.com/&scioq=Improved+resistance+of+neural+networks+to+adversarial+images+through+generative+pre-training&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Towards Understanding Regularization in Batch Normalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/835", "id": "HJlLKjR9FQ", "author_site": "Ping Luo, Xinjiang Wang, wenqi shao, Zhanglin Peng", "tldr": "", "abstract": "Batch Normalization (BN) improves both convergence and generalization in training neural networks. This work understands these phenomena theoretically. We analyze BN by using a basic block of neural networks, consisting of a kernel layer, a BN layer, and a nonlinear activation function. This basic network helps us understand the impacts of BN in three aspects. First, by viewing BN as an implicit regularizer, BN can be decomposed into population normalization (PN) and gamma decay as an explicit regularization. Second, learning dynamics of BN and the regularization show that training converged with large maximum and effective learning rate. Third, generalization of BN is explored by using statistical mechanics. 
Experiments demonstrate that BN in convolutional neural networks share the same traits of regularization as the above analyses.", "keywords": "batch normalization;regularization;deep learning", "primary_area": "", "supplementary_material": "", "author": "Ping Luo;Xinjiang Wang;Wenqi Shao;Zhanglin Peng", "authorids": "pluo@ie.cuhk.edu.hk;wangxinjiang@sensetime.com;shaowenqi@sensetime.com;zhanglinpeng@sensetime.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nluo2018towards,\ntitle={Towards Understanding Regularization in Batch Normalization},\nauthor={Ping Luo and Xinjiang Wang and Wenqi Shao and Zhanglin Peng},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlLKjR9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;2;5", "wc_review": "278;185;340", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "479;521;512", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 267.6666666666667, 63.69894469734609 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 504.0, 18.05547008526779 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.18898223650461363, "gs_citation": 283, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7627324369776488890&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HJlLKjR9FQ", "pdf": "https://openreview.net/pdf?id=HJlLKjR9FQ", "email": ";;;", "author_num": 4 }, { "title": "The Laplacian in RL: Learning Representations with Efficient Approximations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1003", "id": "HJlNpoA5YQ", "author_site": "Yifan Wu, George Tucker, Ofir Nachum", "tldr": "We propose a scalable method to approximate the eigenvectors of the Laplacian in the reinforcement learning context and we show that the learned representations can improve the performance of an RL agent.", "abstract": "The smallest eigenvectors of the graph Laplacian are well-known to provide a succinct representation of the geometry of a weighted graph. In reinforcement learning (RL), where the weighted graph may be interpreted as the state transition process induced by a behavior policy acting on the environment, approximating the eigenvectors of the Laplacian provides a promising approach to state representation learning. However, existing methods for performing this approximation are ill-suited in general RL settings for two main reasons: First, they are computationally expensive, often requiring operations on large matrices. Second, these methods lack adequate justification beyond simple, tabular, finite-state settings. In this paper, we present a fully general and scalable method for approximating the eigenvectors of the Laplacian in a model-free RL context. We systematically evaluate our approach and empirically show that it generalizes beyond the tabular, finite-state setting. Even in tabular, finite-state settings, its ability to approximate the eigenvectors outperforms previous proposals. 
Finally, we show the potential benefits of using a Laplacian representation learned using our method in goal-achieving RL tasks, providing evidence that our technique can be used to significantly improve the performance of an RL agent.", "keywords": "Laplacian;reinforcement learning;representation", "primary_area": "", "supplementary_material": "", "author": "Yifan Wu;George Tucker;Ofir Nachum", "authorids": "yw4@andrew.cmu.edu;gjt@google.com;ofirnachum@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nwu2018the,\ntitle={The Laplacian in {RL}: Learning Representations with Efficient Approximations},\nauthor={Yifan Wu and George Tucker and Ofir Nachum},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlNpoA5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;3", "wc_review": "296;376;175", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "700;630;167", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 282.3333333333333, 82.62498949403194 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 499.0, 236.49242412108399 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 108, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5981331586225750792&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HJlNpoA5YQ", "pdf": "https://openreview.net/pdf?id=HJlNpoA5YQ", "email": ";;", "author_num": 3 }, { "id": "HJlPB2CqYQ", "title": "ISONETRY : GEOMETRY OF CRITICAL INITIALIZATIONS AND TRAINING", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent work on critical initializations of deep neural networks has shown that by constraining the spectrum of input-output Jacobians allows for fast training of very deep networks without skip connections. The current understanding of this class of initializations is limited with respect to classical notions from optimization. In particular, the connections between Jacobian eigenvalues and curvature of the parameter space are unknown. Similarly, there is no firm understanding of the effects of maintaining orthogonality during training. With this work we complement the existing understanding of critical initializations and show that the curvature is proportional to the maximum singular value of the Jacobian. 
Furthermore we show that optimization under orthogonality constraints ameliorates the dependence on choice of initial parameters, but is not strictly necessary.", "keywords": "Deep learning", "primary_area": "", "supplementary_material": "", "author": "Piotr A Sokol;Il Memming Park", "authorids": "piotr.sokol@stonybrook.edu;memming.park@stonybrook.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HJlPB2CqYQ", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:B5ez_-01wncJ:scholar.google.com/&scioq=ISONETRY+:+GEOMETRY+OF+CRITICAL+INITIALIZATIONS+AND+TRAINING&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Predicting the Generalization Gap in Deep Networks with Margin Distributions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/897", "id": "HJlQfnCqKX", "author_site": "YiDing Jiang, Dilip Krishnan, Hossein Mobahi, Samy Bengio", "tldr": "We develop a new scheme to predict the generalization gap in deep networks with high accuracy.", "abstract": "As shown in recent research, deep neural networks can perfectly fit randomly labeled data, but with very poor accuracy on held out data. This phenomenon indicates that loss functions such as cross-entropy are not a reliable indicator of generalization. This leads to the crucial question of how generalization gap should be predicted from the training data and network parameters. In this paper, we propose such a measure, and conduct extensive empirical studies on how well it can predict the generalization gap. Our measure is based on the concept of margin distribution, which are the distances of training points to the decision boundary. We find that it is necessary to use margin distributions at multiple layers of a deep network. On the CIFAR-10 and the CIFAR-100 datasets, our proposed measure correlates very strongly with the generalization gap. 
In addition, we find the following other factors to be of importance: normalizing margin values for scale independence, using characterizations of margin distribution rather than just the margin (closest distance to decision boundary), and working in log space instead of linear space (effectively using a product of margins rather than a sum).\nOur measure can be easily applied to feedforward deep networks with any architecture and may point towards new training loss functions that could enable better generalization.", "keywords": "Deep learning;large margin;generalization bounds;generalization gap.", "primary_area": "", "supplementary_material": "", "author": "Yiding Jiang;Dilip Krishnan;Hossein Mobahi;Samy Bengio", "authorids": "ydjiang@google.com;dilipkay@google.com;hmobahi@google.com;bengio@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\njiang2018predicting,\ntitle={Predicting the Generalization Gap in Deep Networks with Margin Distributions},\nauthor={Yiding Jiang and Dilip Krishnan and Hossein Mobahi and Samy Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlQfnCqKX},\n}", "github": "[![github](/images/github_icon.svg) google-research/google-research](https://github.com/google-research/google-research) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=HJlQfnCqKX)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;9", "confidence": "4;4;4", "wc_review": "239;422;283", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1174;774;467", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.666666666666667, 1.699673171197595 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 314.6666666666667, 77.99287716765478 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 805.0, 289.4627206855257 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 249, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13633337648471293543&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=HJlQfnCqKX", "pdf": "https://openreview.net/pdf?id=HJlQfnCqKX", "email": ";;;", "author_num": 4 }, { "id": "HJlWXhC5Km", "title": "Learning to Control Visual Abstractions for Structured Exploration in Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "structured exploration in deep reinforcement learning via unsupervised visual abstraction discovery and control", "abstract": "Exploration in environments with sparse rewards is a key challenge for reinforcement learning. How do we design agents with generic inductive biases so that they can explore in a consistent manner instead of just using local exploration schemes like epsilon-greedy? We propose an unsupervised reinforcement learning agent which learns a discrete pixel grouping model that preserves spatial geometry of the sensors and implicitly of the environment as well. 
We use this representation to derive geometric intrinsic reward functions, like centroid coordinates and area, and learn policies to control each one of them with off-policy learning. These policies form a basis set of behaviors (options) which allows us explore in a consistent way and use them in a hierarchical reinforcement learning setup to solve for extrinsically defined rewards. We show that our approach can scale to a variety of domains with competitive performance, including navigation in 3D environments and Atari games with sparse rewards.", "keywords": "exploration;deep reinforcement learning;intrinsic motivation;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "catalin ionescu;tejas kulkarni;aaron van de oord;andriy mnih;vlad mnih", "authorids": "cdi@google.com;tkulkarni@google.com;avdnoord@google.com;amnih@google.com;vmnih@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nionescu2019learning,\ntitle={Learning to Control Visual Abstractions for Structured Exploration in Deep Reinforcement Learning},\nauthor={catalin ionescu and tejas kulkarni and aaron van de oord and andriy mnih and vlad mnih},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlWXhC5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HJlWXhC5Km", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;3;3", "wc_review": "256;333;438", "wc_reply_reviewers": "0;0;44", "wc_reply_authors": "375;351;464", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 342.3333333333333, 74.59371436134698 ], "wc_reply_reviewers_avg": [ 14.666666666666666, 20.741798914805393 ], "wc_reply_authors_avg": [ 396.6666666666667, 48.60955553066587 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HJlY0jA5F7", "title": "Improving Sample-based Evaluation for Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "This paper improves existing sample-based evaluation for GANs and contains some insightful experiments.", "abstract": "In this paper, we propose an improved quantitative evaluation framework for Generative Adversarial Networks (GANs) on generating domain-specific images, where we improve conventional evaluation methods on two levels: the feature representation and the evaluation metric. Unlike most existing evaluation frameworks which transfer the representation of ImageNet inception model to map images onto the feature space, our framework uses a specialized encoder to acquire fine-grained domain-specific representation. Moreover, for datasets with multiple classes, we propose Class-Aware Frechet Distance (CAFD), which employs a Gaussian mixture model on the feature space to better fit the multi-manifold feature distribution. Experiments and analysis on both the feature level and the image level were conducted to demonstrate improvements of our proposed framework over the recently proposed state-of-the-art FID method. 
To the best of our knowledge, we are the first to provide counterexamples where FID gives results inconsistent with human judgments. It is shown in the experiments that our framework is able to overcome the shortcomings of FID and improve robustness. Code will be made available.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Shaohui Liu*;Yi Wei*;Jiwen Lu;Jie Zhou", "authorids": "b1ueber2y@gmail.com;wei-y15@mails.tsinghua.edu.cn;lujiwen@tsinghua.edu.cn;jzhou@tsinghua.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nliu*2019improving,\ntitle={Improving Sample-based Evaluation for Generative Adversarial Networks},\nauthor={Shaohui Liu* and Yi Wei* and Jiwen Lu and Jie Zhou},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlY0jA5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJlY0jA5F7", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;4;3", "wc_review": "407;1871;183", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 820.3333333333334, 748.5405058437979 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11008532348646687049&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJlYzhR9tm", "title": "Language Modeling with Graph Temporal Convolutional Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, there have been some attempts to use non-recurrent neural models for language modeling. \nHowever, a noticeable performance gap still remains. \nWe propose a non-recurrent neural language model, dubbed graph temporal convolutional network (GTCN), that relies on graph neural network blocks and convolution operations. While standard recurrent neural network language models encode sentences sequentially without modeling higher-level structural information, our model regards sentences as graphs and processes input words within a message propagation framework, aiming to learn better syntactic information by inferring skip-word connections. Specifically, the graph network blocks operate in parallel and learn the underlying graph structures in sentences without any additional annotation pertaining to structure knowledge.
Experiments demonstrate that the model without recurrence can achieve comparable perplexity results in language modeling tasks and successfully learn syntactic information.", "keywords": "Graph Neural Network;Language Modeling;Convolution", "primary_area": "", "supplementary_material": "", "author": "Hongyin Luo;Yichen Li;Jie Fu;James Glass", "authorids": "hyluo@mit.edu;yl3506@nyu.edu;jie.fu@polymtl.ca;glass@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nluo2019language,\ntitle={Language Modeling with Graph Temporal Convolutional Networks},\nauthor={Hongyin Luo and Yichen Li and Jie Fu and James Glass},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlYzhR9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJlYzhR9tm", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;3;5", "wc_review": "193;202;588", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "142;133;412", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 327.6666666666667, 184.12012986694916 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 229.0, 129.4526940623485 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14383832397601613305&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJldzhA5tQ", "title": "Learning powerful policies and better dynamics models by encouraging consistency", "track": "main", "status": "Reject", "tldr": "In this paper, we formulate a way to ensure consistency between the predictions of dynamics model and the real observations from the environment. Thus allowing the agent to learn powerful policies, as well as better dynamics models.", "abstract": "Model-based reinforcement learning approaches have the promise of being sample efficient. Much of the progress in learning dynamics models in RL has been made by learning models via supervised learning. There is enough evidence that humans build a model of the environment, not only by observing the environment but also by interacting with the environment. Interaction with the environment allows humans to carry out \"experiments\": taking actions that help uncover true causal relationships which can be used for building better dynamics models. Analogously, we would expect such interaction to be helpful for a learning agent while learning to model the environment dynamics. In this paper, we build upon this intuition, by using an auxiliary cost function to ensure consistency between what the agent observes (by acting in the real world) and what it imagines (by acting in the ``learned'' world). 
Our empirical analysis shows that the proposed approach helps to train powerful policies as well as better dynamics models.", "keywords": "model-based reinforcement learning;deep learning;generative agents;policy gradient;imitation learning", "primary_area": "", "supplementary_material": "", "author": "Shagun Sodhani;Anirudh Goyal;Tristan Deleu;Yoshua Bengio;Jian Tang", "authorids": "sshagunsodhani@gmail.com;anirudhgoyal9119@gmail.com;tristan.deleu@gmail.com;yoshua.bengio@mila.quebec;tangjianpku@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nsodhani2019learning,\ntitle={Learning powerful policies and better dynamics models by encouraging consistency},\nauthor={Shagun Sodhani and Anirudh Goyal and Tristan Deleu and Yoshua Bengio and Jian Tang},\nyear={2019},\nurl={https://openreview.net/forum?id=HJldzhA5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJldzhA5tQ", "pdf_size": 0, "rating": "2;3;5", "confidence": "4;5;3", "wc_review": "199;227;974", "wc_reply_reviewers": "0;0;441", "wc_reply_authors": "2402;2380;3841", "reply_reviewers": "0;0;1", "reply_authors": "6;5;7", "rating_avg": [ 3.3333333333333335, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 466.6666666666667, 358.9209136038498 ], "wc_reply_reviewers_avg": [ 147.0, 207.88939366884497 ], "wc_reply_authors_avg": [ 2874.3333333333335, 683.5955594420496 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 6.0, 0.816496580927726 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.6546536707079772, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:92NXU6B-xGgJ:scholar.google.com/&scioq=Learning+powerful+policies+and+better+dynamics+models+by+encouraging+consistency&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJlfAo09KX", "title": "Guaranteed Recovery of One-Hidden-Layer Neural Networks via Cross Entropy", "track": "main", "status": "Reject", "tldr": "We provide the first theoretical analysis of guaranteed recovery of one-hidden-layer neural networks under cross entropy loss for classification problems.", "abstract": "We study model recovery for data classification, where the training labels are generated from a one-hidden-layer fully -connected neural network with sigmoid activations, and the goal is to recover the weight vectors of the neural network. We prove that under Gaussian inputs, the empirical risk function using cross entropy exhibits strong convexity and smoothness uniformly in a local neighborhood of the ground truth, as soon as the sample complexity is sufficiently large. This implies that if initialized in this neighborhood, which can be achieved via the tensor method, gradient descent converges linearly to a critical point that is provably close to the ground truth without requiring a fresh set of samples at each iteration. 
To the best of our knowledge, this is the first global convergence guarantee established for the empirical risk minimization using cross entropy via gradient descent for learning one-hidden-layer neural networks, at the near-optimal sample and computational complexity with respect to the network input dimension.", "keywords": "cross entropy;neural networks;parameter recovery", "primary_area": "", "supplementary_material": "", "author": "Haoyu Fu;Yuejie Chi;Yingbin Liang", "authorids": "fu.436@osu.edu;yuejiechi@cmu.edu;liang.889@osu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfu2019guaranteed,\ntitle={Guaranteed Recovery of One-Hidden-Layer Neural Networks via Cross Entropy},\nauthor={Haoyu Fu and Yuejie Chi and Yingbin Liang},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlfAo09KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJlfAo09KX", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "wc_review": "350;327;170", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 282.3333333333333, 79.98472076312798 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7510262395262539854&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "title": "Adversarial Imitation via Variational Inverse Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1137", "id": "HJlmHoR5tQ", "author_site": "Ahmed Qureshi, Byron Boots, Michael C Yip", "tldr": "Our method introduces the empowerment-regularized maximum-entropy inverse reinforcement learning to learn near-optimal rewards and policies from expert demonstrations.", "abstract": "We consider a problem of learning the reward and policy from expert examples under unknown dynamics. Our proposed method builds on the framework of generative adversarial networks and introduces the empowerment-regularized maximum-entropy inverse reinforcement learning to learn near-optimal rewards and policies. Empowerment-based regularization prevents the policy from overfitting to expert demonstrations, which advantageously leads to more generalized behaviors that result in learning near-optimal rewards. Our method simultaneously learns empowerment through variational information maximization along with the reward and policy under the adversarial learning formulation. We evaluate our approach on various high-dimensional complex control tasks. We also test our learned rewards in challenging transfer learning problems where training and testing environments are made to be different from each other in terms of dynamics or structure. 
The results show that our proposed method not only learns near-optimal rewards and policies that are matching expert behavior but also performs significantly better than state-of-the-art inverse reinforcement learning algorithms.", "keywords": "Inverse Reinforcement Learning;Imitation learning;Variational lnference;Learning from demonstrations", "primary_area": "", "supplementary_material": "", "author": "Ahmed H. Qureshi;Byron Boots;Michael C. Yip", "authorids": "a1quresh@eng.ucsd.edu;bboots@cc.gatech.edu;yip@ucsd.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nqureshi2018adversarial,\ntitle={Adversarial Imitation via Variational Inverse Reinforcement Learning},\nauthor={Ahmed H. Qureshi and Byron Boots and Michael C. Yip},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlmHoR5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;4", "wc_review": "585;288;532", "wc_reply_reviewers": "551;0;10", "wc_reply_authors": "1059;274;798", "reply_reviewers": "3;0;1", "reply_authors": "3;1;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 468.3333333333333, 129.33762879464825 ], "wc_reply_reviewers_avg": [ 187.0, 257.4192429999488 ], "wc_reply_authors_avg": [ 710.3333333333334, 326.41520934063243 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17015599061555307750&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HJlmHoR5tQ", "pdf": "https://openreview.net/pdf?id=HJlmHoR5tQ", "email": ";;", "author_num": 3 }, { "id": "HJlmhs05tm", "title": "EnGAN: Latent Space MCMC and Maximum Entropy Generators for Energy-based Models", "track": "main", "status": "Reject", "tldr": "We introduced entropy maximization to GANs, leading to a reinterpretation of the critic as an energy function.", "abstract": "Unsupervised learning is about capturing dependencies between variables and is driven by the contrast between the probable vs improbable configurations of these variables, often either via a generative model which only samples probable ones or with an energy function (unnormalized log-density) which is low for probable ones and high for improbable ones. Here we consider learning both an energy function and an efficient approximate sampling mechanism for the corresponding distribution. Whereas the critic (or discriminator) in generative adversarial networks (GANs) learns to separate data and generator samples, introducing an entropy maximization regularizer on the generator can turn the interpretation of the critic into an energy function, which separates the training distribution from everything else, and thus can be used for tasks like anomaly or novelty detection. 
\n\nThis paper is motivated by the older idea of sampling in latent space rather than data space because running a Monte-Carlo Markov Chain (MCMC) in latent space has been found to be easier and more efficient, and because a GAN-like generator can convert latent space samples to data space samples. For this purpose, we show how a Markov chain can be run in latent space whose samples can be mapped to data space, producing better samples. These samples are also used for the negative phase gradient required to estimate the log-likelihood gradient of the data space energy function. To maximize entropy at the output of the generator, we take advantage of recently introduced neural estimators of mutual information. We find that in addition to producing a useful scoring function for anomaly detection, the resulting approach produces sharp samples (like GANs) while covering the modes well, leading to high Inception and Fr\u00e9chet scores.\n", "keywords": "Energy based model;Generative models;MCMC;GANs", "primary_area": "", "supplementary_material": "", "author": "Rithesh Kumar;Anirudh Goyal;Aaron Courville;Yoshua Bengio", "authorids": "ritheshkumar.95@gmail.com;anirudhgoyal9119@gmail.com;aaron.courville@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkumar2019engan,\ntitle={En{GAN}: Latent Space {MCMC} and Maximum Entropy Generators for Energy-based Models},\nauthor={Rithesh Kumar and Anirudh Goyal and Aaron Courville and Yoshua Bengio},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlmhs05tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJlmhs05tm", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;4", "wc_review": "566;279;334", "wc_reply_reviewers": "85;165;0", "wc_reply_authors": "1230;1551;808", "reply_reviewers": "1;2;0", "reply_authors": "4;4;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 393.0, 124.37309462527121 ], "wc_reply_reviewers_avg": [ 83.33333333333333, 67.37127643802579 ], "wc_reply_authors_avg": [ 1196.3333333333333, 304.2612181808403 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 3.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10690460793592404929&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HJlt7209Km", "title": "Theoretical and Empirical Study of Adversarial Examples", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many techniques are developed to defend against adversarial examples at scale. So far, the most successful defenses generate adversarial examples during each training step and add them to the training data. Yet, this brings significant computational overhead. In this paper, we investigate defenses against adversarial attacks. First, we propose feature smoothing, a simple data augmentation method with little computational overhead. Essentially, feature smoothing trains a neural network on virtual training data as an interpolation of features from a pair of samples, with the new label remaining the same as the dominant data point. 
The intuition behind feature smoothing is to generate virtual data points as close as possible to adversarial examples, and to avoid the computational burden of generating data during training. Our experiments on the MNIST and CIFAR10 datasets explore different combinations of known regularization and data augmentation methods and show that feature smoothing with logit squeezing performs best for both adversarial and clean accuracy. Second, we propose a unified framework to understand the connections and differences among different efficient methods by analyzing the biases and variances of the decision boundary. We show that under some symmetry assumptions, label smoothing, logit squeezing, weight decay, mixup and feature smoothing all produce an unbiased estimation of the decision boundary with smaller estimated variance. All of those methods except weight decay are also stable when the assumptions no longer hold.", "keywords": "Adversarial examples;Feature smoothing;Data augmentation;Decision boundary", "primary_area": "", "supplementary_material": "", "author": "Fuchen Liu;Hongwei Shang;Hong Zhang", "authorids": "fuchenl@andrew.cmu.edu;shanghongwei@oath.com;hongz@oath.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nliu2019theoretical,\ntitle={Theoretical and Empirical Study of Adversarial Examples},\nauthor={Fuchen Liu and Hongwei Shang and Hong Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=HJlt7209Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJlt7209Km", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;2", "wc_review": "523;284;346", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 384.3333333333333, 101.26642527950163 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tsHvadQ8B7wJ:scholar.google.com/&scioq=Theoretical+and+Empirical+Study+of+Adversarial+Examples&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJx38iC5KX", "title": "Domain Generalization via Invariant Representation under Domain-Class Dependency", "track": "main", "status": "Reject", "tldr": "Address the trade-off caused by the dependency of classes on domains in domain generalization", "abstract": "Learning domain-invariant representation is a dominant approach for domain generalization, where we need to build a classifier that is robust toward domain shifts induced by change of users, acoustic or lighting conditions, etc. However, prior domain-invariance-based methods overlooked the underlying dependency of classes (target variable) on source domains during optimization, which causes the trade-off between classification accuracy and domain-invariance, and often interferes with the domain generalization performance.
This study first provides the notion of domain generalization under domain-class dependency and elaborates on the importance of considering the dependency by expanding the analysis of Xie et al. (2017). We then propose a method, invariant feature learning under optimal classifier constraints (IFLOC), which explicitly considers the dependency and maintains accuracy while improving domain-invariance. Specifically, the proposed method regularizes the representation so that it has as much domain information as the class labels, unlike prior methods that remove all domain information. Empirical validations show the superior performance of IFLOC over baseline methods, supporting the importance of the domain-class dependency in domain generalization and the efficacy of the proposed method in overcoming the issue.", "keywords": "domain generalization;adversarial learning;invariant feature learning", "primary_area": "", "supplementary_material": "", "author": "Kei Akuzawa;Yusuke Iwasawa;Yutaka Matsuo", "authorids": "akuzawa-kei@weblab.t.u-tokyo.ac.jp;iwasawa@weblab.t.u-tokyo.ac.jp;matsuo@weblab.t.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nakuzawa2019domain,\ntitle={Domain Generalization via Invariant Representation under Domain-Class Dependency},\nauthor={Kei Akuzawa and Yusuke Iwasawa and Yutaka Matsuo},\nyear={2019},\nurl={https://openreview.net/forum?id=HJx38iC5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJx38iC5KX", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;5", "wc_review": "132;132;116", "wc_reply_reviewers": "0;0;46", "wc_reply_authors": "1261;1804;206", "reply_reviewers": "0;0;1", "reply_authors": "2;4;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 126.66666666666667, 7.542472332656507 ], "wc_reply_reviewers_avg": [ 15.333333333333334, 21.684607956387456 ], "wc_reply_authors_avg": [ 1090.3333333333333, 663.4487336804723 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.18898223650461357, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9811404337271671307&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HJx4KjRqYQ", "title": "Ergodic Measure Preserving Flows", "track": "main", "status": "Reject", "tldr": "A novel computationally scalable inference framework for training deep generative models and general statistical inference.", "abstract": "Training probabilistic models with neural network components is intractable in most cases and requires the use of approximations such as Markov chain Monte Carlo (MCMC), which is not scalable and requires significant hyper-parameter tuning, or mean-field variational inference (VI), which is biased. While there have been attempts at combining both approaches, the resulting methods have some important limitations in theory and in practice. As an alternative, we propose a novel method which is scalable, like mean-field VI, and, due to its theoretical foundation in ergodic theory, is also asymptotically accurate, like MCMC.
We test our method on popular benchmark problems with deep generative models and Bayesian neural networks. Our results show that we can outperform existing approximate inference methods.", "keywords": "Markov chain Monte Carlo;variational inference;deep generative models", "primary_area": "", "supplementary_material": "", "author": "Yichuan Zhang;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato;Zoubin Ghahramani", "authorids": "yichuan.zhang@eng.cam.ac.uk;jmh233@cam.ac.uk;zoubin@eng.cam.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2019ergodic,\ntitle={Ergodic Measure Preserving Flows},\nauthor={Yichuan Zhang and Jos\u00e9 Miguel Hern\u00e1ndez-Lobato and Zoubin Ghahramani},\nyear={2019},\nurl={https://openreview.net/forum?id=HJx4KjRqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJx4KjRqYQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;3;4", "wc_review": "637;455;395", "wc_reply_reviewers": "12;111;0", "wc_reply_authors": "1090;1229;383", "reply_reviewers": "1;1;0", "reply_authors": "4;3;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 495.6666666666667, 102.89584809677967 ], "wc_reply_reviewers_avg": [ 41.0, 49.73932046178355 ], "wc_reply_authors_avg": [ 900.6666666666666, 370.4180821120313 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3864041861557255213&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "On Random Deep Weight-Tied Autoencoders: Exact Asymptotic Analysis, Phase Transitions, and Implications to Training", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/949", "id": "HJx54i05tX", "author_site": "Ping Li, Phan-Minh Nguyen", "tldr": "We study the behavior of weight-tied multilayer vanilla autoencoders under the assumption of random weights. Via an exact characterization in the limit of large dimensions, our analysis reveals interesting phase transition phenomena.", "abstract": "We study the behavior of weight-tied multilayer vanilla autoencoders under the assumption of random weights. Via an exact characterization in the limit of large dimensions, our analysis reveals interesting phase transition phenomena when the depth becomes large. This, in particular, provides quantitative answers and insights to three questions that were not yet fully understood in the literature. Firstly, we provide a precise answer on how the random deep weight-tied autoencoder model performs \u201capproximate inference\u201d as posed by Scellier et al. (2018), and its connection to reversibility considered by several theoretical studies. Secondly, we show that deep autoencoders display a higher degree of sensitivity to perturbations in the parameters, distinct from the shallow counterparts.
Thirdly, we obtain insights on pitfalls in training initialization practice, and demonstrate experimentally that it is possible to train a deep autoencoder, even with the tanh activation and a depth as large as 200 layers, without resorting to techniques such as layer-wise pre-training or batch normalization. Our analysis is not specific to any depths or any Lipschitz activations, and our analytical techniques may have broader applicability.", "keywords": "Random Deep Autoencoders;Exact Asymptotic Analysis;Phase Transitions", "primary_area": "", "supplementary_material": "", "author": "Ping Li;Phan-Minh Nguyen", "authorids": "pingli98@gmail.com;npminh@stanford.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nli2018on,\ntitle={On Random Deep Weight-Tied Autoencoders: Exact Asymptotic Analysis, Phase Transitions, and Implications to Training},\nauthor={Ping Li and Phan-Minh Nguyen},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJx54i05tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "8;8;9", "confidence": "4;4;4", "wc_review": "605;481;202", "wc_reply_reviewers": "0;25;0", "wc_reply_authors": "955;1201;87", "reply_reviewers": "0;1;0", "reply_authors": "2;3;1", "rating_avg": [ 8.333333333333334, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 429.3333333333333, 168.53156644643428 ], "wc_reply_reviewers_avg": [ 8.333333333333334, 11.785113019775793 ], "wc_reply_authors_avg": [ 747.6666666666666, 477.8349319819787 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4653790676312565945&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HJx54i05tX", "pdf": "https://openreview.net/pdf?id=HJx54i05tX", "email": ";", "author_num": 2 }, { "id": "HJx7l309Fm", "title": "Actor-Attention-Critic for Multi-Agent Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We propose an approach to learn decentralized policies in multi-agent settings using attention-based critics and demonstrate promising results in environments with complex interactions.", "abstract": "Reinforcement learning in multi-agent scenarios is important for real-world applications but presents challenges beyond those seen in single-agent settings. We present an actor-critic algorithm that trains decentralized policies in multi-agent settings, using centrally computed critics that share an attention mechanism which selects relevant information for each agent at every timestep. This attention mechanism enables more effective and scalable learning in complex multi-agent environments, when compared to recent approaches. Our approach is applicable not only to cooperative settings with shared rewards, but also individualized reward settings, including adversarial settings, and it makes no assumptions about the action spaces of the agents. 
As such, it is flexible enough to be applied to most multi-agent learning problems", "keywords": "multi-agent;reinforcement learning;attention;actor-critic", "primary_area": "", "supplementary_material": "", "author": "Shariq Iqbal;Fei Sha", "authorids": "shariqiqbal2810@gmail.com;feisha.work@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\niqbal2019actorattentioncritic,\ntitle={Actor-Attention-Critic for Multi-Agent Reinforcement Learning},\nauthor={Shariq Iqbal and Fei Sha},\nyear={2019},\nurl={https://openreview.net/forum?id=HJx7l309Fm},\n}", "github": "[![github](/images/github_icon.svg) shariqiqbal2810/MAAC](https://github.com/shariqiqbal2810/MAAC) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HJx7l309Fm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJx7l309Fm", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;3;3", "wc_review": "232;174;396", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "481;307;234", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 267.3333333333333, 94.01181958787002 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 340.6666666666667, 103.609308891088 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9449111825230683, "gs_citation": 1029, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=241844530313281803&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "title": "Reasoning About Physical Interactions with Object-Oriented Prediction and Planning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/727", "id": "HJx9EhC9tQ", "author_site": "Michael Janner, Sergey Levine, William Freeman, Joshua B Tenenbaum, Chelsea Finn, Jiajun Wu", "tldr": "We present a framework for learning object-centric representations suitable for planning in tasks that require an understanding of physics.", "abstract": "Object-based factorizations provide a useful level of abstraction for interacting with the world. Building explicit object representations, however, often requires supervisory signals that are difficult to obtain in practice. We present a paradigm for learning object-centric representations for physical scene understanding without direct supervision of object properties. Our model, Object-Oriented Prediction and Planning (O2P2), jointly learns a perception function to map from image observations to object representations, a pairwise physics interaction function to predict the time evolution of a collection of objects, and a rendering function to map objects back to pixels. For evaluation, we consider not only the accuracy of the physical predictions of the model, but also its utility for downstream tasks that require an actionable representation of intuitive physics. 
After training our model on an image prediction task, we can use its learned representations to build block towers more complicated than those observed during training.", "keywords": "structured scene representation;predictive models;intuitive physics;self-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Michael Janner;Sergey Levine;William T. Freeman;Joshua B. Tenenbaum;Chelsea Finn;Jiajun Wu", "authorids": "janner@berkeley.edu;svlevine@eecs.berkeley.edu;billf@mit.edu;jbt@mit.edu;cbfinn@eecs.berkeley.edu;jiajunwu@mit.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\njanner2018reasoning,\ntitle={Reasoning About Physical Interactions with Object-Centric Models},\nauthor={Michael Janner and Sergey Levine and William T. Freeman and Joshua B. Tenenbaum and Chelsea Finn and Jiajun Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJx9EhC9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;7;9", "confidence": "5;4;4", "wc_review": "701;694;652", "wc_reply_reviewers": "38;0;0", "wc_reply_authors": "565;415;334", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 1.632993161855452 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 682.3333333333334, 21.63844315615664 ], "wc_reply_reviewers_avg": [ 12.666666666666666, 17.913371790059205 ], "wc_reply_authors_avg": [ 438.0, 95.69743988216194 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 145, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15432976583597993232&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HJx9EhC9tQ", "pdf": "https://openreview.net/pdf?id=HJx9EhC9tQ", "email": ";;;;;", "author_num": 6 }, { "title": "LayoutGAN: Generating Graphic Layouts with Wireframe Discriminators", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/701", "id": "HJxB5sRcFQ", "author_site": "Jianan Li, Jimei Yang, Aaron Hertzmann, Jianming Zhang, Tingfa Xu", "tldr": "", "abstract": "Layout is important for graphic design and scene generation. We propose a novel Generative Adversarial Network, called LayoutGAN, that synthesizes layouts by modeling geometric relations of different types of 2D elements. The generator of LayoutGAN takes as input a set of randomly-placed 2D graphic elements and uses self-attention modules to refine their labels and geometric parameters jointly to produce a realistic layout. Accurate alignment is critical for good layouts. We thus propose a novel differentiable wireframe rendering layer that maps the generated layout to a wireframe image, upon which a CNN-based discriminator is used to optimize the layouts in image space. 
We validate the effectiveness of LayoutGAN in various experiments including MNIST digit generation, document layout generation, clipart abstract scene generation and tangram graphic design.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianan Li;Jimei Yang;Aaron Hertzmann;Jianming Zhang;Tingfa Xu", "authorids": "lijianan15@gmail.com;jimyang@adobe.com;hertzman@adobe.com;jianmzha@adobe.com;ciom_xtf1@bit.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nli2018layoutgan,\ntitle={Layout{GAN}: Generating Graphic Layouts with Wireframe Discriminator},\nauthor={Jianan Li and Tingfa Xu and Jianming Zhang and Aaron Hertzmann and Jimei Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxB5sRcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;4", "wc_review": "146;581;164", "wc_reply_reviewers": "13;126;0", "wc_reply_authors": "76;1098;57", "reply_reviewers": "1;1;0", "reply_authors": "1;2;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 297.0, 200.9527307602462 ], "wc_reply_reviewers_avg": [ 46.333333333333336, 56.58229012293118 ], "wc_reply_authors_avg": [ 410.3333333333333, 486.315626819547 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 262, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=79761969946922777&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HJxB5sRcFQ", "pdf": "https://openreview.net/pdf?id=HJxB5sRcFQ", "email": ";;;;", "author_num": 5 }, { "id": "HJxFrs09YQ", "title": "GENERALIZED ADAPTIVE MOMENT ESTIMATION", "track": "main", "status": "Reject", "tldr": "A new adaptive gradient method is proposed for effectively training deep neural networks", "abstract": "Adaptive gradient methods have experienced great success in training deep neural networks (DNNs). The basic idea of the methods is to track and properly make use of the first and/or second moments of the gradient for model-parameter updates over iterations for the purpose of removing the need for manual interference. In this work, we propose a new adaptive gradient method, referred to as generalized adaptive moment estimation (Game). From a high level perspective, the new method introduces two more parameters w.r.t. AMSGrad (S. J. Reddi & Kumar (2018)) and one more parameter w.r.t. PAdam (Chen & Gu (2018)) to enlarge the parameter- selection space for performance enhancement while reducing the memory cost per iteration compared to AMSGrad and PAdam. The saved memory space amounts to the number of model parameters, which is significant for large-scale DNNs. Our motivation for introducing additional parameters in Game is to provide algorithmic flexibility to facilitate a reduction of the performance gap between training and validation datasets when training a DNN. Convergence analysis is provided for applying Game to solve both convex optimization and smooth nonconvex optmization. 
Empirical studies for training four convolutional neural networks over MNIST and CIFAR10 show that under proper parameter selection, Game produces promising validation performance as compared to AMSGrad and PAdam.", "keywords": "adaptive moment estimation;SGD;AMSGrad", "primary_area": "", "supplementary_material": "", "author": "Guoqiang Zhang;Kenta Niwa;W. Bastiaan Kleijn", "authorids": "guoqiang.zhang@uts.edu.au;niwa.kenta@lab.ntt.co.jp;bastiaan.kleijn@ecs.vuw.ac.nz", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2019generalized,\ntitle={{GENERALIZED} {ADAPTIVE} {MOMENT} {ESTIMATION}},\nauthor={Guoqiang Zhang and Kenta Niwa and W. Bastiaan Kleijn},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxFrs09YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJxFrs09YQ", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;3;4", "wc_review": "720;149;99", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "167;194;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 4.666666666666667, 1.699673171197595 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 322.6666666666667, 281.697631434053 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 120.33333333333333, 85.79950789809668 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.2773500981126145, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HJxKajC5t7", "title": "Self-Binarizing Networks", "track": "main", "status": "Withdraw", "tldr": "A method to binarize both weights and activations of a deep neural network that is efficient in computation and memory usage and performs better than the state-of-the-art.", "abstract": "We present a method to train self-binarizing neural networks, that is, networks that evolve their weights and activations during training to become binary. To obtain similar binary networks, existing methods rely on the sign activation function. This function, however, has no gradients for non-zero values, which makes standard backpropagation impossible. To circumvent the difficulty of training a network relying on the sign activation function, these methods alternate between floating-point and binary representations of the network during training, which is sub-optimal and inefficient. We approach the binarization task by training on a unique representation involving a smooth activation function, which is iteratively sharpened during training until it becomes a binary representation equivalent to the sign activation function. Additionally, we introduce a new technique to perform binary batch normalization that simplifies the conventional batch normalization by transforming it into a simple comparison operation. This is unlike existing methods, which are forced to the retain the conventional floating-point-based batch normalization. 
Our binary networks, apart from displaying advantages of lower memory and computation as compared to conventional floating-point and binary networks, also show higher classification accuracy than existing state-of-the-art methods on multiple benchmark datasets.", "keywords": "Binarization;Convolutional Neural Networks;Deep Learning;Deep Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Fayez Lahoud;Radhakrishna Achanta;Pablo M\u00e1rquez-Neila;Sabine S\u00fcsstrunk", "authorids": "fayez.lahoud@epfl.ch;radhakrishna.achanta@epfl.ch;pablo.marquez@artorg.unibe.ch;sabine.susstrunk@epfl.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HJxKajC5t7", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;4", "wc_review": "214;567;334", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 371.6666666666667, 146.55222808117097 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13768139742440641103&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "HJxUX2C9Ym", "title": "Iterative Binary Decisions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "The complexity of functions a neural network approximates make it hard to explain what the classification decision is based on. In this work, we present a framework that exposes more information about this decision-making process. Instead of producing a classification in a single step, our model iteratively makes binary sub-decisions which, when combined as a whole, ultimately produces the same classification result while revealing a decision tree as thought process. While there is generally a trade-off between interpretability and accuracy, the insights our model generates come at a negligible loss in accuracy. 
The decision tree resulting from the sequence of binary decisions of our model reveal a hierarchical clustering of the data and can be used as learned attributes in zero-shot learning.", "keywords": "explainable AI;interpretability;deep learning;decision tree;zero-shot learning", "primary_area": "", "supplementary_material": "", "author": "Stephan Alaniz;Zeynep Akata", "authorids": "s.alaniz@uva.nl;z.akata@uva.nl", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJxUX2C9Ym", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;4;4", "wc_review": "156;526;305", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 329.0, 152.00219296663673 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Zu0m1A9jzXEJ:scholar.google.com/&scioq=Iterative+Binary+Decisions&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HJxXynC9t7", "title": "Expressiveness in Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Representation learning in reinforcement learning (RL) algorithms focuses on extracting useful features for choosing good actions. Expressive representations are essential for learning well-performed policies. In this paper, we study the relationship between the state representation assigned by the state extractor and the performance of the RL agent. We observe that representations assigned by the better state extractor are more scattered than which assigned by the worse one. Moreover, RL agents achieving high performances always have high rank matrices which are composed by their representations. Based on our observations, we formally define expressiveness of the state extractor as the rank of the matrix composed by representations. Therefore, we propose to promote expressiveness so as to improve algorithm performances, and we call it Expressiveness Promoted DRL. 
We apply our method on both policy gradient and value-based algorithms, and experimental results on 55 Atari games show the superiority of our proposed method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xufang Luo;Qi Meng;Di He;Wei Chen;Yunhong Wang;Tie-Yan Liu", "authorids": "luoxufang@buaa.edu.cn;meq@microsoft.com;dihe@microsoft.com;wche@microsoft.com;yhwang@buaa.edu.cn;tyliu@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nluo2019expressiveness,\ntitle={Expressiveness in Deep Reinforcement Learning},\nauthor={Xufang Luo and Qi Meng and Di He and Wei Chen and Yunhong Wang and Tie-Yan Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxXynC9t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJxXynC9t7", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;3;4", "wc_review": "286;611;515", "wc_reply_reviewers": "125;205;71", "wc_reply_authors": "262;503;273", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 470.6666666666667, 136.33374083068685 ], "wc_reply_reviewers_avg": [ 133.66666666666666, 55.04745427558138 ], "wc_reply_authors_avg": [ 346.0, 111.10655546216283 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EPU0tDdI8TAJ:scholar.google.com/&scioq=Expressiveness+in+Deep+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HJxYwiC5tm", "title": "Why do deep convolutional networks generalize so poorly to small image transformations?", "track": "main", "status": "Reject", "tldr": "Modern deep CNNs are not invariant to translations, scalings and other realistic image transformations, and this lack of invariance is related to the subsampling operation and the biases contained in image datasets.", "abstract": "Deep convolutional network architectures are often assumed to guarantee generalization for small image translations and deformations. In this paper we show that modern CNNs (VGG16, ResNet50, and InceptionResNetV2) can drastically change their output when an image is translated in the image plane by a few pixels, and that this failure of generalization also happens with other realistic small image transformations. Furthermore, we see these failures to generalize more frequently in more modern networks. We show that these failures are related to the fact that the architecture of modern CNNs ignores the classical sampling theorem so that generalization is not guaranteed. We also show that biases in the statistics of commonly used image datasets makes it unlikely that CNNs will learn to be invariant to these transformations. 
Taken together our results suggest that the performance of CNNs in object recognition falls far short of the generalization capabilities of humans.", "keywords": "Convolutional neural networks;The sampling theorem;Sensitivity to small image transformations;Dataset bias;Shiftability", "primary_area": "", "supplementary_material": "", "author": "Aharon Azulay;Yair Weiss", "authorids": "aharon.azulay@mail.huji.ac.il;yweiss@cs.huji.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nazulay2019why,\ntitle={Why do deep convolutional networks generalize so poorly to small image transformations?},\nauthor={Aharon Azulay and Yair Weiss},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxYwiC5tm},\n}", "github": "[![github](/images/github_icon.svg) AzulEye/CNN-Failures](https://github.com/AzulEye/CNN-Failures) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=HJxYwiC5tm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJxYwiC5tm", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;5", "wc_review": "926;260;449", "wc_reply_reviewers": "0;0;4", "wc_reply_authors": "371;89;681", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 545.0, 280.2391835557619 ], "wc_reply_reviewers_avg": [ 1.3333333333333333, 1.8856180831641267 ], "wc_reply_authors_avg": [ 380.3333333333333, 241.7730800197206 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 711, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14028706583324396842&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "HJxdAoCcYX", "title": "Characterizing Malicious Edges targeting on Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks on graph structured data have shown increasing success in various applications. However, due to recent studies about vulnerabilities of machine learning models, researchers are encouraged to explore the robustness of graph neural networks (GNNs). So far there are two work targeting to attack GNNs by adding/deleting edges to fool graph based classification tasks. Such attacks are challenging to be detected since the manipulation is very subtle compared with traditional graph attacks. In this paper we propose the first detection mechanism against these two proposed attacks. Given a perturbed graph, we propose a novel graph generation method together with link prediction as preprocessing to detect potential malicious edges. We also propose novel features which can be leveraged to perform outlier detection when the number of added malicious edges are large. Different detection components are proposed and tested, and we also evaluate the performance of final detection pipeline. Extensive experiments are conducted to show that the proposed detection mechanism can achieve AUC above 90% against the two attack strategies on both Cora and Citeseer datasets. We also provide in-depth analysis of different attack strategies and corresponding suitable detection methods. 
Our results shed light on several principles for detecting different types of attacks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaojun Xu;Yue Yu;Bo Li;Le Song;Chengfeng Liu;Carl Gunter", "authorids": "xuxiaojun1005@gmail.com;yue9yu@gmail.com;lxbosky@gmail.com;lsong@cc.gatech.edu;windsonliu@tencent.com;cgunter@illinois.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nxu2019characterizing,\ntitle={Characterizing Malicious Edges targeting on Graph Neural Networks},\nauthor={Xiaojun Xu and Yue Yu and Bo Li and Le Song and Chengfeng Liu and Carl Gunter},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxdAoCcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJxdAoCcYX", "pdf_size": 0, "rating": "5;5;5", "confidence": "3;3;5", "wc_review": "333;365;107", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "603;919;219", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 268.3333333333333, 114.82547142898605 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 580.3333333333334, 286.22291235251515 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15924401392741752597&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Learning Mixed-Curvature Representations in Product Spaces", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/848", "id": "HJxeWnCcF7", "author_site": "Albert Gu, Frederic Sala, Beliz Gunel, Christopher Re", "tldr": "Product manifold embedding spaces with heterogenous curvature yield improved representations compared to traditional embedding spaces for a variety of structures.", "abstract": "The quality of the representations achieved by embeddings is determined by how well the geometry of the embedding space matches the structure of the data.\nEuclidean space has been the workhorse for embeddings; recently hyperbolic and spherical spaces have gained popularity due to their ability to better embed new types of structured data---such as hierarchical data---but most data is not structured so uniformly.\nWe address this problem by proposing learning embeddings in a product manifold combining multiple copies of these model spaces (spherical, hyperbolic, Euclidean), providing a space of heterogeneous curvature suitable for a wide variety of structures.\nWe introduce a heuristic to estimate the sectional curvature of graph data and directly determine an appropriate signature---the number of component spaces and their dimensions---of the product manifold.\nEmpirically, we jointly learn the curvature and the embedding in the product space via Riemannian optimization.\nWe discuss how to define and compute intrinsic quantities such as means---a challenging notion for product manifolds---and provably learnable optimization functions.\nOn a range of datasets and reconstruction tasks, our product space embeddings outperform single Euclidean or hyperbolic spaces used in previous works, reducing distortion by 
32.55% on a Facebook social network dataset. We learn word embeddings and find that a product of hyperbolic spaces in 50 dimensions consistently improves on baseline Euclidean and hyperbolic embeddings, by 2.6\npoints in Spearman rank correlation on similarity tasks\nand 3.4 points on analogy accuracy.\n", "keywords": "embeddings;non-Euclidean geometry;manifolds;geometry of data", "primary_area": "", "supplementary_material": "", "author": "Albert Gu;Frederic Sala;Beliz Gunel;Christopher R\u00e9", "authorids": "albertgu@stanford.edu;fredsala@stanford.edu;bgunel@stanford.edu;chrismre@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ngu2018learning,\ntitle={Learning Mixed-Curvature Representations in Product Spaces},\nauthor={Albert Gu and Frederic Sala and Beliz Gunel and Christopher R\u00e9},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxeWnCcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;5;2", "wc_review": "413;1021;61", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "624;2362;9", "reply_reviewers": "0;0;0", "reply_authors": "1;5;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 498.3333333333333, 396.5361129694103 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 998.3333333333334, 996.4089967924259 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 270, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1341296966114816513&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HJxeWnCcF7", "pdf": "https://openreview.net/pdf?id=HJxeWnCcF7", "email": ";;;", "author_num": 4 }, { "id": "HJxfm2CqKm", "title": "Discovering General-Purpose Active Learning Strategies", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a general-purpose approach to discovering active learning (AL) strategies from data. These strategies are transferable from one domain to another and can be used in conjunction with many machine learning models. To this end, we formalize the annotation process as a Markov decision process, design universal state and action spaces and introduce a new reward function that precisely reflects the AL objective of minimizing the annotation cost We seek to find an optimal (non-myopic) AL strategy using reinforcement learning. 
We evaluate the learned strategies on multiple unrelated domains and show that they consistently outperform state-of-the-art baselines.", "keywords": "active learning;meta learning;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Ksenia Konyushkova;Raphael Sznitman;Pascal Fua", "authorids": "ksenia.konyushkova@epfl.ch;raphael.sznitman@artorg.unibe.ch;pascal.fua@epfl.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkonyushkova2019discovering,\ntitle={Discovering General-Purpose Active Learning Strategies},\nauthor={Ksenia Konyushkova and Raphael Sznitman and Pascal Fua},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxfm2CqKm},\n}", "github": "[![github](/images/github_icon.svg) ksenia-konyushkova/LAL-RL](https://github.com/ksenia-konyushkova/LAL-RL)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=HJxfm2CqKm", "pdf_size": 0, "rating": "4;4;4;5", "confidence": "4;5;5;4", "wc_review": "440;505;146;808", "wc_reply_reviewers": "0;178;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;1;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 4.25, 0.4330127018922193 ], "confidence_avg": [ 4.5, 0.5 ], "wc_review_avg": [ 474.75, 235.1886211108012 ], "wc_reply_reviewers_avg": [ 44.5, 77.07626093681505 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5773502691896257, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2995248239191029539&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HJxpDiC5tX", "title": "Large-Scale Visual Speech Recognition", "track": "main", "status": "Reject", "tldr": "This work presents a scalable solution to continuous visual speech recognition.", "abstract": "This work presents a scalable solution to continuous visual speech recognition. To achieve this, we constructed the largest existing visual speech recognition dataset, consisting of pairs of text and video clips of faces speaking (3,886 hours of video). In tandem, we designed and trained an integrated lipreading system, consisting of a video processing pipeline that maps raw video to stable videos of lips and sequences of phonemes, a scalable deep neural network that maps the lip videos to sequences of phoneme distributions, and a production-level speech decoder that outputs sequences of words. The proposed system achieves a word error rate (WER) of 40.9% as measured on a held-out set. In comparison, professional lipreaders achieve either 86.4% or 92.9% WER on the same dataset when having access to additional types of contextual information. Our approach significantly improves on previous lipreading approaches, including variants of LipNet and of Watch, Attend, and Spell (WAS), which are only capable of 89.8% and 76.8% WER respectively.", "keywords": "visual speech recognition;speech recognition;lipreading", "primary_area": "", "supplementary_material": "", "author": "Brendan Shillingford;Yannis Assael;Matthew W. 
Hoffman;Thomas Paine;C\u00edan Hughes;Utsav Prabhu;Hank Liao;Hasim Sak;Kanishka Rao;Lorrayne Bennett;Marie Mulville;Ben Coppin;Ben Laurie;Andrew Senior;Nando de Freitas", "authorids": "shillingford@google.com;assael@google.com;mwhoffman@google.com;tpaine@google.com;cianh@google.com;utsavprabhu@google.com;hankliao@google.com;hasim@google.com;kanishkarao@google.com;lorrayne@google.com;mariecharlotte@google.com;coppin@google.com;benl@google.com;andrewsenior@google.com;nandodefreitas@google.com", "gender": ";;;;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;;;", "aff": ";;;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;;", "position": ";;;;;;;;;;;;;;", "bibtex": "@misc{\nshillingford2019,\ntitle={ Large-Scale Visual Speech Recognition},\nauthor={Brendan Shillingford and Yannis Assael and Matthew W. Hoffman and Thomas Paine and C\u00edan Hughes and Utsav Prabhu and Hank Liao and Hasim Sak and Kanishka Rao and Lorrayne Bennett and Marie Mulville and Ben Coppin and Ben Laurie and Andrew Senior and Nando de Freitas},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxpDiC5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HJxpDiC5tX", "pdf_size": 0, "rating": "3;4;9", "confidence": "5;4;4", "wc_review": "522;343;285", "wc_reply_reviewers": "0;537;0", "wc_reply_authors": "1285;904;6", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "rating_avg": [ 5.333333333333333, 2.6246692913372702 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 383.3333333333333, 100.87065425032638 ], "wc_reply_reviewers_avg": [ 179.0, 253.144227664784 ], "wc_reply_authors_avg": [ 731.6666666666666, 536.1805251550596 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 15, 0 ], "corr_rating_confidence": -0.6286185570937121, "gs_citation": 220, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8876695766137500001&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "HJxqMhC5YQ", "title": "End-to-End Multi-Lingual Multi-Speaker Speech Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "The expressive power of end-to-end automatic speech recognition (ASR) systems enables direct estimation of the character or word label sequence from a sequence of acoustic features. Direct optimization of the whole system is advantageous because it not only eliminates the internal linkage necessary for hybrid systems, but also extends the scope of potential application use cases by training the model for multiple objectives. Several multi-lingual ASR systems were recently proposed based on a monolithic neural network architecture without language-dependent modules, showing that modeling of multiple languages is well within the capabilities of an end-to-end framework. There has also been growing interest in multi-speaker speech recognition, which enables generation of multiple label sequences from single-channel mixed speech. In particular, a multi-speaker end-to-end ASR system that can directly model one-to-many mappings without additional auxiliary clues was recently proposed. 
In this paper, we propose an all-in-one end-to-end multi-lingual multi-speaker ASR system that integrates the capabilities of these two systems. The proposed model is evaluated using mixtures of two speakers generated by using 10 languages, including mixed-language utterances. ", "keywords": "end-to-end ASR;multi-lingual ASR;multi-speaker ASR;code-switching;encoder-decoder;connectionist temporal classification", "primary_area": "", "supplementary_material": "", "author": "Hiroshi Seki;Takaaki Hori;Shinji Watanabe;Jonathan Le Roux;John R. Hershey", "authorids": "seki@slp.cs.tut.ac.jp;thori@merl.com;shinjiw@ieee.org;leilujp@gmail.com;johnhershey@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nseki2019endtoend,\ntitle={End-to-End Multi-Lingual Multi-Speaker Speech Recognition},\nauthor={Hiroshi Seki and Takaaki Hori and Shinji Watanabe and Jonathan Le Roux and John R. Hershey},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxqMhC5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HJxqMhC5YQ", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;5;4", "wc_review": "136;118;220", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 158.0, 44.45222154178574 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5046378635249956327&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "HJxwAo09KQ", "title": "Learned optimizers that outperform on wall-clock and validation loss", "track": "main", "status": "Reject", "tldr": "We analyze problems when training learned optimizers, address those problems via variational optimization using two complementary gradient estimators, and train optimizers that are 5x faster in wall-clock time than baseline optimizers (e.g. Adam).", "abstract": "Deep learning has shown that learned functions can dramatically outperform hand-designed functions on perceptual tasks. Analogously, this suggests that learned update functions may similarly outperform current hand-designed optimizers, especially for specific tasks. However, learned optimizers are notoriously difficult to train and have yet to demonstrate wall-clock speedups over hand-designed optimizers, and thus are rarely used in practice. Typically, learned optimizers are trained by truncated backpropagation through an unrolled optimization process. The resulting gradients are either strongly biased (for short truncations) or have exploding norm (for long truncations). In this work we propose a training scheme which overcomes both of these difficulties, by dynamically weighting two unbiased gradient estimators for a variational loss on optimizer performance. This allows us to train neural networks to perform optimization faster than well tuned first-order methods. 
Moreover, by training the optimizer against validation loss, as opposed to training loss, we are able to use it to train models which generalize better than those trained by first order methods. We demonstrate these results on problems where our learned optimizer trains convolutional networks in a fifth of the wall-clock time compared to tuned first-order methods, and with an improvement", "keywords": "Learned Optimizers;Meta-Learning", "primary_area": "", "supplementary_material": "", "author": "Luke Metz;Niru Maheswaranathan;Jeremy Nixon;Daniel Freeman;Jascha Sohl-dickstein", "authorids": "lmetz@google.com;nirum@google.com;jeremynixon@google.com;cdfreeman@google.com;jaschasd@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmetz2019learned,\ntitle={Learned optimizers that outperform on wall-clock and validation loss},\nauthor={Luke Metz and Niru Maheswaranathan and Jeremy Nixon and Daniel Freeman and Jascha Sohl-dickstein},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxwAo09KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJxwAo09KQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;3", "wc_review": "717;950;361", "wc_reply_reviewers": "339;132;55", "wc_reply_authors": "532;737;256", "reply_reviewers": "3;1;2", "reply_authors": "2;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 676.0, 242.199642168742 ], "wc_reply_reviewers_avg": [ 175.33333333333334, 119.9231235231786 ], "wc_reply_authors_avg": [ 508.3333333333333, 197.07922828705773 ], "reply_reviewers_avg": [ 2.0, 0.816496580927726 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:J3nCSFAaUqcJ:scholar.google.com/&scioq=Learned+optimizers+that+outperform+on+wall-clock+and+validation+loss&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "StrokeNet: A Neural Painting Environment", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1098", "id": "HJxwDiActX", "author_site": "Ningyuan Zheng, Yf Jiang, Dingjiang Huang", "tldr": "StrokeNet is a novel architecture where the agent is trained to draw by strokes on a differentiable simulation of the environment, which could effectively exploit the power of back-propagation.", "abstract": "We've seen tremendous success of image generating models these years. Generating images through a neural network is usually pixel-based, which is fundamentally different from how humans create artwork using brushes. To imitate human drawing, interactions between the environment and the agent is required to allow trials. However, the environment is usually non-differentiable, leading to slow convergence and massive computation. In this paper we try to address the discrete nature of software environment with an intermediate, differentiable simulation. We present StrokeNet, a novel model where the agent is trained upon a well-crafted neural approximation of the painting environment. 
With this approach, our agent was able to learn to write characters such as MNIST digits faster than reinforcement learning approaches in an unsupervised manner. Our primary contribution is the neural simulation of a real-world environment. Furthermore, the agent trained with the emulated environment is able to directly transfer its skills to real-world software.", "keywords": "image generation;differentiable model;reinforcement learning;deep learning;model based", "primary_area": "", "supplementary_material": "", "author": "Ningyuan Zheng;Yifan Jiang;Dingjiang Huang", "authorids": "zhengningyuan@qq.com;winhehe@163.com;djhuang@dase.ecnu.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzheng2018strokenet,\ntitle={StrokeNet: A Neural Painting Environment},\nauthor={Ningyuan Zheng and Yifan Jiang and Dingjiang Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxwDiActX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;5", "wc_review": "515;615;1466", "wc_reply_reviewers": "49;0;0", "wc_reply_authors": "415;410;447", "reply_reviewers": "1;0;0", "reply_authors": "2;2;2", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 865.3333333333334, 426.69296793934734 ], "wc_reply_reviewers_avg": [ 16.333333333333332, 23.098821518760555 ], "wc_reply_authors_avg": [ 424.0, 16.391054470858997 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16441956279595179741&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HJxwDiActX", "pdf": "https://openreview.net/pdf?id=HJxwDiActX", "email": ";;", "author_num": 3 }, { "title": "Harmonizing Maximum Likelihood with GANs for Multimodal Conditional Generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/939", "id": "HJxyAjRcFX", "author_site": "Soochan Lee, Junsoo Ha, Gunhee Kim", "tldr": "We prove that the mode collapse in conditional GANs is largely attributed to a mismatch between reconstruction loss and GAN loss and introduce a set of novel loss functions as alternatives for reconstruction loss.", "abstract": "Recent advances in conditional image generation tasks, such as image-to-image translation and image inpainting, are largely accounted to the success of conditional GAN models, which are often optimized by the joint use of the GAN loss with the reconstruction loss. However, we reveal that this training recipe shared by almost all existing methods causes one critical side effect: lack of diversity in output samples. In order to accomplish both training stability and multimodal output generation, we propose novel training schemes with a new set of losses named moment reconstruction losses that simply replace the reconstruction loss. 
We show that our approach is applicable to any conditional generation tasks by performing thorough experiments on image-to-image translation, super-resolution and image inpainting using Cityscapes and CelebA dataset. Quantitative evaluations also confirm that our methods achieve a great diversity in outputs while retaining or even improving the visual fidelity of generated samples.", "keywords": "conditional GANs;conditional image generation;multimodal generation;reconstruction loss;maximum likelihood estimation;moment matching", "primary_area": "", "supplementary_material": "", "author": "Soochan Lee;Junsoo Ha;Gunhee Kim", "authorids": "soochan.lee@vision.snu.ac.kr;kuc2477@gmail.com;gunhee@snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlee2018harmonizing,\ntitle={Harmonizing Maximum Likelihood with {GAN}s for Multimodal Conditional Generation},\nauthor={Soochan Lee and Junsoo Ha and Gunhee Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJxyAjRcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;7;8", "confidence": "5;3;4", "wc_review": "731;158;321", "wc_reply_reviewers": "683;0;0", "wc_reply_authors": "1059;206;449", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.333333333333333, 1.699673171197595 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 403.3333333333333, 241.0620021672617 ], "wc_reply_reviewers_avg": [ 227.66666666666666, 321.96928770027466 ], "wc_reply_authors_avg": [ 571.3333333333334, 358.81874099451505 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.7205766921228921, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=905119799608670927&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HJxyAjRcFX", "pdf": "https://openreview.net/pdf?id=HJxyAjRcFX", "email": ";;", "author_num": 3 }, { "title": "Measuring Compositionality in Representation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1055", "id": "HJz05o0qK7", "tldr": "This paper proposes a simple procedure for evaluating compositional structure in learned representations, and uses the procedure to explore the role of compositionality in four learning problems.", "abstract": "Many machine learning algorithms represent input data with vector embeddings or discrete codes. When inputs exhibit compositional structure (e.g. objects built from parts or procedures from subroutines), it is natural to ask whether this compositional structure is reflected in the the inputs\u2019 learned representations. While the assessment of compositionality in languages has received significant attention in linguistics and adjacent fields, the machine learning literature lacks general-purpose tools for producing graded measurements of compositional structure in more general (e.g. vector-valued) representation spaces. 
We describe a procedure for evaluating compositionality by measuring how well the true representation-producing model can be approximated by a model that explicitly composes a collection of inferred representational primitives. We use the procedure to provide formal and empirical characterizations of compositional structure in a variety of settings, exploring the relationship between compositionality and learning dynamics, human judgments, representational similarity, and generalization.", "keywords": "compositionality;representation learning;evaluation", "primary_area": "", "supplementary_material": "", "author": "Jacob Andreas", "authorids": "jda@cs.berkeley.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nandreas2018measuring,\ntitle={Measuring Compositionality in Representation Learning},\nauthor={Jacob Andreas},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJz05o0qK7},\n}", "github": "[![github](/images/github_icon.svg) jacobandreas/tre](https://github.com/jacobandreas/tre)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "wc_review": "757;456;255", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "200;37;47", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 489.3333333333333, 206.2915951322841 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 94.66666666666667, 74.59371436134698 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 192, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=36884338001216785&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HJz05o0qK7", "pdf": "https://openreview.net/pdf?id=HJz05o0qK7", "email": "", "author_num": 1 }, { "id": "HJz1vo0cYX", "title": "Confidence Calibration in Deep Neural Networks through Stochastic Inferences", "track": "main", "status": "Withdraw", "tldr": "We propose a framework to learn confidence-calibrated networks by designing a novel loss function that incorporates predictive uncertainty estimated through stochastic inferences.", "abstract": "We propose a generic framework to calibrate accuracy and confidence (score) of a prediction through stochastic inferences in deep neural networks. We first analyze relation between variation of multiple model parameters for a single example inference and variance of the corresponding prediction scores by Bayesian modeling of stochastic regularization. Our empirical observation shows that accuracy and score of a prediction are highly correlated with variance of multiple stochastic inferences given by stochastic depth or dropout. Motivated by these facts, we design a novel variance-weighted confidence-integrated loss function that is composed of two cross-entropy loss terms with respect to ground-truth and uniform distribution, which are balanced by variance of stochastic prediction scores. The proposed loss function enables us to learn deep neural networks that predict confidence calibrated scores using a single inference. 
Our algorithm presents outstanding confidence calibration performance and improves classification accuracy with two popular stochastic regularization techniques---stochastic depth and dropout---in multiple models and datasets; it alleviates overconfidence issue in deep neural networks significantly by training networks to achieve prediction accuracy proportional to confidence of prediction.", "keywords": "Variance-Weighted Confidence-Integrated loss;Confidence Calibration;Stochastic Regularization;Stochastic Inferences", "primary_area": "", "supplementary_material": "", "author": "Seonguk Seo;Paul Hongsuck Seo;Bohyung Han", "authorids": "seonguk@snu.ac.kr;hsseo@postech.ac.kr;bhhan@snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJz1vo0cYX", "pdf_size": 0, "rating": "3;5;5", "confidence": "2;4;4", "wc_review": "253;345;296", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 298.0, 37.58545818087983 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=536947417866596475&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HJz6QhR9YQ", "title": "Hierarchical Deep Reinforcement Learning Agent with Counter Self-play on Competitive Games", "track": "main", "status": "Withdraw", "tldr": "We develop Hierarchical Agent with Self-play (HASP), a learning approach for obtaining hierarchically structured policies that can achieve high performance than conventional self-play on competitive real-time strategic games.", "abstract": "Deep Reinforcement Learning algorithms lead to agents that can solve difficult decision making problems in complex environments. However, many difficult multi-agent competitive games, especially real-time strategy games are still considered beyond the capability of current deep reinforcement learning algorithms, although there has been a recent effort to change this \\citep{openai_2017_dota, vinyals_2017_starcraft}. Moreover, when the opponents in a competitive game are suboptimal, the current \\textit{Nash Equilibrium} seeking, self-play algorithms are often unable to generalize their strategies to opponents that play strategies vastly different from their own. This suggests that a learning algorithm that is beyond conventional self-play is necessary. We develop Hierarchical Agent with Self-play (HASP), a learning approach for obtaining hierarchically structured policies that can achieve higher performance than conventional self-play on competitive games through the use of a diverse pool of sub-policies we get from Counter Self-Play (CSP). We demonstrate that the ensemble policy generated by HASP can achieve better performance while facing unseen opponents that use sub-optimal policies. 
On a motivating iterated Rock-Paper-Scissor game and a partially observable real-time strategic game (http://generals.io/), we are led to the conclusion that HASP can perform better than conventional self-play as well as achieve 77% win rate against FloBot, an open-source agent which has ranked at position number 2 on the online leaderboards.", "keywords": "deep reinforcement learning;self-play;real-time strategic game;multi-agent", "primary_area": "", "supplementary_material": "", "author": "Huazhe Xu;Keiran Paster;Qibin Chen;Haoran Tang;Pieter Abbeel;Trevor Darrell;Sergey Levine", "authorids": "huazhe_xu@berkeley.edu;keirp@berkeley.edu;cqb@tsinghua.edu.cn;hrtang@math.berkeley.edu;pabbeel@cs.berkeley.edu;trevor@eecs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HJz6QhR9YQ", "pdf_size": 0, "rating": "2;2;3", "confidence": "3;4;3", "wc_review": "415;1043;231", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 563.0, 347.6243182901143 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3939511882661841867&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Benchmarking Neural Network Robustness to Common Corruptions and Perturbations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/731", "id": "HJz6tiCqYm", "author_site": "Dan Hendrycks, Thomas Dietterich", "tldr": "We propose ImageNet-C to measure classifier corruption robustness and ImageNet-P to measure perturbation robustness", "abstract": "In this paper we establish rigorous benchmarks for image classifier robustness. Our first benchmark, ImageNet-C, standardizes and expands the corruption robustness topic, while showing which classifiers are preferable in safety-critical applications. Then we propose a new dataset called ImageNet-P which enables researchers to benchmark a classifier's robustness to common perturbations. Unlike recent robustness research, this benchmark evaluates performance on common corruptions and perturbations not worst-case adversarial perturbations. We find that there are negligible changes in relative corruption robustness from AlexNet classifiers to ResNet classifiers. Afterward we discover ways to enhance corruption and perturbation robustness. We even find that a bypassed adversarial defense provides substantial common perturbation robustness. 
Together our benchmarks may aid future work toward networks that robustly generalize.", "keywords": "robustness;benchmark;convnets;perturbations", "primary_area": "", "supplementary_material": "", "author": "Dan Hendrycks;Thomas Dietterich", "authorids": "hendrycks@berkeley.edu;tgd@oregonstate.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nhendrycks2018benchmarking,\ntitle={Benchmarking Neural Network Robustness to Common Corruptions and Perturbations},\nauthor={Dan Hendrycks and Thomas Dietterich},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HJz6tiCqYm},\n}", "github": "[![github](/images/github_icon.svg) hendrycks/robustness](https://github.com/hendrycks/robustness) + [![Papers with Code](/images/pwc_icon.svg) 12 community implementations](https://paperswithcode.com/paper/?openreview=HJz6tiCqYm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;9;9", "confidence": "3;4;5", "wc_review": "82;248;666", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "10;191;258", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 8.333333333333334, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 332.0, 245.70442948116883 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 153.0, 104.75049721441262 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 4260, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4440880036617273374&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HJz6tiCqYm", "pdf": "https://openreview.net/pdf?id=HJz6tiCqYm", "email": ";", "author_num": 2 }, { "id": "HJzLdjR9FX", "title": "DeepTwist: Learning Model Compression via Occasional Weight Distortion", "track": "main", "status": "Reject", "tldr": "We propose a unified model compression framework for performing a variety of model compression techniques.", "abstract": "Model compression has been introduced to reduce the required hardware resources while maintaining the model accuracy. Lots of techniques for model compression, such as pruning, quantization, and low-rank approximation, have been suggested along with different inference implementation characteristics. Adopting model compression is, however, still challenging because the design complexity of model compression is rapidly increasing due to additional hyper-parameters and computation overhead in order to achieve a high compression ratio. In this paper, we propose a simple and efficient model compression framework called DeepTwist which distorts weights in an occasional manner without modifying the underlying training algorithms. The ideas of designing weight distortion functions are intuitive and straightforward given formats of compressed weights. We show that our proposed framework improves compression rate significantly for pruning, quantization, and low-rank approximation techniques while the efforts of additional retraining and/or hyper-parameter search are highly reduced. 
Regularization effects of DeepTwist are also reported.", "keywords": "deep learning;model compression;pruning;quantization;SVD;regularization;framework", "primary_area": "", "supplementary_material": "", "author": "Dongsoo Lee;Parichay Kapoor;Byeongwook Kim", "authorids": "dslee3@gmail.com;kparichay@gmail.com;quddnr145@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlee2019deeptwist,\ntitle={DeepTwist: Learning Model Compression via Occasional Weight Distortion},\nauthor={Dongsoo Lee and Parichay Kapoor and Byeongwook Kim},\nyear={2019},\nurl={https://openreview.net/forum?id=HJzLdjR9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HJzLdjR9FX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;3", "wc_review": "223;117;121", "wc_reply_reviewers": "272;49;102", "wc_reply_authors": "1035;424;686", "reply_reviewers": "3;1;1", "reply_authors": "4;3;2", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 153.66666666666666, 49.053259037725745 ], "wc_reply_reviewers_avg": [ 141.0, 95.12447984965104 ], "wc_reply_authors_avg": [ 715.0, 250.28117521433103 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16976996923745192339&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "Hk41X2AqtQ", "title": "Hierarchically-Structured Variational Autoencoders for Long Text Generation", "track": "main", "status": "Reject", "tldr": "Propose a hierarchically-structured variational autoencoder for generating long and coherent units of text", "abstract": "Variational autoencoders (VAEs) have received much attention recently as an end-to-end architecture for text generation. Existing methods primarily focus on synthesizing relatively short sentences (with less than twenty words). In this paper, we propose a novel framework, hierarchically-structured variational autoencoder (hier-VAE), for generating long and coherent units of text. To enhance the model\u2019s plan-ahead ability, intermediate sentence representations are introduced into the generative networks to guide the word-level predictions. To alleviate the typical optimization challenges associated with textual VAEs, we further employ a hierarchy of stochastic layers between the encoder and decoder networks. Extensive experiments are conducted to evaluate the proposed method, where hier-VAE is shown to make effective use of the latent codes and achieve lower perplexity relative to language models. Moreover, the generated samples from hier-VAE also exhibit superior quality according to both automatic and human evaluations. 
", "keywords": "Natural Language Processing;Text Generation;Variational Autoencoders", "primary_area": "", "supplementary_material": "", "author": "Dinghan Shen;Asli Celikyilmaz;Yizhe Zhang;Liqun Chen;Xin Wang;Lawrence Carin", "authorids": "dinghan.shen@duke.edu;asli@ieee.org;yizhe.zhang@microsoft.com;liqun.chen@duke.edu;xwang@cs.ucsb.edu;lcarin@duke.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nshen2019hierarchicallystructured,\ntitle={Hierarchically-Structured Variational Autoencoders for Long Text Generation},\nauthor={Dinghan Shen and Asli Celikyilmaz and Yizhe Zhang and Liqun Chen and Xin Wang and Lawrence Carin},\nyear={2019},\nurl={https://openreview.net/forum?id=Hk41X2AqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hk41X2AqtQ", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "wc_review": "458;360;586", "wc_reply_reviewers": "0;0;440", "wc_reply_authors": "651;0;1962", "reply_reviewers": "0;0;2", "reply_authors": "1;0;5", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 468.0, 92.53467818427136 ], "wc_reply_reviewers_avg": [ 146.66666666666666, 207.41798914805395 ], "wc_reply_authors_avg": [ 871.0, 815.9497533549477 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 2.160246899469287 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6474285067688864135&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "ADef: an Iterative Algorithm to Construct Adversarial Deformations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/706", "id": "Hk4dFjR5K7", "author_site": "Rima Alaifari, Giovanni S Alberti, Tandri Gauksson", "tldr": "We propose a new, efficient algorithm to construct adversarial examples by means of deformations, rather than additive perturbations.", "abstract": "While deep neural networks have proven to be a powerful tool for many recognition and classification tasks, their stability properties are still not well understood. In the past, image classifiers have been shown to be vulnerable to so-called adversarial attacks, which are created by additively perturbing the correctly classified image. In this paper, we propose the ADef algorithm to construct a different kind of adversarial attack created by iteratively applying small deformations to the image, found through a gradient descent step. We demonstrate our results on MNIST with convolutional neural networks and on ImageNet with Inception-v3 and ResNet-101.", "keywords": "Adversarial examples;deformations;deep neural networks;computer vision", "primary_area": "", "supplementary_material": "", "author": "Rima Alaifari;Giovanni S. 
Alberti;Tandri Gauksson", "authorids": "rima.alaifari@sam.math.ethz.ch;alberti@dima.unige.it;tandrig@sam.math.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nalaifari2018adef,\ntitle={{AD}ef: an Iterative Algorithm to Construct Adversarial Deformations},\nauthor={Rima Alaifari and Giovanni S. Alberti and Tandri Gauksson},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hk4dFjR5K7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=Hk4dFjR5K7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;4", "wc_review": "377;233;854", "wc_reply_reviewers": "180;0;88", "wc_reply_authors": "719;234;885", "reply_reviewers": "1;0;1", "reply_authors": "2;1;2", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 488.0, 265.39404665515764 ], "wc_reply_reviewers_avg": [ 89.33333333333333, 73.49074015744357 ], "wc_reply_authors_avg": [ 612.6666666666666, 276.20081261446154 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 114, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4601042374122210571&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=Hk4dFjR5K7", "pdf": "https://openreview.net/pdf?id=Hk4dFjR5K7", "email": ";;", "author_num": 3 }, { "title": "Discriminator-Actor-Critic: Addressing Sample Inefficiency and Reward Bias in Adversarial Imitation Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/836", "id": "Hk4fpoA5Km", "author_site": "Ilya Kostrikov, Kumar Agrawal, Debidatta Dwibedi, Sergey Levine, Jonathan Tompson", "tldr": "We address sample inefficiency and reward bias in adversarial imitation learning algorithms such as GAIL and AIRL.", "abstract": "We identify two issues with the family of algorithms based on the Adversarial Imitation Learning framework. The first problem is implicit bias present in the reward functions used in these algorithms. While these biases might work well for some environments, they can also lead to sub-optimal behavior in others. Secondly, even though these algorithms can learn from few expert demonstrations, they require a prohibitively large number of interactions with the environment in order to imitate the expert for many real-world applications. In order to address these issues, we propose a new algorithm called Discriminator-Actor-Critic that uses off-policy Reinforcement Learning to reduce policy-environment interaction sample complexity by an average factor of 10. Furthermore, since our reward function is designed to be unbiased, we can apply our algorithm to many problems without making any task-specific adjustments. 
", "keywords": "deep learning;reinforcement learning;imitation learning;adversarial learning", "primary_area": "", "supplementary_material": "", "author": "Ilya Kostrikov;Kumar Krishna Agrawal;Debidatta Dwibedi;Sergey Levine;Jonathan Tompson", "authorids": "kostrikov@cs.nyu.edu;kumarkagrawal@gmail.com;debidatta@google.com;slevine@google.com;tompson@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nkostrikov2018discriminatoractorcritic,\ntitle={Discriminator-Actor-Critic: Addressing Sample Inefficiency and Reward Bias in Adversarial Imitation Learning},\nauthor={Ilya Kostrikov and Kumar Krishna Agrawal and Debidatta Dwibedi and Sergey Levine and Jonathan Tompson},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hk4fpoA5Km},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=Hk4fpoA5Km)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;3;2", "wc_review": "1294;317;86", "wc_reply_reviewers": "2107;0;0", "wc_reply_authors": "1647;203;12", "reply_reviewers": "4;0;0", "reply_authors": "4;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 565.6666666666666, 523.5725567886673 ], "wc_reply_reviewers_avg": [ 702.3333333333334, 993.2493253067037 ], "wc_reply_authors_avg": [ 620.6666666666666, 729.9042555172715 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.8856180831641267 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 348, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10939703062864014386&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Hk4fpoA5Km", "pdf": "https://openreview.net/pdf?id=Hk4fpoA5Km", "email": ";;;;", "author_num": 5 }, { "id": "HkElFj0qYQ", "title": "PPD: Permutation Phase Defense Against Adversarial Examples in Deep Learning", "track": "main", "status": "Reject", "tldr": "Permutation phase defense is proposed as a novel method to guard against adversarial attacks in deep learning.", "abstract": "Deep neural networks have demonstrated cutting edge performance on various tasks including classification. However, it is well known that adversarially designed imperceptible perturbation of the input can mislead advanced classifiers. In this paper, Permutation Phase Defense (PPD), is proposed as a novel method to resist adversarial attacks. PPD combines random permutation of the image with phase component of its Fourier transform. The basic idea behind this approach is to turn adversarial defense problems analogously into symmetric cryptography, which relies solely on safekeeping of the keys for security. In PPD, safe keeping of the selected permutation ensures effectiveness against adversarial attacks. 
Testing PPD on the MNIST and CIFAR-10 datasets yielded state-of-the-art robustness against the most powerful adversarial attacks currently available.", "keywords": "permutation phase defense;adversarial attacks;deep learning", "primary_area": "", "supplementary_material": "", "author": "Mehdi Jafarnia-Jahromi;Tasmin Chowdhury;Hsin-Tai Wu;Sayandev Mukherjee", "authorids": "mjafarni@usc.edu;chowdt1@unlv.nevada.edu;hwu@docomoinnovations.com;sayandev.mukherjee@huawei.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\njafarnia-jahromi2019ppd,\ntitle={{PPD}: Permutation Phase Defense Against Adversarial Examples in Deep Learning},\nauthor={Mehdi Jafarnia-Jahromi and Tasmin Chowdhury and Hsin-Tai Wu and Sayandev Mukherjee},\nyear={2019},\nurl={https://openreview.net/forum?id=HkElFj0qYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HkElFj0qYQ", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;3;4", "wc_review": "256;195;101", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "426;368;192", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 184.0, 63.75473838599502 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 328.6666666666667, 99.49651026822778 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 34, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.6546536707079772, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15217712439775654438&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Doubly Reparameterized Gradient Estimators for Monte Carlo Objectives", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/755", "id": "HkG3e205K7", "author_site": "George Tucker, Dieterich Lawson, Shixiang Gu, Chris J Maddison", "tldr": "Doubly reparameterized gradient estimators provide unbiased variance reduction which leads to improved performance.", "abstract": "Deep latent variable models have become a popular model choice due to the scalable learning algorithms introduced by (Kingma & Welling 2013, Rezende et al. 2014). These approaches maximize a variational lower bound on the intractable log likelihood of the observed data. Burda et al. (2015) introduced a multi-sample variational bound, IWAE, that is at least as tight as the standard variational lower bound and becomes increasingly tight as the number of samples increases. Counterintuitively, the typical inference network gradient estimator for the IWAE bound performs poorly as the number of samples increases (Rainforth et al. 2018, Le et al. 2018). Roeder et al. (2017) propose an improved gradient estimator; however, they are unable to show that it is unbiased. We show that it is in fact biased and that the bias can be estimated efficiently with a second application of the reparameterization trick. The doubly reparameterized gradient (DReG) estimator does not suffer as the number of samples increases, resolving the previously raised issues. The same idea can be used to improve many recently introduced training techniques for latent variable models.
In particular, we show that this estimator reduces the variance of the IWAE gradient, the reweighted wake-sleep update (RWS) (Bornschein & Bengio 2014), and the jackknife variational inference (JVI) gradient (Nowozin 2018). Finally, we show that this computationally efficient, drop-in estimator translates to improved performance for all three objectives on several modeling tasks.", "keywords": "variational autoencoder;reparameterization trick;IWAE;VAE;RWS;JVI", "primary_area": "", "supplementary_material": "", "author": "George Tucker;Dieterich Lawson;Shixiang Gu;Chris J. Maddison", "authorids": "gjt@google.com;jdl404@nyu.edu;shanegu@google.com;cmaddis@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ntucker2018doubly,\ntitle={Doubly Reparameterized Gradient Estimators for Monte Carlo Objectives},\nauthor={George Tucker and Dieterich Lawson and Shixiang Gu and Chris J. Maddison},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkG3e205K7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=HkG3e205K7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;5;3", "wc_review": "472;187;92", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "159;298;11", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 250.33333333333334, 161.46895126377152 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 156.0, 117.1864611065061 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 137, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15749904107210589457&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HkG3e205K7", "pdf": "https://openreview.net/pdf?id=HkG3e205K7", "email": ";;;", "author_num": 4 }, { "id": "HkGGfhC5Y7", "title": "Towards a better understanding of Vector Quantized Autoencoders", "track": "main", "status": "Reject", "tldr": "Understand the VQ-VAE discrete autoencoder systematically using EM and use it to design non-autogressive translation model matching a strong autoregressive baseline.", "abstract": "Deep neural networks with discrete latent variables offer the promise of better symbolic reasoning, and learning abstractions that are more useful to new tasks. There has been a surge in interest in discrete latent variable models, however, despite several recent improvements, the training of discrete latent variable models has remained challenging and their performance has mostly failed to match their continuous counterparts. Recent work on vector quantized autoencoders (VQ-VAE) has made substantial progress in this direction, with its perplexity almost matching that of a VAE on datasets such as CIFAR-10. In this work, we investigate an alternate training technique for VQ-VAE, inspired by its connection to the Expectation Maximization (EM) algorithm. 
Training the discrete autoencoder with EM and combining it with sequence-level knowledge distillation allows us to develop a non-autoregressive machine translation model whose accuracy almost matches a strong greedy autoregressive baseline Transformer, while being 3.3 times faster at inference.\n", "keywords": "machine translation;vector quantized autoencoders;non-autoregressive;NMT", "primary_area": "", "supplementary_material": "", "author": "Aurko Roy;Ashish Vaswani;Niki Parmar;Arvind Neelakantan", "authorids": "aurkor@google.com;avaswani@google.com;nikip@google.com;aneelakantan@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nroy2019towards,\ntitle={Towards a better understanding of Vector Quantized Autoencoders},\nauthor={Aurko Roy and Ashish Vaswani and Niki Parmar and Arvind Neelakantan},\nyear={2019},\nurl={https://openreview.net/forum?id=HkGGfhC5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkGGfhC5Y7", "pdf_size": 0, "rating": "3;5;6;7", "confidence": "4;4;3;4", "wc_review": "648;447;512;258", "wc_reply_reviewers": "0;131;188;0", "wc_reply_authors": "855;896;697;90", "reply_reviewers": "0;2;1;0", "reply_authors": "3;3;3;1", "rating_avg": [ 5.25, 1.479019945774904 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "wc_review_avg": [ 466.25, 140.41434221617106 ], "wc_reply_reviewers_avg": [ 79.75, 82.25683862147876 ], "wc_reply_authors_avg": [ 634.5, 323.0282495386433 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 2.5, 0.8660254037844386 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.29277002188455997, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9165243599726573655&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HkGSniC9FQ", "title": "An Analysis of Composite Neural Network Performance from Function Composition Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": "This work investigates the performance of a composite neural network, which is composed of pre-trained neural network models and non-instantiated neural network models, connected to form a rooted directed graph. A pre-trained neural network model is generally a well-trained neural network model targeted for a specific function. The advantages of adopting such a pre-trained model in a composite neural network are twofold. One is to benefit from others' intelligence and diligence, and the other is to save the effort in data preparation and the resources and time spent in training. However, the overall performance of a composite neural network is still not clear. In this work, we prove that a composite neural network, with high probability, performs better than any of its pre-trained components under certain assumptions. In addition, if an extra pre-trained component is added to a composite network, with high probability the overall performance will be improved. In the empirical evaluations, distinctly different applications support the above findings.
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ming-Chuan Yang;Meng Chang Chen", "authorids": "mingchuan@iis.sinica.edu.tw;mcc@iis.sinica.edu.tw", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nyang2019an,\ntitle={An Analysis of Composite Neural Network Performance from Function Composition Perspective},\nauthor={Ming-Chuan Yang and Meng Chang Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=HkGSniC9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HkGSniC9FQ", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;3;2", "wc_review": "401;93;167", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "463;212;317", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 220.33333333333334, 131.27409831680006 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 330.6666666666667, 102.9249996626454 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12472201995974261965&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HkGTwjCctm", "title": "Pyramid Recurrent Neural Networks for Multi-Scale Change-Point Detection", "track": "main", "status": "Reject", "tldr": "We introduce a scale-invariant neural network architecture for changepoint detection in multivariate time series.", "abstract": "Many real-world time series, such as in activity recognition, finance, or climate science, have changepoints where the system's structure or parameters change. Detecting changes is important as they may indicate critical events. However, existing methods for changepoint detection face challenges when (1) the patterns of change cannot be modeled using simple and predefined metrics, and (2) changes can occur gradually, at multiple time-scales. To address this, we show how changepoint detection can be treated as a supervised learning problem, and propose a new deep neural network architecture that can efficiently identify both abrupt and gradual changes at multiple scales. Our proposed method, pyramid recurrent neural network (PRNN), is designed to be scale-invariant, by incorporating wavelets and pyramid analysis techniques from multi-scale signal processing. 
Through experiments on synthetic and real-world datasets, we show that PRNN can detect abrupt and gradual changes with higher accuracy than the state of the art and can extrapolate to detect changepoints at novel timescales that have not been seen in training.", "keywords": "changepoint detection;multivariate time series data;multiscale RNN", "primary_area": "", "supplementary_material": "", "author": "Zahra Ebrahimzadeh;Min Zheng;Selcuk Karakas;Samantha Kleinberg", "authorids": "shina.ebiz@gmail.com;mzheng3@stevens.edu;fkarakas@stevens.edu;samantha.kleinberg@stevens.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nebrahimzadeh2019pyramid,\ntitle={Pyramid Recurrent Neural Networks for Multi-Scale Change-Point Detection},\nauthor={Zahra Ebrahimzadeh and Min Zheng and Selcuk Karakas and Samantha Kleinberg},\nyear={2019},\nurl={https://openreview.net/forum?id=HkGTwjCctm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkGTwjCctm", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;3;4", "wc_review": "274;267;307", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "655;304;451", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 282.6666666666667, 17.441967269268172 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 470.0, 143.92359083902818 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.6546536707079772, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4568536508070440011&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HkGb-3C5t7", "title": "withdrawn", "track": "main", "status": "Withdraw", "tldr": " ", "abstract": "withdrawn", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "withdrawn;withdrawn", "authorids": "aaron.chadha.14@ucl.ac.uk;i.andreopoulos@ucl.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkGb-3C5t7", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;5", "wc_review": "471;538;461", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 490.0, 34.18576701884378 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HkGmDsR9YQ", "title": "Generalization and Regularization in DQN", "track": "main", "status": "Reject", "tldr": "We study the generalization capabilities of DQN using the new modes and difficulties of Atari games. 
We show how regularization can improve DQN's ability to generalize across tasks, something it often fails to do.", "abstract": "Deep reinforcement learning (RL) algorithms have shown an impressive ability to learn complex control policies in high-dimensional environments. However, despite the ever-increasing performance on popular benchmarks like the Arcade Learning Environment (ALE), policies learned by deep RL algorithms can struggle to generalize when evaluated in remarkably similar environments. These results are unexpected given the fact that, in supervised learning, deep neural networks often learn robust features that generalize across tasks. In this paper, we study the generalization capabilities of DQN in order to aid in understanding this mismatch between generalization in deep RL and supervised learning methods. We provide evidence suggesting that DQN overspecializes to the domain it is trained on. We then comprehensively evaluate the impact of traditional methods of regularization from supervised learning, $\\ell_2$ and dropout, and of reusing learned representations to improve the generalization capabilities of DQN. We perform this study using different game modes of Atari 2600 games, a recently introduced modification for the ALE which supports slight variations of the Atari 2600 games used for benchmarking in the field. Despite regularization being largely underutilized in deep RL, we show that it can, in fact, help DQN learn more general features. These features can then be reused and fine-tuned on similar tasks, considerably improving the sample efficiency of DQN.", "keywords": "generalization;reinforcement learning;dqn;regularization;transfer learning;multitask", "primary_area": "", "supplementary_material": "", "author": "Jesse Farebrother;Marlos C. Machado;Michael Bowling", "authorids": "jfarebro@ualberta.ca;machado@ualberta.ca;mbowling@ualberta.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfarebrother2019generalization,\ntitle={Generalization and Regularization in {DQN}},\nauthor={Jesse Farebrother and Marlos C. 
Machado and Michael Bowling},\nyear={2019},\nurl={https://openreview.net/forum?id=HkGmDsR9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkGmDsR9YQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;5;3", "wc_review": "116;443;496", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "340;669;268", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 351.6666666666667, 168.04033907236547 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 425.6666666666667, 174.55530801312102 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 224, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15352839936618349338&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HkGsHj05tQ", "title": "Effective and Efficient Batch Normalization Using Few Uncorrelated Data for Statistics' Estimation", "track": "main", "status": "Reject", "tldr": "We propose accelerating Batch Normalization (BN) through sampling less correlated data for reduction operations with regular execution pattern, which achieves up to 2x and 20% speedup for BN itself and the overall training, respectively.", "abstract": "Deep Neural Networks (DNNs) thrive in recent years in which Batch Normalization (BN) plays an indispensable role. However, it has been observed that BN is costly due to the reduction operations. In this paper, we propose alleviating the BN\u2019s cost by using only a small fraction of data for mean & variance estimation at each iteration. The key challenge to reach this goal is how to achieve a satisfactory balance between normalization effectiveness and execution efficiency. We identify that the effectiveness expects less data correlation while the efficiency expects regular execution pattern. To this end, we propose two categories of approach: sampling or creating few uncorrelated data for statistics\u2019 estimation with certain strategy constraints. The former includes \u201cBatch Sampling (BS)\u201d that randomly selects few samples from each batch and \u201cFeature Sampling (FS)\u201d that randomly selects a small patch from each feature map of all samples, and the latter is \u201cVirtual Dataset Normalization (VDN)\u201d that generates few synthetic random samples. Accordingly, multi-way strategies are designed to reduce the data correlation for accurate estimation and optimize the execution pattern for running acceleration in the meantime. All the proposed methods are comprehensively evaluated on various DNN models, where an overall training speedup by up to 21.7% on modern GPUs can be practically achieved without the support of any specialized libraries, and the loss of model accuracy and convergence rate are negligible. 
Furthermore, our methods demonstrate powerful performance when solving the well-known \u201cmicro-batch normalization\u201d problem in the case of tiny batch size.", "keywords": "batch normalization;acceleration;correlation;sampling", "primary_area": "", "supplementary_material": "", "author": "Zhaodong Chen;Lei Deng;Guoqi Li;Jiawei Sun;Xing Hu;Ling Liang;YufeiDing;Yuan Xie", "authorids": "chenzd15@mails.tsinghua.edu.cn;leideng@ucsb.edu;liguoqi@mail.tsinghua.edu.cn;sunjw15@mails.tsinghua.edu.cn;xinghu@ucsb.edu;lingliang@ucsb.edu;yufeiding@cs.ucsb.edu;yuanxie@ucsb.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nchen2019effective,\ntitle={Effective and Efficient Batch Normalization Using Few Uncorrelated Data for Statistics' Estimation},\nauthor={Zhaodong Chen and Lei Deng and Guoqi Li and Jiawei Sun and Xing Hu and Ling Liang and YufeiDing and Yuan Xie},\nyear={2019},\nurl={https://openreview.net/forum?id=HkGsHj05tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkGsHj05tQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;5;3", "wc_review": "293;172;319", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1118;656;1187", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 261.3333333333333, 64.05379683429304 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 987.0, 235.74138372377473 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15823086376531616287&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "HkGzUjR5tQ", "title": "DATNet: Dual Adversarial Transfer for Low-resource Named Entity Recognition", "track": "main", "status": "Reject", "tldr": "We propose a new architecture termed Dual Adversarial Transfer Network (DATNet) for addressing low-resource Named Entity Recognition (NER) and achieve new state-of-the-art performances on CoNLL and Twitter NER.", "abstract": "We propose a new architecture termed Dual Adversarial Transfer Network (DATNet) for addressing low-resource Named Entity Recognition (NER). Specifically, two variants of DATNet, i.e., DATNet-F and DATNet-P, are proposed to explore effective feature fusion between high and low resource. To address the noisy and imbalanced training data, we propose a novel Generalized Resource-Adversarial Discriminator (GRAD). Additionally, adversarial training is adopted to boost model generalization. We examine the effects of different components in DATNet across domains and languages and show that significant improvement can be obtained especially for low-resource data. 
Without augmenting any additional hand-crafted features, we achieve new state-of-the-art performances on CoNLL and Twitter NER---88.16% F1 for Spanish, 53.43% F1 for WNUT-2016, and 42.83% F1 for WNUT-2017.", "keywords": "Low-resource;Named Entity Recognition", "primary_area": "", "supplementary_material": "", "author": "Joey Tianyi Zhou;Hao Zhang;Di Jin;Hongyuan Zhu;Rick Siow Mong Goh;Kenneth Kwok", "authorids": "joey.tianyi.zhou@gmail.com;isaac.changhau@gmail.com;jindi15@mit.edu;hongyuanzhu.cn@gmail.com;gohsm@ihpc.a-star.edu.sg;kenkwok@ihpc.a-star.edu.sg", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nzhou2019datnet,\ntitle={{DATN}et: Dual Adversarial Transfer for Low-resource Named Entity Recognition},\nauthor={Joey Tianyi Zhou and Hao Zhang and Di Jin and Hongyuan Zhu and Rick Siow Mong Goh and Kenneth Kwok},\nyear={2019},\nurl={https://openreview.net/forum?id=HkGzUjR5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkGzUjR5tQ", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;5;4", "wc_review": "364;526;466", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "456;749;653", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 452.0, 66.87301398920195 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 619.3333333333334, 121.96265366450866 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1257760774122733034&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HkM3vjCcF7", "title": "Multi-Scale Stacked Hourglass Network for Human Pose Estimation", "track": "main", "status": "Reject", "tldr": "Differentiated inputs cause functional differentiation of the network, and the interaction of loss functions between networks can affect the optimization process.", "abstract": "The stacked hourglass network has become an important model for human pose estimation. The estimation of human body posture depends on the global information of keypoint types and the local information of keypoint locations. The consistent processing of inputs and constraints makes it difficult to form differentiated and determined collaboration mechanisms for each stacked hourglass network. In this paper, we propose a Multi-Scale Stacked Hourglass (MSSH) network to highlight the differentiation capabilities of each hourglass network for human pose estimation. The pre-processing network forms feature maps of different scales, and dispatches them to various locations of the stacked hourglass network, where the small-scale features reach the front of the stacked hourglass network and the large-scale features reach the rear. A new loss function is also proposed for the multi-scale stacked hourglass network. Different keypoints have different loss-function weight coefficients at different scales, and these keypoint weight coefficients are dynamically adjusted from the top-level hourglass network to the bottom-level hourglass network.
Experimental results show that the proposed method is competitive with the comparison algorithms on the MPII and LSP datasets.", "keywords": "Human pose estimation;Hourglass network;Multi-scale analysis", "primary_area": "", "supplementary_material": "", "author": "Chunsheng Guo;Wenlong Du;Na Ying", "authorids": "guo.chsh@gmail.com;dwl1993@hdu.edu.cn;yingna@hdu.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nguo2019multiscale,\ntitle={Multi-Scale Stacked Hourglass Network for Human Pose Estimation},\nauthor={Chunsheng Guo and Wenlong Du and Na Ying},\nyear={2019},\nurl={https://openreview.net/forum?id=HkM3vjCcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkM3vjCcF7", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;5;4", "wc_review": "288;538;102", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 309.3333333333333, 178.63432543109462 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16738140620301526557&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HkMlGnC9KQ", "title": "On Regularization and Robustness of Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this work, we study the connection between regularization and robustness of deep neural networks by viewing them as elements of a reproducing kernel Hilbert space (RKHS) of functions and by regularizing them using the RKHS norm. Even though this norm cannot be computed, we consider various approximations based on upper and lower bounds. These approximations lead to new strategies for regularization, but also to existing ones such as spectral norm penalties or constraints, gradient penalties, or adversarial training. Besides, the kernel framework allows us to obtain margin-based bounds on adversarial generalization. We show that our new algorithms lead to empirical benefits for learning on small datasets and learning adversarially robust models.
We also discuss implications of our regularization framework for learning implicit generative models.", "keywords": "regularization;robustness;deep learning;convolutional networks;kernel methods", "primary_area": "", "supplementary_material": "", "author": "Alberto Bietti*;Gr\u00e9goire Mialon*;Julien Mairal", "authorids": "alberto.bietti@inria.fr;gregoire.mialon@inria.fr;julien.mairal@inria.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbietti*2019on,\ntitle={On Regularization and Robustness of Deep Neural Networks},\nauthor={Alberto Bietti* and Gr\u00e9goire Mialon* and Julien Mairal},\nyear={2019},\nurl={https://openreview.net/forum?id=HkMlGnC9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkMlGnC9KQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;2", "wc_review": "652;298;332", "wc_reply_reviewers": "0;88;0", "wc_reply_authors": "738;327;267", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 427.3333333333333, 159.46856186164788 ], "wc_reply_reviewers_avg": [ 29.333333333333332, 41.48359782961079 ], "wc_reply_authors_avg": [ 444.0, 209.3274946107176 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16277148874385127496&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HkMwHsCctm", "title": "Principled Deep Neural Network Training through Linear Programming", "track": "main", "status": "Reject", "tldr": "Using linear programming we show that the computational complexity of approximate Deep Neural Network training depends polynomially on the data size for several architectures", "abstract": "Deep Learning has received significant attention due to its impressive performance in many state-of-the-art learning tasks. Unfortunately, while very powerful, Deep Learning is not well understood theoretically and in particular only recently results for the complexity of training deep neural networks have been obtained. In this work we show that large classes of deep neural networks with various architectures (e.g., DNNs, CNNs, Binary Neural Networks, and ResNets), activation functions (e.g., ReLUs and leaky ReLUs), and loss functions (e.g., Hinge loss, Euclidean loss, etc) can be trained to near optimality with desired target accuracy using linear programming in time that is exponential in the input data and parameter space dimension and polynomial in the size of the data set; improvements of the dependence in the input dimension are known to be unlikely assuming $P\\neq NP$, and improving the dependence on the parameter space dimension remains open. In particular, we obtain polynomial time algorithms for training for a given fixed network architecture. 
Our work applies more broadly to empirical risk minimization problems which allows us to generalize various previous results and obtain new complexity results for previously unstudied architectures in the proper learning setting.", "keywords": "deep learning theory;neural network training;empirical risk minimization;non-convex optimization;treewidth", "primary_area": "", "supplementary_material": "", "author": "Daniel Bienstock;Gonzalo Mu\u00f1oz;Sebastian Pokutta", "authorids": "dano@columbia.edu;gonzalo.munoz@polymtl.ca;sebastian.pokutta@isye.gatech.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbienstock2019principled,\ntitle={Principled Deep Neural Network Training through Linear Programming},\nauthor={Daniel Bienstock and Gonzalo Mu\u00f1oz and Sebastian Pokutta},\nyear={2019},\nurl={https://openreview.net/forum?id=HkMwHsCctm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkMwHsCctm", "pdf_size": 0, "rating": "6;6;8", "confidence": "3;4;3", "wc_review": "297;602;313", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "477;1444;381", "reply_reviewers": "0;0;0", "reply_authors": "1;4;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 404.0, 140.1594330277726 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 767.3333333333334, 480.07800292128456 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7164053892966029918&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "title": "Meta-Learning Update Rules for Unsupervised Representation Learning", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/809", "id": "HkNDsiC9KQ", "author_site": "Luke Metz, Niru Maheswaranathan, Brian Cheung, Jascha Sohl-Dickstein", "tldr": "We learn an unsupervised learning algorithm that produces useful representations from a set of supervised tasks. At test-time, we apply this algorithm to new tasks without any supervision and show performance comparable to a VAE.", "abstract": "A major goal of unsupervised learning is to discover data representations that are useful for subsequent tasks, without access to supervised labels during training. Typically, this involves minimizing a surrogate objective, such as the negative log likelihood of a generative model, with the hope that representations useful for subsequent tasks will arise as a side effect. In this work, we propose instead to directly target later desired tasks by meta-learning an unsupervised learning rule which leads to representations useful for those tasks. Specifically, we target semi-supervised classification performance, and we meta-learn an algorithm -- an unsupervised weight update rule -- that produces representations useful for this task. Additionally, we constrain our unsupervised update rule to a be a biologically-motivated, neuron-local function, which enables it to generalize to different neural network architectures, datasets, and data modalities. 
We show that the meta-learned update rule produces useful features and sometimes outperforms existing unsupervised learning techniques. We further show that the meta-learned unsupervised update rule generalizes to train networks with different widths, depths, and nonlinearities. It also generalizes to train on data with randomly permuted input dimensions and even generalizes from image datasets to a text task.", "keywords": "Meta-learning;unsupervised learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Luke Metz;Niru Maheswaranathan;Brian Cheung;Jascha Sohl-Dickstein", "authorids": "lmetz@google.com;nirum@google.com;bcheung@berkeley.edu;jaschasd@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmetz2018learning,\ntitle={Learning Unsupervised Learning Rules},\nauthor={Luke Metz and Niru Maheswaranathan and Brian Cheung and Jascha Sohl-Dickstein},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkNDsiC9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "8;8;8", "confidence": "3;3;4", "wc_review": "247;251;357", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "229;556;351", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 285.0, 50.93787065304817 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 378.6666666666667, 134.92302332153034 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 150, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5989711063339819997&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HkNDsiC9KQ", "pdf": "https://openreview.net/pdf?id=HkNDsiC9KQ", "email": ";;;", "author_num": 4 }, { "title": "Learning Recurrent Binary/Ternary Weights", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/652", "id": "HkNGYjR9FX", "author_site": "Arash Ardakani, Zhengyun Ji, Sean Smithson, Brett Meyer, Warren Gross", "tldr": "We propose high-performance LSTMs with binary/ternary weights, that can greatly reduce implementation complexity", "abstract": "Recurrent neural networks (RNNs) have shown excellent performance in processing sequence data. However, they are both complex and memory intensive due to their recursive nature. These limitations make RNNs difficult to embed on mobile devices requiring real-time processes with limited hardware resources. To address the above issues, we introduce a method that can learn binary and ternary weights during the training phase to facilitate hardware implementations of RNNs. As a result, using this approach replaces all multiply-accumulate operations by simple accumulations, bringing significant benefits to custom hardware in terms of silicon area and power consumption. On the software side, we evaluate the performance (in terms of accuracy) of our method using long short-term memories (LSTMs) and gated recurrent units (GRUs) on various sequential models including sequence classification and language modeling. 
We demonstrate that our method achieves competitive results on the aforementioned tasks while using binary/ternary weights during the runtime. On the hardware side, we present custom hardware for accelerating the recurrent computations of LSTMs with binary/ternary weights. Ultimately, we show that LSTMs with binary/ternary weights can achieve up to 12x memory saving and 10x inference speedup compared to the full-precision hardware implementation design.", "keywords": "Quantized Recurrent Neural Network;Hardware Implementation;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Arash Ardakani;Zhengyun Ji;Sean C. Smithson;Brett H. Meyer;Warren J. Gross", "authorids": "arash.ardakani@mail.mcgill.ca;zhengyun.ji@mail.mcgill.ca;sean.smithson@mail.mcgill.ca;brett.meyer@mcgill.ca;warren.gross@mcgill.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nardakani2018learning,\ntitle={Learning Recurrent Binary/Ternary Weights},\nauthor={Arash Ardakani and Zhengyun Ji and Sean C. Smithson and Brett H. Meyer and Warren J. Gross},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkNGYjR9FX},\n}", "github": "[![github](/images/github_icon.svg) arashardakani/Learning-Recurrent-Binary-Ternary-Weights](https://github.com/arashardakani/Learning-Recurrent-Binary-Ternary-Weights)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;3", "wc_review": "423;497;403", "wc_reply_reviewers": "0;18;0", "wc_reply_authors": "1290;1262;941", "reply_reviewers": "0;1;0", "reply_authors": "3;3;2", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 441.0, 40.431011200150145 ], "wc_reply_reviewers_avg": [ 6.0, 8.48528137423857 ], "wc_reply_authors_avg": [ 1164.3333333333333, 158.3336842101375 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14324986620118227094&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HkNGYjR9FX", "pdf": "https://openreview.net/pdf?id=HkNGYjR9FX", "email": ";;;;", "author_num": 5 }, { "id": "HkNN7nR5Ym", "title": "Associate Normalization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Normalization is a key technique for training deep neural networks. It improves the stability of the training process and thus makes the networks easier to train. However, in typical normalization methods, the rescaling parameters that control the mean and variance of the output do not associate with any input information during the forward phase. Therefore, inputs of different types are treated as from the exact same distribution, which may limit the feature expressiveness of normalization module. We present Associate Normalization (AssocNorm) to overcome the above limitation. AssocNorm extracts the key information from input features and connects them with rescaling parameters by an auto-encoder-like neural network in the normalization module. 
Furthermore, AssocNorm normalizes the features of each example individually, so the accuracy is relatively stable for different batch sizes. The experimental results show that AssocNorm achieves better performance than Batch Normalization on several benchmark datasets under various hyper-parameter settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Song-Hao Jia;Ding-Jie Chen;Hwann-Tzong Chen", "authorids": "gasoonjia@icloud.com;djchen.tw@gmail.com;htchen@cs.nthu.edu.tw", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkNN7nR5Ym", "pdf_size": 0, "rating": "2;3;5", "confidence": "5;5;4", "wc_review": "190;154;77", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 140.33333333333334, 47.13338048088165 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Learning concise representations for regression by evolving networks of trees", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/881", "id": "Hke-JhA9Y7", "author_site": "William La Cava, Tilak Raj Singh, Srinivas Suri, Srinivas Suri", "tldr": "Representing the network architecture as a set of syntax trees and optimizing their structure leads to accurate and concise regression models. ", "abstract": "We propose and study a method for learning interpretable representations for the task of regression. Features are represented as networks of multi-type expression trees comprised of activation functions common in neural networks in addition to other elementary functions. Differentiable features are trained via gradient descent, and the performance of features in a linear model is used to weight the rate of change among subcomponents of each representation. The search process maintains an archive of representations with accuracy-complexity trade-offs to assist in generalization and interpretation. We compare several stochastic optimization approaches within this framework. We benchmark these variants on 100 open-source regression problems in comparison to state-of-the-art machine learning approaches. Our main finding is that this approach produces the highest average test scores across problems while producing representations that are orders of magnitude smaller than the next best performing method (gradient boosting). We also report a negative result in which attempts to directly optimize the disentanglement of the representation result in more highly correlated features.", "keywords": "regression;stochastic optimization;evolutionary compution;feature engineering", "primary_area": "", "supplementary_material": "", "author": "William La Cava;Tilak Raj Singh;James Taggart;Srinivas Suri;Jason H. 
Moore", "authorids": ";tilakraj@seas.upenn.edu;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ncava2018learning,\ntitle={Learning concise representations for regression by evolving networks of trees},\nauthor={William La Cava and Tilak Raj Singh and James Taggart and Srinivas Suri and Jason Moore},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hke-JhA9Y7},\n}", "github": "[![github](/images/github_icon.svg) lacava/feat](https://github.com/lacava/feat) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=Hke-JhA9Y7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;1;4", "wc_review": "173;284;280", "wc_reply_reviewers": "41;0;0", "wc_reply_authors": "548;133;291", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 2.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 245.66666666666666, 51.409035090039275 ], "wc_reply_reviewers_avg": [ 13.666666666666666, 19.3275853524323 ], "wc_reply_authors_avg": [ 324.0, 171.02241568480628 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.3273268353539886, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8620889637656985656&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Hke-JhA9Y7", "pdf": "https://openreview.net/pdf?id=Hke-JhA9Y7", "email": ";;;;", "author_num": 5 }, { "title": "Efficient Training on Very Large Corpora via Gramian Estimation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/890", "id": "Hke20iA9Y7", "author_site": "Walid Krichene, Nicolas Mayoraz, Steffen Rendle, Li Zhang, Xinyang Yi, Lichan Hong, Ed H. Chi, John Anderson", "tldr": "We develop efficient methods to train neural embedding models with a dot-product structure, by reformulating the objective function in terms of generalized Gram matrices, and maintaining estimates of those matrices.", "abstract": "We study the problem of learning similarity functions over very large corpora using neural network embedding models. These models are typically trained using SGD with random sampling of unobserved pairs, with a sample size that grows quadratically with the corpus size, making it expensive to scale.\nWe propose new efficient methods to train these models without having to sample unobserved pairs. Inspired by matrix factorization, our approach relies on adding a global quadratic penalty and expressing this term as the inner-product of two generalized Gramians. We show that the gradient of this term can be efficiently computed by maintaining estimates of the Gramians, and develop variance reduction schemes to improve the quality of the estimates. 
We conduct large-scale experiments that show a significant improvement both in training time and generalization performance compared to sampling methods.", "keywords": "similarity learning;pairwise learning;matrix factorization;Gramian estimation;variance reduction;neural embedding models;recommender systems", "primary_area": "", "supplementary_material": "", "author": "Walid Krichene;Nicolas Mayoraz;Steffen Rendle;Li Zhang;Xinyang Yi;Lichan Hong;Ed Chi;John Anderson", "authorids": "walidk@google.com;nmayoraz@google.com;srendle@google.com;liqzhang@google.com;xinyang@google.com;lichan@google.com;edchi@google.com;janders@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nkrichene2018efficient,\ntitle={Efficient Training on Very Large Corpora via Gramian Estimation},\nauthor={Walid Krichene and Nicolas Mayoraz and Steffen Rendle and Li Zhang and Xinyang Yi and Lichan Hong and Ed Chi and John Anderson},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hke20iA9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;8", "confidence": "2;4;4", "wc_review": "282;322;164", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "147;174;354", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 256.0, 67.07210050883054 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 225.0, 91.88035698668132 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14969655596866173703&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Hke20iA9Y7", "pdf": "https://openreview.net/pdf?id=Hke20iA9Y7", "email": ";;;;;;;", "author_num": 8 }, { "title": "MAE: Mutual Posterior-Divergence Regularization for Variational AutoEncoders", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/725", "id": "Hke4l2AcKQ", "author_site": "Xuezhe Ma, Chunting Zhou, Eduard Hovy", "tldr": "", "abstract": "Variational Autoencoder (VAE), a simple and effective deep generative model, has led to a number of impressive empirical successes and spawned many advanced variants and theoretical investigations. However, recent studies demonstrate that, when equipped with expressive generative distributions (aka. decoders), VAE suffers from learning uninformative latent representations with the observation called KL Varnishing, in which case VAE collapses into an unconditional generative model. 
In this work, we introduce mutual posterior-divergence regularization, a novel regularization that is able to control the geometry of the latent space to accomplish meaningful representation learning, while achieving comparable or superior capability of density estimation. Experiments on three image benchmark datasets demonstrate that, when equipped with powerful decoders, our model performs well both on density estimation and representation learning.", "keywords": "VAE;regularization;auto-regressive", "primary_area": "", "supplementary_material": "", "author": "Xuezhe Ma;Chunting Zhou;Eduard Hovy", "authorids": "xuezhem@cs.cmu.edu;ctzhou@cs.cmu.edu;ehovy@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nma2018mae,\ntitle={{MAE}: Mutual Posterior-Divergence Regularization for Variational AutoEncoders},\nauthor={Xuezhe Ma and Chunting Zhou and Eduard Hovy},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hke4l2AcKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;5", "wc_review": "516;468;500", "wc_reply_reviewers": "70;31;204", "wc_reply_authors": "362;151;418", "reply_reviewers": "1;1;1", "reply_authors": "2;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 494.6666666666667, 19.955506062794353 ], "wc_reply_reviewers_avg": [ 101.66666666666667, 74.0915349790754 ], "wc_reply_authors_avg": [ 310.3333333333333, 114.96182941403734 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1538532826408236978&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Hke4l2AcKQ", "pdf": "https://openreview.net/pdf?id=Hke4l2AcKQ", "email": ";;", "author_num": 3 }, { "id": "Hke8Do0cF7", "title": "Deep processing of structured data", "track": "main", "status": "Reject", "tldr": "General framework of learning representation of structured inputs.", "abstract": "We construct a general unified framework for learning representation of structured\ndata, i.e. data which cannot be represented as the fixed-length vectors (e.g. sets,\ngraphs, texts or images of varying sizes). The key factor is played by an intermediate\nnetwork called SAN (Set Aggregating Network), which maps a structured\nobject to a fixed length vector in a high dimensional latent space. Our main theoretical\nresult shows that for sufficiently large dimension of the latent space, SAN is\ncapable of learning a unique representation for every input example. Experiments\ndemonstrate that replacing pooling operation by SAN in convolutional networks\nleads to better results in classifying images with different sizes.
Moreover, its direct\napplication to text and graph data allows to obtain results close to SOTA, by\nsimpler networks with smaller number of parameters than competitive models.", "keywords": "structured data;representation learning;deep neural networks", "primary_area": "", "supplementary_material": "", "author": "\u0141ukasz Maziarka;Marek \u015amieja;Aleksandra Nowak;Jacek Tabor;\u0141ukasz Struski;Przemys\u0142aw Spurek", "authorids": "l.maziarka@gmail.com;marek.smieja@uj.edu.pl;aknoow@gmail.com;jacek.tabor@uj.edu.pl;lukasz.struski@uj.edu.pl;przemyslaw.spurek@uj.edu.pl", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nmaziarka2019deep,\ntitle={Deep processing of structured data},\nauthor={\u0141ukasz Maziarka and Marek \u015amieja and Aleksandra Nowak and Jacek Tabor and \u0141ukasz Struski and Przemys\u0142aw Spurek},\nyear={2019},\nurl={https://openreview.net/forum?id=Hke8Do0cF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hke8Do0cF7", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;4;3", "wc_review": "514;183;877", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 524.6666666666666, 283.42469115367413 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Residual Non-local Attention Networks for Image Restoration", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/795", "id": "HkeGhoA5FX", "author_site": "Yulun Zhang, Kunpeng Li, Kai Li, Bineng Zhong, Yun Fu", "tldr": "New state-of-the-art framework for image restoration", "abstract": "In this paper, we propose a residual non-local attention network for high-quality image restoration. Without considering the uneven distribution of information in the corrupted images, previous methods are restricted by local convolutional operation and equal treatment of spatial- and channel-wise features. To address this issue, we design local and non-local attention blocks to extract features that capture the long-range dependencies between pixels and pay more attention to the challenging parts. Specifically, we design trunk branch and (non-)local mask branch in each (non-)local attention block. The trunk branch is used to extract hierarchical features. Local and non-local mask branches aim to adaptively rescale these hierarchical features with mixed attentions. The local mask branch concentrates on more local structures with convolutional operations, while non-local attention considers more about long-range dependencies in the whole feature map. Furthermore, we propose residual local and non-local attention learning to train the very deep network, which further enhance the representation ability of the network. Our proposed method can be generalized for various image restoration applications, such as image denoising, demosaicing, compression artifacts reduction, and super-resolution. 
Experiments demonstrate that our method obtains comparable or better results compared with recently leading methods quantitatively and visually.", "keywords": "Non-local network;attention network;image restoration;residual learning", "primary_area": "", "supplementary_material": "", "author": "Yulun Zhang;Kunpeng Li;Kai Li;Bineng Zhong;Yun Fu", "authorids": "yulun100@gmail.com;kunpengli@ece.neu.edu;li.kai.gml@gmail.com;bnzhong@hqu.edu.cn;yunfu@ece.neu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzhang2018residual,\ntitle={Residual Non-local Attention Networks for Image Restoration},\nauthor={Yulun Zhang and Kunpeng Li and Kai Li and Bineng Zhong and Yun Fu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkeGhoA5FX},\n}", "github": "[![github](/images/github_icon.svg) yulunzhang/RNAN](https://github.com/yulunzhang/RNAN) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=HkeGhoA5FX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;5", "wc_review": "205;563;241", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "796;1655;682", "reply_reviewers": "0;0;0", "reply_authors": "2;3;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 336.3333333333333, 160.94995771633145 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1044.3333333333333, 434.3073668370005 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 904, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5425381515618577679&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HkeGhoA5FX", "pdf": "https://openreview.net/pdf?id=HkeGhoA5FX", "email": ";;;;", "author_num": 5 }, { "id": "HkeILsRqFQ", "title": "An experimental study of layer-level training speed and its impact on generalization", "track": "main", "status": "Reject", "tldr": "This paper provides empirical evidence that 1) the speed at which each layer trains influences generalization and 2) this phenomenon is at the root of weight decay's and adaptive gradient methods' impact on generalization.", "abstract": "How optimization influences the generalization ability of a DNN is still an active area of research. This work aims to unveil and study a factor of influence: the speed at which each layer trains. In our preliminary work, we develop a visualization technique and an optimization algorithm to monitor and control the layer rotation rate, a tentative measure of layer-level training speed, and show that it has a remarkably consistent and substantial impact on generalization. Our experiments further suggest that weight decay's and adaptive gradients methods' impact on both generalization performance and speed of convergence are solely due to layer rotation rate changes compared to vanilla SGD, offering a novel interpretation of these widely used techniques, and providing supplementary evidence that layer-level training speed indeed impacts generalization. 
Besides these fundamental findings, we also expect that on a practical level, the tools we introduce will reduce the meta-parameter tuning required to get the best generalization out of a deep network.", "keywords": "generalization;optimization;vanishing gradients;experimental;fundamental research", "primary_area": "", "supplementary_material": "", "author": "Simon Carbonnelle;Christophe De Vleeschouwer", "authorids": "simon.carbonnelle@uclouvain.be;christophe.devleeschouwer@uclouvain.be", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ncarbonnelle2019an,\ntitle={An experimental study of layer-level training speed and its impact on generalization},\nauthor={Simon Carbonnelle and Christophe De Vleeschouwer},\nyear={2019},\nurl={https://openreview.net/forum?id=HkeILsRqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HkeILsRqFQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "2;4;3", "wc_review": "301;1100;275", "wc_reply_reviewers": "0;682;0", "wc_reply_authors": "274;1812;396", "reply_reviewers": "0;2;0", "reply_authors": "1;4;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 558.6666666666666, 382.92761137786283 ], "wc_reply_reviewers_avg": [ 227.33333333333334, 321.4978831794836 ], "wc_reply_authors_avg": [ 827.3333333333334, 698.0436153199089 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9AAziAvkkNoJ:scholar.google.com/&scioq=An+experimental+study+of+layer-level+training+speed+and+its+impact+on+generalization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HkeKVh05Fm", "title": "Multi-Grained Entity Proposal Network for Named Entity Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we focus on a new Named Entity Recognition (NER) task, i.e., the Multi-grained NER task. This task aims to simultaneously detect both fine-grained and coarse-grained entities in sentences. Correspondingly, we develop a novel Multi-grained Entity Proposal Network (MGEPN). Different from traditional NER models which regard NER as a sequential labeling task, MGEPN provides a new method that proposes entity candidates in the Proposal Network and classifies entities into different categories in the Classification Network. All possible entity candidates including fine-grained ones and coarse-grained ones are proposed in the Proposal Network, which enables the MGEPN model to identify multi-grained entities. In order to better identify named entities and determine their categories, context information is utilized and transferred from the Proposal Network to the Classification Network during the learning process. A novel Entity-Context attention mechanism is also introduced to help the model focus on entity-related context information. 
Experiments show that our model can obtain state-of-the-art performance on two real-world datasets for both the Multi-grained NER task and the traditional NER task.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Congying Xia;Chenwei Zhang;Tao Yang;Yaliang Li;Nan Du;Xian Wu;Wei Fan;Fenglong Ma;Philip S. Yu", "authorids": "cxia8@uic.edu;czhang99@uic.edu;tytaoyang@tencent.com;yaliangli@tencent.com;ndu@tencent.com;kevinxwu@tencent.com;davidwfan@tencent.com;fenglong@buffalo.edu;psyu@uic.edu", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@misc{\nxia2019multigrained,\ntitle={Multi-Grained Entity Proposal Network for Named Entity Recognition},\nauthor={Congying Xia and Chenwei Zhang and Tao Yang and Yaliang Li and Nan Du and Xian Wu and Wei Fan and Fenglong Ma and Philip S. Yu},\nyear={2019},\nurl={https://openreview.net/forum?id=HkeKVh05Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkeKVh05Fm", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "wc_review": "89;334;503", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "153;320;628", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 308.6666666666667, 169.96143353387228 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 367.0, 196.74518206722792 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2275420295015487239&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HkeWSnR5Y7", "title": "Provable Defenses against Spatially Transformed Adversarial Inputs: Impossibility and Possibility Results", "track": "main", "status": "Reject", "tldr": "", "abstract": "One intriguing property of neural networks is their inherent vulnerability to adversarial inputs, which are maliciously crafted samples to trigger target networks to misbehave. The state-of-the-art attacks generate adversarial inputs using either pixel perturbation or spatial transformation. Thus far, several provable defenses have been proposed against pixel perturbation-based attacks; yet, little is known about whether such solutions exist for spatial transformation-based attacks. This paper bridges this striking gap by conducting the first systematic study on provable defenses against spatially transformed adversarial inputs. Our findings convey mixed messages. On the impossibility side, we show that such defenses may not exist in practice: for any given networks, it is possible to find legitimate inputs and imperceptible transformations to generate adversarial inputs that force arbitrarily large errors. On the possibility side, we show that it is still feasible to construct adversarial training methods to significantly improve the resilience of networks against adversarial inputs over empirical datasets. 
We believe our findings provide insights for designing more effective defenses against spatially transformed adversarial inputs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyang Zhang;Yifan Huang;Chanh Nguyen;Shouling Ji;Ting Wang", "authorids": "xizc15@lehigh.edu;yih319@lehigh.edu;cpn217@lehigh.edu;sji@zju.edu.cn;inbox.ting@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhang2019provable,\ntitle={Provable Defenses against Spatially Transformed Adversarial Inputs: Impossibility and Possibility Results},\nauthor={Xinyang Zhang and Yifan Huang and Chanh Nguyen and Shouling Ji and Ting Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=HkeWSnR5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HkeWSnR5Y7", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;3;3", "wc_review": "332;140;258", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 243.33333333333334, 79.06677908929612 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5Vd_EImdewYJ:scholar.google.com/&scioq=Provable+Defenses+against+Spatially+Transformed+Adversarial+Inputs:+Impossibility+and+Possibility+Results&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HkekMnR5Ym", "title": "Meta-Learning Neural Bloom Filters", "track": "main", "status": "Reject", "tldr": "We investigate the space efficiency of memory-augmented neural nets when learning set membership.", "abstract": "There has been a recent trend in training neural networks to replace data structures that have been crafted by hand, with an aim for faster execution, better accuracy, or greater compression. In this setting, a neural data structure is instantiated by training a network over many epochs of its inputs until convergence. In many applications this expensive initialization is not practical, for example streaming algorithms --- where inputs are ephemeral and can only be inspected a small number of times. In this paper we explore the learning of approximate set membership over a stream of data in one-shot via meta-learning. 
We propose a novel memory architecture, the Neural Bloom Filter, which we show to be more compressive than Bloom Filters and several existing memory-augmented neural networks in scenarios of skewed data or structured sets.", "keywords": "meta-learning;memory;one-shot learning;bloom filter;set membership;familiarity;compression", "primary_area": "", "supplementary_material": "", "author": "Jack W Rae;Sergey Bartunov;Timothy P Lillicrap", "authorids": "jwrae@google.com;bartunov@google.com;countzero@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nrae2019metalearning,\ntitle={Meta-Learning Neural Bloom Filters},\nauthor={Jack W Rae and Sergey Bartunov and Timothy P Lillicrap},\nyear={2019},\nurl={https://openreview.net/forum?id=HkekMnR5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkekMnR5Ym", "pdf_size": 0, "rating": "3;6;7", "confidence": "1;4;3", "wc_review": "251;722;320", "wc_reply_reviewers": "0;185;0", "wc_reply_authors": "537;1023;1001", "reply_reviewers": "0;1;0", "reply_authors": "1;2;2", "rating_avg": [ 5.333333333333333, 1.699673171197595 ], "confidence_avg": [ 2.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 431.0, 207.6872648960451 ], "wc_reply_reviewers_avg": [ 61.666666666666664, 87.20983634634086 ], "wc_reply_authors_avg": [ 853.6666666666666, 224.0972011328021 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8386278693775348, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5792515027365493091&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "Hkemdj09YQ", "title": "Rectified Gradient: Layer-wise Thresholding for Sharp and Coherent Attribution Maps", "track": "main", "status": "Reject", "tldr": "We propose a new attribution method that removes noise from saliency maps through layer-wise thresholding during backpropagation.", "abstract": "Saliency map, or the gradient of the score function with respect to the input, is the most basic means of interpreting deep neural network decisions. However, saliency maps are often visually noisy. Although several hypotheses were proposed to account for this phenomenon, there is no work that provides a rigorous analysis of noisy saliency maps. This may be a problem as numerous advanced attribution methods were proposed under the assumption that the existing hypotheses are true. In this paper, we identify the cause of noisy saliency maps. Then, we propose Rectified Gradient, a simple method that significantly improves saliency maps by alleviating that cause. Experiments showed effectiveness of our method and its superiority to other attribution methods. 
Codes and examples for the experiments will be released in public.", "keywords": "Interpretability;Attribution Method;Attribution Map", "primary_area": "", "supplementary_material": "", "author": "Beomsu Kim;Junghoon Seo;Jeongyeol Choe;Jamyoung Koo;Seunghyeon Jeon;Taegyun Jeon", "authorids": "1202kbs@gmail.com;sjh@satreci.com;cjy@si-analytics.ai;jmkoo@si-analytics.ai;jsh@satreci.com;tgjeon@si-analytics.ai", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nkim2019rectified,\ntitle={Rectified Gradient: Layer-wise Thresholding for Sharp and Coherent Attribution Maps},\nauthor={Beomsu Kim and Junghoon Seo and Jeongyeol Choe and Jamyoung Koo and Seunghyeon Jeon and Taegyun Jeon},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkemdj09YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hkemdj09YQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;5", "wc_review": "559;336;335", "wc_reply_reviewers": "200;0;0", "wc_reply_authors": "2774;1229;1721", "reply_reviewers": "2;0;0", "reply_authors": "5;2;3", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 410.0, 105.35970134100924 ], "wc_reply_reviewers_avg": [ 66.66666666666667, 94.28090415820634 ], "wc_reply_authors_avg": [ 1908.0, 644.4548083457831 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.3333333333333335, 1.247219128924647 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:B_1x2dtfPuQJ:scholar.google.com/&scioq=Rectified+Gradient:+Layer-wise+Thresholding+for+Sharp+and+Coherent+Attribution+Maps&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Meta-Learning For Stochastic Gradient MCMC", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1074", "id": "HkeoOo09YX", "author_site": "Wenbo Gong, Yingzhen Li, Jos\u00e9 Miguel Hern\u00e1ndez Lobato", "tldr": "This paper proposes a method to automate the design of stochastic gradient MCMC proposal using meta learning approach. ", "abstract": "Stochastic gradient Markov chain Monte Carlo (SG-MCMC) has become increasingly popular for simulating posterior samples in large-scale Bayesian modeling. However, existing SG-MCMC schemes are not tailored to any specific probabilistic model, even a simple modification of the underlying dynamical system requires significant physical intuition. This paper presents the first meta-learning algorithm that allows automated design for the underlying continuous dynamics of an SG-MCMC sampler. The learned sampler generalizes Hamiltonian dynamics with state-dependent drift and diffusion, enabling fast traversal and efficient exploration of energy landscapes. 
Experiments validate the proposed approach on Bayesian fully connected neural network, Bayesian convolutional neural network and Bayesian recurrent neural network tasks, showing that the learned sampler outperforms generic, hand-designed SG-MCMC algorithms, and generalizes to different datasets and larger architectures.", "keywords": "Meta Learning;MCMC", "primary_area": "", "supplementary_material": "", "author": "Wenbo Gong;Yingzhen Li;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", "authorids": "wg242@cam.ac.uk;yl494@cam.ac.uk;jmh233@cam.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngong2018metalearning,\ntitle={Meta-Learning For Stochastic Gradient {MCMC}},\nauthor={Wenbo Gong and Yingzhen Li and Jos\u00e9 Miguel Hern\u00e1ndez-Lobato},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkeoOo09YX},\n}", "github": "[![github](/images/github_icon.svg) WenboGong/MetaSGMCMC](https://github.com/WenboGong/MetaSGMCMC)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "wc_review": "296;369;559", "wc_reply_reviewers": "87;9;0", "wc_reply_authors": "607;714;518", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 408.0, 110.85425867627579 ], "wc_reply_reviewers_avg": [ 32.0, 39.06404996924922 ], "wc_reply_authors_avg": [ 613.0, 80.12906255951499 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5266885862075190072&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HkeoOo09YX", "pdf": "https://openreview.net/pdf?id=HkeoOo09YX", "email": ";;", "author_num": 3 }, { "id": "Hkes0iR9KX", "title": "DEEP GEOMETRICAL GRAPH CLASSIFICATION", "track": "main", "status": "Reject", "tldr": "The graph analysis problem is transformed into a point cloud analysis problem. ", "abstract": "Most of the existing Graph Neural Networks (GNNs) are the mere extension of the Convolutional Neural Networks (CNNs) to graphs. Generally, they consist of several steps of message passing between the nodes followed by a global indiscriminate feature pooling function. In many data-sets, however, the nodes are unlabeled or their labels provide no information about the similarity between the nodes and the locations of the nodes in the graph. Accordingly, message passing may not propagate helpful information throughout the graph. We show that this conventional approach can fail to learn to perform even simple graph classification tasks. We alleviate this serious shortcoming of the GNNs by making them a two step method. In the first of the proposed approach, a graph embedding algorithm is utilized to obtain a continuous feature vector for each node of the graph. The embedding algorithm represents the graph as a point-cloud in the embedding space. In the second step, the GNN is applied to the point-cloud representation of the graph provided by the embedding method. 
The GNN learns to perform the given task by inferring the topological structure of the graph encoded in the spatial distribution of the embedded vectors. In addition, we extend the proposed approach to the graph clustering problem and a new architecture for graph clustering is proposed. Moreover, the spatial representation of the graph is utilized to design a graph pooling algorithm. We turn the problem of graph down-sampling into a column sampling problem, i.e., the sampling algorithm selects a subset of the nodes whose feature vectors preserve the spatial distribution of all the feature vectors. We apply the proposed approach to several popular benchmark data-sets and it is shown that the proposed geometrical approach strongly improves the state-of-the-art result for several data-sets. For instance, for the PTC data-set, we improve the state-of-the-art result for more than 22 %.", "keywords": "Graph classification;Deep Learning;Graph pooling;Embedding", "primary_area": "", "supplementary_material": "", "author": "Mostafa Rahmani;Ping Li", "authorids": "rahmani.sut@gmail.com;pingli98@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nrahmani2019deep,\ntitle={{DEEP} {GEOMETRICAL} {GRAPH} {CLASSIFICATION}},\nauthor={Mostafa Rahmani and Ping Li},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkes0iR9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hkes0iR9KX", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;5", "wc_review": "230;272;700", "wc_reply_reviewers": "45;0;14", "wc_reply_authors": "1051;834;811", "reply_reviewers": "1;0;1", "reply_authors": "2;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 400.6666666666667, 212.3540021337536 ], "wc_reply_reviewers_avg": [ 19.666666666666668, 18.80307303489394 ], "wc_reply_authors_avg": [ 898.6666666666666, 108.12441393547014 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:iG1dSY7drxMJ:scholar.google.com/&scioq=DEEP+GEOMETRICAL+GRAPH+CLASSIFICATION&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "Hkesr205t7", "title": "Learning shared manifold representation of images and attributes for generalized zero-shot learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many of the zero-shot learning methods have realized predicting labels of unseen images by learning the relations between images and pre-defined class-attributes. However, recent studies show that, under the more realistic generalized zero-shot learning (GZSL) scenarios, these approaches severely suffer from the issue of biased prediction, i.e., their classifier tends to predict all the examples from both seen and unseen classes as one of the seen classes. The cause of this problem is that they cannot properly learn a mapping to the representation space generalized to the unseen classes since the training set does not include any unseen class information. 
To solve this, we propose a concept to learn a mapping that embeds both images and attributes to the shared representation space that can be generalized even for unseen classes by interpolating from the information of seen classes, which we refer to shared manifold learning. Furthermore, we propose modality invariant variational autoencoders, which can perform shared manifold learning by training variational autoencoders with both images and attributes as inputs. The empirical validation of well-known datasets in GZSL shows that our method achieves the significantly superior performances to the existing relation-based studies.", "keywords": "zero-shot learning;variational autoencoders", "primary_area": "", "supplementary_material": "", "author": "Masahiro Suzuki;Yusuke Iwasawa;Yutaka Matsuo", "authorids": "masa@weblab.t.u-tokyo.ac.jp;iwasawa@weblab.t.u-tokyo.ac.jp;matsuo@weblab.t.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsuzuki2019learning,\ntitle={Learning shared manifold representation of images and attributes for generalized zero-shot learning},\nauthor={Masahiro Suzuki and Yusuke Iwasawa and Yutaka Matsuo},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkesr205t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hkesr205t7", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;4", "wc_review": "246;314;922", "wc_reply_reviewers": "0;0;434", "wc_reply_authors": "535;342;1310", "reply_reviewers": "0;0;1", "reply_authors": "1;1;3", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 494.0, 303.91226804238534 ], "wc_reply_reviewers_avg": [ 144.66666666666666, 204.58956202330776 ], "wc_reply_authors_avg": [ 729.0, 418.31646712347657 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8187886483548270109&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HketHo0qFm", "title": "Hybrid Policies Using Inverse Rewards for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "A broad-spectrum improvement for reinforcement learning algorithms, which combines the policies using original rewards and inverse (negative) rewards", "abstract": "This paper puts forward a broad-spectrum improvement for reinforcement learning algorithms, which combines the policies using original rewards and inverse (negative) rewards. The policies using inverse rewards are competitive with the original policies, and help the original policies correct their mis-actions. We have proved the convergence of the inverse policies. The experiments for some games in OpenAI gym show that the hybrid polices based on deep Q-learning, double Q-learning, and on-policy actor-critic obtain the rewards up to 63.8%, 97.8%, and 54.7% more than the original algorithms. 
The improved policies are more stable than the original policies as well.", "keywords": "Reinforcement Learning;Rewards", "primary_area": "", "supplementary_material": "", "author": "Yao Shi;Tian Xia;Guanjun Zhao;Xin Gao", "authorids": "yao.shi@huawei.com;xiatian14@huawei.com;zhaoguanjun1@huawei.com;gaoxin17@huawei.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshi2019hybrid,\ntitle={Hybrid Policies Using Inverse Rewards for Reinforcement Learning},\nauthor={Yao Shi and Tian Xia and Guanjun Zhao and Xin Gao},\nyear={2019},\nurl={https://openreview.net/forum?id=HketHo0qFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=HketHo0qFm", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;5", "wc_review": "430;397;289", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 372.0, 60.21627686929839 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jDihpLQfEu0J:scholar.google.com/&scioq=Hybrid+Policies+Using+Inverse+Rewards+for+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HkeyZhC9F7", "title": "Learning Heuristics for Automated Reasoning through Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "RL finds better heuristics for automated reasoning algorithms.", "abstract": "We demonstrate how to learn efficient heuristics for automated reasoning algorithms through deep reinforcement learning. We focus on backtracking search algorithms for quantified Boolean logics, which already can solve formulas of impressive size - up to 100s of thousands of variables. The main challenge is to find a representation of these formulas that lends itself to making predictions in a scalable way. For challenging problems, the heuristic learned through our approach reduces execution time by a factor of 10 compared to the existing handwritten heuristics.", "keywords": "reinforcement learning;deep learning;logics;formal methods;automated reasoning;backtracking search;satisfiability;quantified Boolean formulas", "primary_area": "", "supplementary_material": "", "author": "Gil Lederman;Markus N. Rabe;Edward A. Lee;Sanjit A. Seshia", "authorids": "gilled@berkeley.edu;markus.norman.rabe@gmail.com;eal@berkeley.edu;sseshia@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlederman2019learning,\ntitle={Learning Heuristics for Automated Reasoning through Reinforcement Learning},\nauthor={Gil Lederman and Markus N. Rabe and Edward A. Lee and Sanjit A.
Seshia},\nyear={2019},\nurl={https://openreview.net/forum?id=HkeyZhC9F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkeyZhC9F7", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;4;4", "wc_review": "132;223;274", "wc_reply_reviewers": "0;0;20", "wc_reply_authors": "375;333;550", "reply_reviewers": "0;0;1", "reply_authors": "1;1;3", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 209.66666666666666, 58.73291713813946 ], "wc_reply_reviewers_avg": [ 6.666666666666667, 9.428090415820632 ], "wc_reply_authors_avg": [ 419.3333333333333, 93.97280930614392 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11948805553355803329&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Systematic Generalization: What Is Required and Can It Be Learned?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/777", "id": "HkezXnA9YX", "author_site": "Dzmitry Bahdanau, Shikhar Murty, Mikhail Noukhovitch, Thien H Nguyen, Harm de Vries, Aaron Courville", "tldr": "We show that modular structured models are the best in terms of systematic generalization and that their end-to-end versions don't generalize as well.", "abstract": "Numerous models for grounded language understanding have been recently proposed, including (i) generic models that can be easily adapted to any given task and (ii) intuitively appealing modular models that require background knowledge to be instantiated. We compare both types of models in how much they lend themselves to a particular form of systematic generalization. Using a synthetic VQA test, we evaluate which models are capable of reasoning about all possible object pairs after training on only a small subset of them. Our findings show that the generalization of modular models is much more systematic and that it is highly sensitive to the module layout, i.e. to how exactly the modules are connected. We furthermore investigate if modular models that generalize well could be made more end-to-end by learning their layout and parametrization. We find that end-to-end methods from prior work often learn inappropriate layouts or parametrizations that do not facilitate systematic generalization. 
Our results suggest that, in addition to modularity, systematic generalization in language understanding may require explicit regularizers or priors.\n", "keywords": "systematic generalization;language understanding;visual questions answering;neural module networks", "primary_area": "", "supplementary_material": "", "author": "Dzmitry Bahdanau*;Shikhar Murty*;Michael Noukhovitch;Thien Huu Nguyen;Harm de Vries;Aaron Courville", "authorids": "dimabgv@gmail.com;shikhar.murty@gmail.com;michael.noukhovitch@umontreal.ca;thien@cs.uoregon.edu;mail@harmdevries.com;aaron.courville@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nbahdanau2018systematic,\ntitle={Systematic Generalization: What Is Required and Can It Be Learned?},\nauthor={Dzmitry Bahdanau and Shikhar Murty and Michael Noukhovitch and Thien Huu Nguyen and Harm de Vries and Aaron Courville},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkezXnA9YX},\n}", "github": "[![github](/images/github_icon.svg) rizar/systematic-generalization-sqoop](https://github.com/rizar/systematic-generalization-sqoop) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=HkezXnA9YX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;5;3", "wc_review": "391;999;581", "wc_reply_reviewers": "458;0;63", "wc_reply_authors": "1021;1065;1161", "reply_reviewers": "1;0;1", "reply_authors": "2;2;2", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 657.0, 253.96587697300333 ], "wc_reply_reviewers_avg": [ 173.66666666666666, 202.69243257265975 ], "wc_reply_authors_avg": [ 1082.3333333333333, 58.45416057808793 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 202, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=376953749686735892&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HkezXnA9YX", "pdf": "https://openreview.net/pdf?id=HkezXnA9YX", "email": ";;;;;", "author_num": 6 }, { "id": "HkezfhA5Y7", "title": "A Rate-Distortion Theory of Adversarial Examples", "track": "main", "status": "Reject", "tldr": "We argue that excess capacity is a significant cause of susceptibility to adversarial examples.", "abstract": "The generalization ability of deep neural networks (DNNs) is intertwined with model complexity, robustness, and capacity. Through establishing an equivalence between a DNN and a noisy communication channel, we characterize generalization and fault tolerance for unbounded adversarial attacks in terms of information-theoretic quantities. Invoking rate-distortion theory, we suggest that excess capacity is a significant cause of vulnerability to adversarial examples.", "keywords": "adversarial examples;information bottleneck;robustness", "primary_area": "", "supplementary_material": "", "author": "Angus Galloway;Anna Golubeva;Graham W. 
Taylor", "authorids": "gallowaa@uoguelph.ca;agolubeva@perimeterinstitute.ca;gwtaylor@uoguelph.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngalloway2019a,\ntitle={A Rate-Distortion Theory of Adversarial Examples},\nauthor={Angus Galloway and Anna Golubeva and Graham W. Taylor},\nyear={2019},\nurl={https://openreview.net/forum?id=HkezfhA5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkezfhA5Y7", "pdf_size": 0, "rating": "2;3;4", "confidence": "3;3;4", "wc_review": "189;188;209", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 195.33333333333334, 9.672412085697939 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:nyXmIEmlNhQJ:scholar.google.com/&scioq=A+Rate-Distortion+Theory+of+Adversarial+Examples&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Efficient Lifelong Learning with A-GEM", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/715", "id": "Hkf2_sC5FX", "author_site": "Arslan Chaudhry, Marc'Aurelio Ranzato, Marcus Rohrbach, Mohamed Elhoseiny", "tldr": "An efficient lifelong learning algorithm that provides a better trade-off between accuracy and time/ memory complexity compared to other algorithms. ", "abstract": "In lifelong learning, the learner is presented with a sequence of tasks, incrementally building a data-driven prior which may be leveraged to speed up learning of a new task. In this work, we investigate the efficiency of current lifelong approaches, in terms of sample complexity, computational and memory cost. Towards this end, we first introduce a new and a more realistic evaluation protocol, whereby learners observe each example only once and hyper-parameter selection is done on a small and disjoint set of tasks, which is not used for the actual learning experience and evaluation. Second, we introduce a new metric measuring how quickly a learner acquires a new skill. Third, we propose an improved version of GEM (Lopez-Paz & Ranzato, 2017), dubbed Averaged GEM (A-GEM), which enjoys the same or even better performance as GEM, while being almost as computationally and memory efficient as EWC (Kirkpatrick et al., 2016) and other regularization-based methods. Finally, we show that all algorithms including A-GEM can learn even more quickly if they are provided with task descriptors specifying the classification tasks under consideration. 
Our experiments on several standard lifelong learning benchmarks demonstrate that A-GEM has the best trade-off between accuracy and efficiency", "keywords": "Lifelong Learning;Continual Learning;Catastrophic Forgetting;Few-shot Transfer", "primary_area": "", "supplementary_material": "", "author": "Arslan Chaudhry;Marc\u2019Aurelio Ranzato;Marcus Rohrbach;Mohamed Elhoseiny", "authorids": "arslan.chaudhry@eng.ox.ac.uk;ranzato@fb.com;mrf@fb.com;elhoseiny@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchaudhry2018efficient,\ntitle={Efficient Lifelong Learning with A-{GEM}},\nauthor={Arslan Chaudhry and Marc\u2019Aurelio Ranzato and Marcus Rohrbach and Mohamed Elhoseiny},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkf2_sC5FX},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/agem](https://github.com/facebookresearch/agem) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Hkf2_sC5FX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "764;117;116", "wc_reply_reviewers": "0;44;15", "wc_reply_authors": "772;844;459", "reply_reviewers": "0;1;1", "reply_authors": "2;2;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 332.3333333333333, 305.23470022627214 ], "wc_reply_reviewers_avg": [ 19.666666666666668, 18.263503375736967 ], "wc_reply_authors_avg": [ 691.6666666666666, 167.1253687771216 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1788, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14191909055509326948&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Hkf2_sC5FX", "pdf": "https://openreview.net/pdf?id=Hkf2_sC5FX", "email": ";;;", "author_num": 4 }, { "title": "Multi-step Retriever-Reader Interaction for Scalable Open-domain Question Answering", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1001", "id": "HkfPSh05K7", "author_site": "Rajarshi Das, Shehzaad Dhuliawala, Manzil Zaheer, Andrew McCallum", "tldr": "Paragraph retriever and machine reader interacts with each other via reinforcement learning to yield large improvements on open domain datasets", "abstract": "This paper introduces a new framework for open-domain question answering in which the retriever and the reader \\emph{iteratively interact} with each other. The framework is agnostic to the architecture of the machine reading model provided it has \\emph{access} to the token-level hidden representations of the reader. The retriever uses fast nearest neighbor search that allows it to scale to corpora containing millions of paragraphs. A gated recurrent unit updates the query at each step conditioned on the \\emph{state} of the reader and the \\emph{reformulated} query is used to re-rank the paragraphs by the retriever. We conduct analysis and show that iterative interaction helps in retrieving informative paragraphs from the corpus. 
Finally, we show that our multi-step-reasoning framework brings consistent improvement when applied to two widely used reader architectures (DrQA and BiDAF) on various large open-domain datasets: TriviaQA-unfiltered, Quasar-T, SearchQA, and SQuAD-open\\footnote{Code and pretrained models are available at \\url{https://github.com/rajarshd/Multi-Step-Reasoning}}.", "keywords": "Open domain Question Answering;Reinforcement Learning;Query reformulation", "primary_area": "", "supplementary_material": "", "author": "Rajarshi Das;Shehzaad Dhuliawala;Manzil Zaheer;Andrew McCallum", "authorids": "rajarshi@cs.umass.edu;sdhuliawala@cs.umass.edu;manzil@cmu.edu;mccallum@cs.umass.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ndas2018multistep,\ntitle={Multi-step Retriever-Reader Interaction for Scalable Open-domain Question Answering},\nauthor={Rajarshi Das and Shehzaad Dhuliawala and Manzil Zaheer and Andrew McCallum},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkfPSh05K7},\n}", "github": "[![github](/images/github_icon.svg) rajarshd/Multi-Step-Reasoning](https://github.com/rajarshd/Multi-Step-Reasoning)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;4", "wc_review": "386;210;707", "wc_reply_reviewers": "0;0;85", "wc_reply_authors": "1166;697;1330", "reply_reviewers": "0;0;1", "reply_authors": "2;1;3", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 434.3333333333333, 205.75767840404455 ], "wc_reply_reviewers_avg": [ 28.333333333333332, 40.069384267237695 ], "wc_reply_authors_avg": [ 1064.3333333333333, 268.23414315771873 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 225, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17865791345794061973&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HkfPSh05K7", "pdf": "https://openreview.net/pdf?id=HkfPSh05K7", "email": ";;;", "author_num": 4 }, { "title": "Double Viterbi: Weight Encoding for High Compression Ratio and Fast On-Chip Reconstruction for Deep Neural Network", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1103", "id": "HkfYOoCcYX", "author_site": "Daehyun Ahn, Dongsoo Lee, Taesu Kim, Jae-Joon Kim", "tldr": "We present a new weight encoding scheme which enables high compression ratio and fast sparse-to-dense matrix conversion.", "abstract": "Weight pruning has been introduced as an efficient model compression technique. Even though pruning removes a significant amount of weights in a network, memory requirement reduction was limited since conventional sparse matrix formats require a significant amount of memory to store index-related information. Moreover, computations associated with such sparse matrix formats are slow because the sequential sparse matrix decoding process does not utilize highly parallel computing systems efficiently. As an attempt to compress index information while keeping the decoding process parallelizable, Viterbi-based pruning was suggested. 
Decoding non-zero weights, however, is still sequential in Viterbi-based pruning. In this paper, we propose a new sparse matrix format in order to enable a highly parallel decoding process of the entire sparse matrix. The proposed sparse matrix is constructed by combining pruning and weight quantization. For the latest RNN models on PTB and WikiText-2 corpus, LSTM parameter storage requirement is compressed 19x using the proposed sparse matrix format compared to the baseline model. Compressed weight and indices can be reconstructed into a dense matrix fast using Viterbi encoders. Simulation results show that the proposed scheme can feed parameters to processing elements 20 % to 106 % faster than the case where the dense matrix values directly come from DRAM.", "keywords": "quantization;pruning;memory footprint;model compression;sparse matrix", "primary_area": "", "supplementary_material": "", "author": "Daehyun Ahn;Dongsoo Lee;Taesu Kim;Jae-Joon Kim", "authorids": "daehyun.ahn@postech.ac.kr;dslee3@gmail.com;taesukim@postech.ac.kr;jaejoon@postech.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nahn2018double,\ntitle={Double Viterbi: Weight Encoding for High Compression Ratio and Fast On-Chip Reconstruction for Deep Neural Network},\nauthor={Daehyun Ahn and Dongsoo Lee and Taesu Kim and Jae-Joon Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkfYOoCcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;2", "wc_review": "329;372;356", "wc_reply_reviewers": "123;288;89", "wc_reply_authors": "1089;1262;605", "reply_reviewers": "1;3;1", "reply_authors": "2;4;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 352.3333333333333, 17.745108872274887 ], "wc_reply_reviewers_avg": [ 166.66666666666666, 86.91119349977627 ], "wc_reply_authors_avg": [ 985.3333333333334, 278.05555000554995 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16044429059282632198&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=HkfYOoCcYX", "pdf": "https://openreview.net/pdf?id=HkfYOoCcYX", "email": ";;;", "author_num": 4 }, { "id": "HkfwpiA9KX", "title": "Automata Guided Skill Composition", "track": "main", "status": "Reject", "tldr": "A formal method's approach to skill composition in reinforcement learning tasks", "abstract": "Skills learned through (deep) reinforcement learning often generalizes poorly\nacross tasks and re-training is necessary when presented with a new task. We\npresent a framework that combines techniques in formal methods with reinforcement\nlearning (RL) that allows for the convenient specification of complex temporal\ndependent tasks with logical expressions and construction of new skills from existing\nones with no additional exploration. 
We provide theoretical results for our\ncomposition technique and evaluate on a simple grid world simulation as well as\na robotic manipulation task.", "keywords": "Skill composition;temporal logic;finite state automata", "primary_area": "", "supplementary_material": "", "author": "Xiao Li;Yao Ma;Calin Belta", "authorids": "xli87@bu.edu;yaoma@bu.edu;cbelta@bu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli2019automata,\ntitle={Automata Guided Skill Composition},\nauthor={Xiao Li and Yao Ma and Calin Belta},\nyear={2019},\nurl={https://openreview.net/forum?id=HkfwpiA9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkfwpiA9KX", "pdf_size": 0, "rating": "5;5;6;7", "confidence": "2;2;4;3", "wc_review": "228;686;172;225", "wc_reply_reviewers": "0;79;0;0", "wc_reply_authors": "462;937;360;45", "reply_reviewers": "0;1;0;0", "reply_authors": "1;2;1;1", "rating_avg": [ 5.75, 0.82915619758885 ], "confidence_avg": [ 2.75, 0.82915619758885 ], "wc_review_avg": [ 327.75, 208.0316983058111 ], "wc_reply_reviewers_avg": [ 19.75, 34.208003449485325 ], "wc_reply_authors_avg": [ 451.0, 319.935149678806 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.6363636363636364, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:W2cHH_baGrkJ:scholar.google.com/&scioq=Automata+Guided+Skill+Composition&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "Hkg1YiAcK7", "title": "Learning Implicit Generative Models by Teaching Explicit Ones", "track": "main", "status": "Reject", "tldr": "", "abstract": "Implicit generative models are difficult to train as no explicit probability density functions are defined. Generative adversarial nets (GANs) propose a minimax framework to train such models, which suffer from mode collapse in practice due to the nature of the JS-divergence. In contrast, we propose a learning by teaching (LBT) framework to learn implicit models, which intrinsically avoid the mode collapse problem because of using the KL-divergence. In LBT, an auxiliary explicit model is introduced to learn the distribution defined by the implicit model while the later one's goal is to teach the explicit model to match the data distribution. LBT is formulated as a bilevel optimization problem, whose optimum implies that we obtain the maximum likelihood estimation of the implicit model. We adopt an unrolling approach to solve the challenging learning problem. 
Experimental results demonstrate the effectiveness of our method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chao Du;Kun Xu;Chongxuan Li;Jun Zhu;Bo Zhang", "authorids": "duchao0726@gmail.com;kunxu.thu@gmail.com;chongxuanli1991@gmail.com;dcszj@tsinghua.edu.cn;dcszb@tsinghua.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ndu2019learning,\ntitle={Learning Implicit Generative Models by Teaching Explicit Ones},\nauthor={Chao Du and Kun Xu and Chongxuan Li and Jun Zhu and Bo Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkg1YiAcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hkg1YiAcK7", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;3", "wc_review": "291;379;276", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "903;737;47", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 315.3333333333333, 45.433712397538265 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 562.3333333333334, 370.6438841919409 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10539811251539836360&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "Hkg1csA5Y7", "title": "A fast quasi-Newton-type method for large-scale stochastic optimisation", "track": "main", "status": "Reject", "tldr": "", "abstract": "During recent years there has been an increased interest in stochastic adaptations of limited memory quasi-Newton methods, which compared to pure gradient-based routines can improve the convergence by incorporating second order information. In this work we propose a direct least-squares approach conceptually similar to the limited memory quasi-Newton methods, but that computes the search direction in a slightly different way. This is achieved in a fast and numerically robust manner by maintaining a Cholesky factor of low dimension. This is combined with a stochastic line search relying upon fulfilment of the Wolfe condition in a backtracking manner, where the step length is adaptively modified with respect to the optimisation progress. We support our new algorithm by providing several theoretical results guaranteeing its performance. The performance is demonstrated on real-world benchmark problems which shows improved results in comparison with already established methods.", "keywords": "optimisation;large-scale;stochastic", "primary_area": "", "supplementary_material": "", "author": "Adrian Wills;Thomas B. Sch\u00f6n;Carl Jidling", "authorids": "adrian.wills@newcastle.edu.au;thomas.schon@it.uu.se;carl.jidling@it.uu.se", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwills2019a,\ntitle={A fast quasi-Newton-type method for large-scale stochastic optimisation},\nauthor={Adrian Wills and Thomas B. 
Sch\u00f6n and Carl Jidling},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkg1csA5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Hkg1csA5Y7", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;5;5", "wc_review": "1409;526;440", "wc_reply_reviewers": "583;0;0", "wc_reply_authors": "1345;738;609", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 791.6666666666666, 437.9302329011881 ], "wc_reply_reviewers_avg": [ 194.33333333333334, 274.82883562117144 ], "wc_reply_authors_avg": [ 897.3333333333334, 320.89908417167885 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7742547547191828207&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "Hkg313AcFX", "title": "Metropolis-Hastings view on variational inference and adversarial training", "track": "main", "status": "Reject", "tldr": "Learning to sample via lower bounding the acceptance rate of the Metropolis-Hastings algorithm", "abstract": "In this paper we propose to view the acceptance rate of the Metropolis-Hastings algorithm as a universal objective for learning to sample from target distribution -- given either as a set of samples or in the form of unnormalized density. This point of view unifies the goals of such approaches as Markov Chain Monte Carlo (MCMC), Generative Adversarial Networks (GANs), variational inference. To reveal the connection we derive the lower bound on the acceptance rate and treat it as the objective for learning explicit and implicit samplers. The form of the lower bound allows for doubly stochastic gradient optimization in case the target distribution factorizes (i.e. over data points). 
We empirically validate our approach on Bayesian inference for neural networks and generative models for images.", "keywords": "MCMC;GANs;Variational Inference", "primary_area": "", "supplementary_material": "", "author": "Kirill Neklyudov;Dmitry Vetrov", "authorids": "k.necludov@gmail.com;vetrodim@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nneklyudov2019metropolishastings,\ntitle={Metropolis-Hastings view on variational inference and adversarial training},\nauthor={Kirill Neklyudov and Dmitry Vetrov},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkg313AcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hkg313AcFX", "pdf_size": 0, "rating": "5;6;9", "confidence": "3;4;4", "wc_review": "159;875;229", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "213;1118;281", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 6.666666666666667, 1.699673171197595 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 421.0, 322.2959302669934 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 537.3333333333334, 411.5307467924548 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.6933752452815364, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12245157922144157597&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Overcoming the Disentanglement vs Reconstruction Trade-off via Jacobian Supervision", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1110", "id": "Hkg4W2AcFm", "tldr": "A method for learning image representations that are good for both disentangling factors of variation and obtaining faithful reconstructions.", "abstract": "A major challenge in learning image representations is the disentangling of the factors of variation underlying the image formation. This is typically achieved with an autoencoder architecture where a subset of the latent variables is constrained to correspond to specific factors, and the rest of them are considered nuisance variables. This approach has an important drawback: as the dimension of the nuisance variables is increased, image reconstruction is improved, but the decoder has the flexibility to ignore the specified factors, thus losing the ability to condition the output on them. In this work, we propose to overcome this trade-off by progressively growing the dimension of the latent code, while constraining the Jacobian of the output image with respect to the disentangled variables to remain the same. As a result, the obtained models are effective at both disentangling and reconstruction. We demonstrate the applicability of this method in both unsupervised and supervised scenarios for learning disentangled representations. In a facial attribute manipulation task, we obtain high quality image generation while smoothly controlling dozens of attributes with a single model. 
This is an order of magnitude more disentangled factors than state-of-the-art methods, while obtaining visually similar or superior results, and avoiding adversarial training.", "keywords": "disentangling;autoencoders;jacobian;face manipulation", "primary_area": "", "supplementary_material": "", "author": "Jos\u00e9 Lezama", "authorids": "jlezama@fing.edu.uy", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nlezama2018overcoming,\ntitle={Overcoming the Disentanglement vs Reconstruction Trade-off via Jacobian Supervision},\nauthor={Jos\u00e9 Lezama},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkg4W2AcFm},\n}", "github": "[![github](/images/github_icon.svg) jlezama/disentangling-jacobian](https://github.com/jlezama/disentangling-jacobian)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;4;4", "wc_review": "295;865;271", "wc_reply_reviewers": "29;0;0", "wc_reply_authors": "423;821;225", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 477.0, 274.5323296080081 ], "wc_reply_reviewers_avg": [ 9.666666666666666, 13.67073110293992 ], "wc_reply_authors_avg": [ 489.6666666666667, 247.8404504694278 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=72617481773116679&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Hkg4W2AcFm", "pdf": "https://openreview.net/pdf?id=Hkg4W2AcFm", "email": "", "author_num": 1 }, { "id": "HkgDTiCctQ", "title": "Knowledge Distillation from Few Samples", "track": "main", "status": "Reject", "tldr": "This paper proposes a novel and simple method for knowledge distillation from few samples.", "abstract": "Current knowledge distillation methods require full training data to distill knowledge from a large \"teacher\" network to a compact \"student\" network by matching certain statistics between \"teacher\" and \"student\" such as softmax outputs and feature responses. This is not only time-consuming but also inconsistent with human cognition in which children can learn knowledge from adults with few examples. This paper proposes a novel and simple method for knowledge distillation from few samples. Taking the assumption that both \"teacher\" and \"student\" have the same feature map sizes at each corresponding block, we add a $1\\times 1$ conv-layer at the end of each block in the student-net, and align the block-level outputs between \"teacher\" and \"student\" by estimating the parameters of the added layer with limited samples. We prove that the added layer can be absorbed/merged into the previous conv-layer \\hl{to formulate a new conv-layer with the same size of parameters and computation cost as previous one. 
Experiments verify that the proposed method is very efficient and effective at distilling knowledge from a teacher-net to student-nets constructed in different ways on various datasets.", "keywords": "knowledge distillation;few-sample learning;network compression", "primary_area": "", "supplementary_material": "", "author": "Tianhong Li;Jianguo Li;Zhuang Liu;Changshui Zhang", "authorids": "tianhong@mit.edu;jianguo.li@intel.com;zhuangl@berkeley.edu;zcs@mail.tsinghua.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2019knowledge,\ntitle={Knowledge Distillation from Few Samples},\nauthor={Tianhong Li and Jianguo Li and Zhuang Liu and Changshui Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgDTiCctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkgDTiCctQ", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;4", "wc_review": "398;237;1032", "wc_reply_reviewers": "173;0;0", "wc_reply_authors": "944;196;471", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 555.6666666666666, 343.17180666378886 ], "wc_reply_reviewers_avg": [ 57.666666666666664, 81.55298209684848 ], "wc_reply_authors_avg": [ 537.0, 308.915306624108 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13275659867084185475&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/870", "id": "HkgEQnRqYQ", "author_site": "Zhiqing Sun, Zhi-Hong Deng, Jian-Yun Nie, Jian Tang", "tldr": "A new state-of-the-art approach for knowledge graph embedding.", "abstract": "We study the problem of learning representations of entities and relations in knowledge graphs for predicting missing links. The success of such a task heavily relies on the ability to model and infer the patterns of (or between) the relations. In this paper, we present a new approach for knowledge graph embedding called RotatE, which is able to model and infer various relation patterns, including symmetry/antisymmetry, inversion, and composition. Specifically, the RotatE model defines each relation as a rotation from the source entity to the target entity in the complex vector space. In addition, we propose a novel self-adversarial negative sampling technique for efficiently and effectively training the RotatE model. 
Experimental results on multiple benchmark knowledge graphs show that the proposed RotatE model is not only scalable, but also able to infer and model various relation patterns and significantly outperform existing state-of-the-art models for link prediction.", "keywords": "knowledge graph embedding;knowledge graph completion;adversarial sampling", "primary_area": "", "supplementary_material": "", "author": "Zhiqing Sun;Zhi-Hong Deng;Jian-Yun Nie;Jian Tang", "authorids": "1500012783@pku.edu.cn;zhdeng@pku.edu.cn;nie@iro.umontreal.ca;jian.tang@hec.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsun2018rotate,\ntitle={RotatE: Knowledge Graph Embedding by Relational Rotation in Complex Space},\nauthor={Zhiqing Sun and Zhi-Hong Deng and Jian-Yun Nie and Jian Tang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgEQnRqYQ},\n}", "github": "[![github](/images/github_icon.svg) DeepGraphLearning/KnowledgeGraphEmbedding](https://github.com/DeepGraphLearning/KnowledgeGraphEmbedding) + [![Papers with Code](/images/pwc_icon.svg) 9 community implementations](https://paperswithcode.com/paper/?openreview=HkgEQnRqYQ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;3", "wc_review": "450;311;219", "wc_reply_reviewers": "62;0;0", "wc_reply_authors": "480;225;322", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 326.6666666666667, 94.95378993079855 ], "wc_reply_reviewers_avg": [ 20.666666666666668, 29.227080289043965 ], "wc_reply_authors_avg": [ 342.3333333333333, 105.09149452844517 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 41, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3076, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9820389801132772086&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=HkgEQnRqYQ", "pdf": "https://openreview.net/pdf?id=HkgEQnRqYQ", "email": ";;;", "author_num": 4 }, { "id": "HkgHk3RctX", "title": "Seq2Slate: Re-ranking and Slate Optimization with RNNs", "track": "main", "status": "Reject", "tldr": "A pointer network architecture for re-ranking items, learned from click-through logs.", "abstract": "Ranking is a central task in machine learning and information retrieval. In this task, it is especially important to present the user with a slate of items that is appealing as a whole. This in turn requires taking into account interactions between items, since intuitively, placing an item on the slate affects the decision of which other items should be chosen alongside it.\nIn this work, we propose a sequence-to-sequence model for ranking called seq2slate. At each step, the model predicts the next item to place on the slate given the items already chosen. The recurrent nature of the model allows complex dependencies between items to be captured directly in a flexible and scalable way. We show how to learn the model end-to-end from weak supervision in the form of easily obtained click-through data. 
We further demonstrate the usefulness of our approach in experiments on standard ranking benchmarks as well as in a real-world recommendation system.", "keywords": "Recurrent neural networks;learning to rank;pointer networks", "primary_area": "", "supplementary_material": "", "author": "Irwan Bello;Sayali Kulkarni;Sagar Jain;Craig Boutilier;Ed Chi;Elad Eban;Xiyang Luo;Alan Mackey;Ofer Meshi", "authorids": "ibello@google.com;sayali@google.com;sagarj@google.com;cboutilier@google.com;edchi@google.com;elade@google.com;xyluo@google.com;mackeya@google.com;meshi@google.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@misc{\nbello2019seqslate,\ntitle={Seq2Slate: Re-ranking and Slate Optimization with {RNN}s},\nauthor={Irwan Bello and Sayali Kulkarni and Sagar Jain and Craig Boutilier and Ed Chi and Elad Eban and Xiyang Luo and Alan Mackey and Ofer Meshi},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgHk3RctX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HkgHk3RctX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkgHk3RctX", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;5", "wc_review": "633;567;666", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "322;541;1035", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 622.0, 41.15823125451335 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 632.6666666666666, 298.21058927468613 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7056246182645092863&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "title": "Guiding Policies with Language via Meta-Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1041", "id": "HkgSEnA5KQ", "author_site": "John Co-Reyes, Abhishek Gupta, Suvansh Q Sanjeev, Nicholas Altieri, Jacob Andreas, John DeNero, Pieter Abbeel, Sergey Levine", "tldr": "We propose a meta-learning method for interactively correcting policies with natural language.", "abstract": "Behavioral skills or policies for autonomous agents are conventionally learned from reward functions, via reinforcement learning, or from demonstrations, via imitation learning. However, both modes of task specification have their disadvantages: reward functions require manual engineering, while demonstrations require a human expert to be able to actually perform the task in order to generate the demonstration. Instruction following from natural language instructions provides an appealing alternative: in the same way that we can specify goals to other humans simply by speaking or writing, we would like to be able to specify tasks for our machines. However, a single instruction may be insufficient to fully communicate our intent or, even if it is, may be insufficient for an autonomous agent to actually understand how to perform the desired task. 
In this work, we propose an interactive formulation of the task specification problem, where iterative language corrections are provided to an autonomous agent, guiding it in acquiring the desired skill. Our proposed language-guided policy learning algorithm can integrate an instruction and a sequence of corrections to acquire new skills very quickly. In our experiments, we show that this method can enable a policy to follow instructions and corrections for simulated navigation and manipulation tasks, substantially outperforming direct, non-interactive instruction following.", "keywords": "meta-learning;language grounding;interactive", "primary_area": "", "supplementary_material": "", "author": "John D. Co-Reyes;Abhishek Gupta;Suvansh Sanjeev;Nick Altieri;Jacob Andreas;John DeNero;Pieter Abbeel;Sergey Levine", "authorids": "jcoreyes@eecs.berkeley.edu;abhigupta@berkeley.edu;suvansh@berkeley.edu;naltieri@berkeley.edu;j.d.andreas@gmail.com;denero@berkeley.edu;pabbeel@cs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nco-reyes2018metalearning,\ntitle={Meta-Learning Language-Guided Policy Learning},\nauthor={John D Co-Reyes and Abhishek Gupta and Suvansh Sanjeev and Nick Altieri and John DeNero and Pieter Abbeel and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgSEnA5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;4", "wc_review": "890;201;186", "wc_reply_reviewers": "114;0;0", "wc_reply_authors": "1426;459;572", "reply_reviewers": "2;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 425.6666666666667, 328.39035037927385 ], "wc_reply_reviewers_avg": [ 38.0, 53.74011537017761 ], "wc_reply_authors_avg": [ 819.0, 431.6858425599184 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18060553357406887446&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=HkgSEnA5KQ", "pdf": "https://openreview.net/pdf?id=HkgSEnA5KQ", "email": ";;;;;;;", "author_num": 8 }, { "id": "HkgSk2A9Y7", "title": "Stochastic Gradient Push for Distributed Deep Learning", "track": "main", "status": "Reject", "tldr": "For distributed training over high-latency networks, use gossip-based approximate distributed averaging instead of exact distribute averaging like AllReduce.", "abstract": "Large mini-batch parallel SGD is commonly used for distributed training of deep networks. Approaches that use tightly-coupled exact distributed averaging based on AllReduce are sensitive to slow nodes and high-latency communication. In this work we show the applicability of Stochastic Gradient Push (SGP) for distributed training. SGP uses a gossip algorithm called PushSum for approximate distributed averaging, allowing for much more loosely coupled communications which can be beneficial in high-latency or high-variability scenarios. 
The tradeoff is that approximate distributed averaging injects additional noise in the gradient which can affect the train and test accuracies. We prove that SGP converges to a stationary point of smooth, non-convex objective functions. Furthermore, we validate empirically the potential of SGP. For example, using 32 nodes with 8 GPUs per node to train ResNet-50 on ImageNet, where nodes communicate over 10Gbps Ethernet, SGP completes 90 epochs in around 1.5 hours while AllReduce SGD takes over 5 hours, and the top-1 validation accuracy of SGP remains within 1.2% of that obtained using AllReduce SGD.", "keywords": "optimization;distributed;large scale;deep learning", "primary_area": "", "supplementary_material": "", "author": "Mahmoud Assran;Nicolas Loizou;Nicolas Ballas;Mike Rabbat", "authorids": "massran@fb.com;n.loizou@sms.ed.ac.uk;ballasn@fb.com;mikerabbat@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nassran2019stochastic,\ntitle={Stochastic Gradient Push for Distributed Deep Learning},\nauthor={Mahmoud Assran and Nicolas Loizou and Nicolas Ballas and Mike Rabbat},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgSk2A9Y7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=HkgSk2A9Y7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HkgSk2A9Y7", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;3", "wc_review": "335;203;271", "wc_reply_reviewers": "148;0;0", "wc_reply_authors": "1262;940;1048", "reply_reviewers": "1;0;0", "reply_authors": "3;3;3", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 269.6666666666667, 53.89702115042063 ], "wc_reply_reviewers_avg": [ 49.333333333333336, 69.76786907707269 ], "wc_reply_authors_avg": [ 1083.3333333333333, 133.80915099083802 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 436, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4514037379069260169&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "title": "AdaShift: Decorrelation and Convergence of Adaptive Learning Rate Methods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/718", "id": "HkgTkhRcKQ", "author_site": "Zhiming Zhou, Qingru Zhang, Guansong Lu, Hongwei Wang, Weinan Zhang, Yong Yu", "tldr": "We analysis and solve the non-convergence issue of Adam.", "abstract": "Adam is shown not being able to converge to the optimal solution in certain cases. Researchers recently propose several algorithms to avoid the issue of non-convergence of Adam, but their efficiency turns out to be unsatisfactory in practice. In this paper, we provide a new insight into the non-convergence issue of Adam as well as other adaptive learning rate methods. We argue that there exists an inappropriate correlation between gradient $g_t$ and the second moment term $v_t$ in Adam ($t$ is the timestep), which results in that a large gradient is likely to have small step size while a small gradient may have a large step size. 
We demonstrate that such unbalanced step sizes are the fundamental cause of non-convergence of Adam, and we further prove that decorrelating $v_t$ and $g_t$ will lead to unbiased step size for each gradient, thus solving the non-convergence problem of Adam. Finally, we propose AdaShift, a novel adaptive learning rate method that decorrelates $v_t$ and $g_t$ by temporal shifting, i.e., using temporally shifted gradient $g_{t-n}$ to calculate $v_t$. The experiment results demonstrate that AdaShift is able to address the non-convergence issue of Adam, while still maintaining a competitive performance with Adam in terms of both training speed and generalization. ", "keywords": "optimizer;Adam;convergence;decorrelation", "primary_area": "", "supplementary_material": "", "author": "Zhiming Zhou*;Qingru Zhang*;Guansong Lu;Hongwei Wang;Weinan Zhang;Yong Yu", "authorids": "heyohai@apex.sjtu.edu.cn;neverquit@sjtu.edu.cn;gslu@apex.sjtu.edu.cn;wanghongwei55@gmail.com;wnzhang@sjtu.edu.cn;yyu@apex.sjtu.edu.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nzhou2018adashift,\ntitle={AdaShift: Decorrelation and Convergence of Adaptive Learning Rate Methods},\nauthor={Zhiming Zhou and Qingru Zhang and Guansong Lu and Hongwei Wang and Weinan Zhang and Yong Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgTkhRcKQ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=HkgTkhRcKQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;9", "confidence": "4;4;4", "wc_review": "254;348;438", "wc_reply_reviewers": "209;0;0", "wc_reply_authors": "341;485;636", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 346.6666666666667, 75.12360185424788 ], "wc_reply_reviewers_avg": [ 69.66666666666667, 98.52354484532562 ], "wc_reply_authors_avg": [ 487.3333333333333, 120.44454694542584 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9276200117186011487&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HkgTkhRcKQ", "pdf": "https://openreview.net/pdf?id=HkgTkhRcKQ", "email": ";;;;;", "author_num": 6 }, { "title": "AD-VAT: An Asymmetric Dueling mechanism for learning Visual Active Tracking", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1082", "id": "HkgYmhR9KX", "author_site": "Fangwei Zhong, peng sun, Wenhan Luo, Tingyun Yan, Yizhou Wang", "tldr": "We propose AD-VAT, where the tracker and the target object, viewed as two learnable agents, are opponents and can mutually enhance during training.", "abstract": "Visual Active Tracking (VAT) aims at following a target object by autonomously controlling the motion system of a tracker given visual observations. Previous work has shown that the tracker can be trained in a simulator via reinforcement learning and deployed in real-world scenarios. 
However, during training, such a method requires manually specifying the moving path of the target object to be tracked, which cannot ensure the tracker\u2019s generalization on the unseen object moving patterns. To learn a robust tracker for VAT, in this paper, we propose a novel adversarial RL method which adopts an Asymmetric Dueling mechanism, referred to as AD-VAT. In AD-VAT, both the tracker and the target are approximated by end-to-end neural networks, and are trained via RL in a dueling/competitive manner: i.e., the tracker intends to lockup the target, while the target tries to escape from the tracker. They are asymmetric in that the target is aware of the tracker, but not vice versa. Specifically, besides its own observation, the target is fed with the tracker\u2019s observation and action, and learns to predict the tracker\u2019s reward as an auxiliary task. We show that such an asymmetric dueling mechanism produces a stronger target, which in turn induces a more robust tracker. To stabilize the training, we also propose a novel partial zero-sum reward for the tracker/target. The experimental results, in both 2D and 3D environments, demonstrate that the proposed method leads to a faster convergence in training and yields more robust tracking behaviors in different testing scenarios. For supplementary videos, see: https://www.youtube.com/playlist?list=PL9rZj4Mea7wOZkdajK1TsprRg8iUf51BS \n The code is available at https://github.com/zfw1226/active_tracking_rl", "keywords": "Active tracking;reinforcement learning;adversarial learning;multi agent", "primary_area": "", "supplementary_material": "", "author": "Fangwei Zhong;Peng Sun;Wenhan Luo;Tingyun Yan;Yizhou Wang", "authorids": "zfw@pku.edu.cn;pengsun000@gmail.com;whluo.china@gmail.com;yanty18@pku.edu.cn;yizhou.wang@pku.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzhong2018advat,\ntitle={{AD}-{VAT}: An Asymmetric Dueling mechanism for learning Visual Active Tracking},\nauthor={Fangwei Zhong and Peng Sun and Wenhan Luo and Tingyun Yan and Yizhou Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgYmhR9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;4;4", "wc_review": "340;222;138", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "684;672;174", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 233.33333333333334, 82.85462503016286 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 510.0, 237.63838073846574 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1365795723229073364&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=HkgYmhR9KX", "pdf": "https://openreview.net/pdf?id=HkgYmhR9KX", "email": ";;;;", "author_num": 5 }, { "id": "HkghV209tm", "title": "Optimistic Acceleration for Optimization", "track": "main", "status": "Reject", "tldr": "We consider new variants of optimization 
algorithms for training deep nets.", "abstract": "We consider new variants of optimization algorithms. Our algorithms are based on the observation that mini-batch of stochastic gradients in consecutive iterations do not change drastically and consequently may be predictable. Inspired by the similar setting in online learning literature called Optimistic Online learning, we propose two new optimistic algorithms for AMSGrad and Adam, respectively, by exploiting the predictability of gradients. The new algorithms combine the idea of momentum method, adaptive gradient method, and algorithms in Optimistic Online learning, which leads to speed up in training deep neural nets in practice.", "keywords": "optimization;Adam;AMSGrad", "primary_area": "", "supplementary_material": "", "author": "Jun-Kun Wang;Xiaoyun Li;Ping Li", "authorids": "jimwang@gatech.edu;xl374@scarletmail.rutgers.edu;pingli98@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2019optimistic,\ntitle={Optimistic Acceleration for Optimization},\nauthor={Jun-Kun Wang and Xiaoyun Li and Ping Li},\nyear={2019},\nurl={https://openreview.net/forum?id=HkghV209tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=HkghV209tm", "pdf_size": 0, "rating": "4;5;5;6", "confidence": "4;4;4;2", "wc_review": "762;1088;749;205", "wc_reply_reviewers": "85;166;186;0", "wc_reply_authors": "251;203;438;165", "reply_reviewers": "1;1;1;0", "reply_authors": "1;1;1;1", "rating_avg": [ 5.0, 0.7071067811865476 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "wc_review_avg": [ 701.0, 316.9424237933445 ], "wc_reply_reviewers_avg": [ 109.25, 73.54377947862076 ], "wc_reply_authors_avg": [ 264.25, 104.84124903872521 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.816496580927726, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:J9blDZx51YEJ:scholar.google.com/&scioq=Optimistic+Acceleration+for+Optimization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HkgmzhC5F7", "title": "A Modern Take on the Bias-Variance Tradeoff in Neural Networks", "track": "main", "status": "Reject", "tldr": "We revisit empirically and theoretically the bias-variance tradeoff for neural networks to shed more light on their generalization properties.", "abstract": "We revisit the bias-variance tradeoff for neural networks in light of modern empirical findings. The traditional bias-variance tradeoff in machine learning suggests that as model complexity grows, variance increases. Classical bounds in statistical learning theory point to the number of parameters in a model as a measure of model complexity, which means the tradeoff would indicate that variance increases with the size of neural networks. However, we empirically find that variance due to training set sampling is roughly constant (with both width and depth) in practice. Variance caused by the non-convexity of the loss landscape is different. We find that it decreases with width and increases with depth, in our setting. We provide theoretical analysis, in a simplified setting inspired by linear models, that is consistent with our empirical findings for width. 
We view bias-variance as a useful lens to study generalization through and encourage further theoretical explanation from this perspective.", "keywords": "bias-variance tradeoff;deep learning theory;generalization;concentration", "primary_area": "", "supplementary_material": "", "author": "Brady Neal;Sarthak Mittal;Aristide Baratin;Vinayak Tantia;Matthew Scicluna;Simon Lacoste-Julien;Ioannis Mitliagkas", "authorids": "bradyneal11@gmail.com;sarthmit@gmail.com;aristidebaratin@hotmail.com;tantia.vinayak1@gmail.com;mattcscicluna@gmail.com;slacoste@iro.umontreal.ca;ioannis@iro.umontreal.ca", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nneal2019a,\ntitle={A Modern Take on the Bias-Variance Tradeoff in Neural Networks},\nauthor={Brady Neal and Sarthak Mittal and Aristide Baratin and Vinayak Tantia and Matthew Scicluna and Simon Lacoste-Julien and Ioannis Mitliagkas},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgmzhC5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkgmzhC5F7", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;3;4", "wc_review": "356;301;250", "wc_reply_reviewers": "0;286;0", "wc_reply_authors": "833;1090;409", "reply_reviewers": "0;2;0", "reply_authors": "2;4;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 302.3333333333333, 43.28458796795409 ], "wc_reply_reviewers_avg": [ 95.33333333333333, 134.8216929462351 ], "wc_reply_authors_avg": [ 777.3333333333334, 280.7897592307969 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.18898223650461363, "gs_citation": 240, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2740349431492993556&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "Hkgnii09Ym", "title": "Set Transformer", "track": "main", "status": "Reject", "tldr": "Attention-based neural network to process set-structured data", "abstract": "Many machine learning tasks such as multiple instance learning, 3D shape recognition and few-shot image classification are defined on sets of instances. Since solutions to such problems do not depend on the permutation of elements of the set, models used to address them should be permutation invariant. We present an attention-based neural network module, the Set Transformer, specifically designed to model interactions among elements in the input set. The model consists of an encoder and a decoder, both of which rely on attention mechanisms. In an effort to reduce computational complexity, we introduce an attention scheme inspired by inducing point methods from sparse Gaussian process literature. It reduces computation time of self-attention from quadratic to linear in the number of elements in the set. We show that our model is theoretically attractive and we evaluate it on a range of tasks, demonstrating increased performance compared to recent methods for set-structured data.", "keywords": "attention;meta-learning;set-input neural networks;permutation invariant modeling", "primary_area": "", "supplementary_material": "", "author": "Juho Lee;Yoonho Lee;Jungtaek Kim;Adam R. 
Kosiorek;Seungjin Choi;Yee Whye Teh", "authorids": "juho.lee@stats.ox.ac.uk;einet89@gmail.com;jtkim@postech.ac.kr;adamk@robots.ox.ac.uk;seungjin@postech.ac.kr;y.w.teh@stats.ox.ac.uk", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nlee2019set,\ntitle={Set Transformer},\nauthor={Juho Lee and Yoonho Lee and Jungtaek Kim and Adam R. Kosiorek and Seungjin Choi and Yee Whye Teh},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkgnii09Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hkgnii09Ym", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;5;3", "wc_review": "381;349;130", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "480;218;66", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 286.6666666666667, 111.5476978197618 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 254.66666666666666, 170.99187764985277 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9250245404089116989&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HkgnpiR9Y7", "title": "Recycling the discriminator for improving the inference mapping of GAN", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative adversarial networks (GANs) have achieved outstanding success in generating the high-quality data. Focusing on the generation process, existing GANs learn a unidirectional mapping from the latent vector to the data. Later, various studies point out that the latent space of GANs is semantically meaningful and can be utilized in advanced data analysis and manipulation. In order to analyze the real data in the latent space of GANs, it is necessary to investigate the inverse generation mapping from the data to the latent vector. To tackle this problem, the bidirectional generative models introduce an encoder to establish the inverse path of the generation process. Unfortunately, this effort leads to the degradation of generation quality because the imperfect generator rather interferes the encoder training and vice versa. \nIn this paper, we propose an effective algorithm to infer the latent vector based on existing unidirectional GANs by preserving their generation quality.\nIt is important to note that we focus on increasing the accuracy and efficiency of the inference mapping but not influencing the GAN performance (i.e., the quality or the diversity of the generated sample).\nFurthermore, utilizing the proposed inference mapping algorithm, we suggest a new metric for evaluating the GAN models by measuring the reconstruction error of unseen real data.\nThe experimental analysis demonstrates that the proposed algorithm achieves more accurate inference mapping than the existing method and provides the robust metric for evaluating GAN performance. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Duhyeon Bang;Hyunjung Shim", "authorids": "duhyeonbang@yonsei.ac.kr;kateshim@yonsei.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbang2019recycling,\ntitle={Recycling the discriminator for improving the inference mapping of {GAN}},\nauthor={Duhyeon Bang and Hyunjung Shim},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgnpiR9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkgnpiR9Y7", "pdf_size": 0, "rating": "3;3;7", "confidence": "4;5;4", "wc_review": "329;494;513", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "806;489;166", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 445.3333333333333, 82.62498949403195 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 487.0, 261.28273319656364 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Nfr3RUvzPW4J:scholar.google.com/&scioq=Recycling+the+discriminator+for+improving+the+inference+mapping+of+GAN&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Marginal Policy Gradients: A Unified Family of Estimators for Bounded Action Spaces with Applications", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/947", "id": "HkgqFiAcFm", "author_site": "Carson Eisenach, Haichuan Yang, Ji Liu, Han Liu", "tldr": "", "abstract": "Many complex domains, such as robotics control and real-time strategy (RTS) games, require an agent to learn a continuous control. In the former, an agent learns a policy over R^d and in the latter, over a discrete set of actions each of which is parametrized by a continuous parameter. Such problems are naturally solved using policy based reinforcement learning (RL) methods, but unfortunately these often suffer from high variance leading to instability and slow convergence. Unnecessary variance is introduced whenever policies over bounded action spaces are modeled using distributions with unbounded support by applying a transformation T to the sampled action before execution in the environment. Recently, the variance reduced clipped action policy gradient (CAPG) was introduced for actions in bounded intervals, but to date no variance reduced methods exist when the action is a direction, something often seen in RTS games. To this end we introduce the angular policy gradient (APG), a stochastic policy gradient method for directional control. With the marginal policy gradients family of estimators we present a unified analysis of the variance reduction properties of APG and CAPG; our results provide a stronger guarantee than existing analyses for CAPG. 
Experimental results on a popular RTS game and a navigation task show that the APG estimator offers a substantial improvement over the standard policy gradient.", "keywords": "reinforcement learning;policy gradient;MOBA games", "primary_area": "", "supplementary_material": "", "author": "Carson Eisenach;Haichuan Yang;Ji Liu;Han Liu", "authorids": "eisenach@princeton.edu;h.yang@rochester.edu;ji.liu.uwisc@gmail.com;hanliu.cmu@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\neisenach2018marginal,\ntitle={Marginal Policy Gradients: A Unified Family of Estimators for Bounded Action Spaces with Applications},\nauthor={Carson Eisenach and Haichuan Yang and Ji Liu and Han Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgqFiAcFm},\n}", "github": "[![github](/images/github_icon.svg) ceisenach/MPG](https://github.com/ceisenach/MPG)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;3", "wc_review": "168;518;218", "wc_reply_reviewers": "0;0;92", "wc_reply_authors": "236;641;154", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 301.3333333333333, 154.56030825826173 ], "wc_reply_reviewers_avg": [ 30.666666666666668, 43.36921591277491 ], "wc_reply_authors_avg": [ 343.6666666666667, 212.89486189718676 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14825352687327812567&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HkgqFiAcFm", "pdf": "https://openreview.net/pdf?id=HkgqFiAcFm", "email": ";;;", "author_num": 4 }, { "id": "HkgxasA5Ym", "title": "Reliable Uncertainty Estimates in Deep Neural Networks using Noise Contrastive Priors", "track": "main", "status": "Reject", "tldr": "We train neural networks to be uncertain on noisy inputs to avoid overconfident predictions outside of the training distribution.", "abstract": "Obtaining reliable uncertainty estimates of neural network predictions is a long standing challenge. Bayesian neural networks have been proposed as a solution, but it remains open how to specify their prior. In particular, the common practice of a standard normal prior in weight space imposes only weak regularities, causing the function posterior to possibly generalize in unforeseen ways on inputs outside of the training distribution. We propose noise contrastive priors (NCPs) to obtain reliable uncertainty estimates. The key idea is to train the model to output high uncertainty for data points outside of the training distribution. NCPs do so using an input prior, which adds noise to the inputs of the current mini batch, and an output prior, which is a wide distribution given these inputs. NCPs are compatible with any model that can output uncertainty estimates, are easy to scale, and yield reliable uncertainty estimates throughout training. 
Empirically, we show that NCPs prevent overfitting outside of the training distribution and result in uncertainty estimates that are useful for active learning. We demonstrate the scalability of our method on the flight delays data set, where we significantly improve upon previously published results.", "keywords": "uncertainty estimates;out of distribution;bayesian neural network;neural network priors;regression;active learning", "primary_area": "", "supplementary_material": "", "author": "Danijar Hafner;Dustin Tran;Timothy Lillicrap;Alex Irpan;James Davidson", "authorids": "mail@danijar.com;trandustin@google.com;countzero@google.com;alexirpan@google.com;james@electric-thought.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhafner2019reliable,\ntitle={Reliable Uncertainty Estimates in Deep Neural Networks using Noise Contrastive Priors},\nauthor={Danijar Hafner and Dustin Tran and Timothy Lillicrap and Alex Irpan and James Davidson},\nyear={2019},\nurl={https://openreview.net/forum?id=HkgxasA5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HkgxasA5Ym", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;3", "wc_review": "260;480;227", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "772;580;255", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 322.3333333333333, 112.29821409483274 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 535.6666666666666, 213.37968246505778 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5206970468867122045&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "Hkl-di09FQ", "title": "Decoupling feature extraction from policy learning: assessing benefits of state representation learning in goal based robotics", "track": "main", "status": "Reject", "tldr": "We evaluate the benefits of decoupling feature extraction from policy learning in robotics and propose a new way of combining state representation learning methods.", "abstract": "Scaling end-to-end reinforcement learning to control real robots from vision presents a series of challenges, in particular in terms of sample efficiency. Against end-to-end learning, state representation learning can help learn a compact, efficient and relevant representation of states that speeds up policy learning, reducing the number of samples needed, and that is easier to interpret. We evaluate several state representation learning methods on goal based robotics tasks and propose a new unsupervised model that stacks representations and combines strengths of several of these approaches. 
This method encodes all the relevant features, performs on par or better than end-to-end learning, and is robust to hyper-parameters change.", "keywords": "reinforcement learning;state representation learning;feature extraction;robotics;deep learning", "primary_area": "", "supplementary_material": "", "author": "Antonin Raffin;Ashley Hill;Ren\u00e9 Traor\u00e9;Timoth\u00e9e Lesort;Natalia D\u00edaz-Rodr\u00edguez;David Filliat", "authorids": "antonin.raffin@ensta-paristech.fr;ashley.hill@u-psud.fr;krb.traore@protonmail.com;timothee.lesort@ensta-paristech.fr;diaz.rodriguez.natalia@gmail.com;david.filliat@ensta-paristech.fr", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nraffin2019decoupling,\ntitle={Decoupling feature extraction from policy learning: assessing benefits of state representation learning in goal based robotics},\nauthor={Antonin Raffin and Ashley Hill and Ren\u00e9 Traor\u00e9 and Timoth\u00e9e Lesort and Natalia D\u00edaz-Rodr\u00edguez and David Filliat},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkl-di09FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Hkl-di09FQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "wc_review": "413;344;404", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "796;434;807", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 387.0, 30.62678566222711 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 679.0, 173.2993556441185 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 73, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8674286623377515161&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "title": "On Self Modulation for Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/981", "id": "Hkl5aoR5tm", "author_site": "Ting Chen, Mario Lucic, Neil Houlsby, Sylvain Gelly", "tldr": "A simple GAN modification that improves performance across many losses, architectures, regularization schemes, and datasets. ", "abstract": "Training Generative Adversarial Networks (GANs) is notoriously challenging. We propose and study an architectural modification, self-modulation, which improves GAN performance across different data sets, architectures, losses, regularizers, and hyperparameter settings. Intuitively, self-modulation allows the intermediate feature maps of a generator to change as a function of the input noise vector. While reminiscent of other conditioning techniques, it requires no labeled data. In a large-scale empirical study we observe a relative decrease of 5%-35% in FID. Furthermore, all else being equal, adding this modification to the generator leads to improved performance in 124/144 (86%) of the studied settings. 
Self-modulation is a simple architectural change that requires no additional parameter tuning, which suggests that it can be applied readily to any GAN.", "keywords": "unsupervised learning;generative adversarial networks;deep generative modelling", "primary_area": "", "supplementary_material": "", "author": "Ting Chen;Mario Lucic;Neil Houlsby;Sylvain Gelly", "authorids": "iamtingchen@gmail.com;lucic@google.com;neilhoulsby@google.com;sylvaingelly@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchen2018on,\ntitle={On Self Modulation for Generative Adversarial Networks},\nauthor={Ting Chen and Mario Lucic and Neil Houlsby and Sylvain Gelly},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkl5aoR5tm},\n}", "github": "[![github](/images/github_icon.svg) google/compare_gan](https://github.com/google/compare_gan) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Hkl5aoR5tm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;7;7", "confidence": "5;4;4", "wc_review": "268;441;287", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "271;355;453", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 332.0, 77.46397011944758 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 359.6666666666667, 74.3744281023764 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14481067201346722037&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Hkl5aoR5tm", "pdf": "https://openreview.net/pdf?id=Hkl5aoR5tm", "email": ";;;", "author_num": 4 }, { "id": "Hkl84iCcFm", "title": "RESIDUAL NETWORKS CLASSIFY INPUTS BASED ON THEIR NEURAL TRANSIENT DYNAMICS", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this study, we analyze the input-output behavior of residual networks from a dynamical system point of view by disentangling the residual dynamics from the output activities before the classification stage. For a network with simple skip connections between every successive layer, and for logistic activation function, and shared weights between layers, we show analytically that there is a cooperation and competition dynamics between residuals corresponding to each input dimension.\nInterpreting these kind of networks as nonlinear filters, the steady state value of the residuals in the case of attractor networks are indicative of the common features between different input dimensions that the network has observed during training, and has encoded in those components. In cases where residuals do not converge to an attractor state, their internal dynamics are separable for each input class, and the network can reliably approximate the output. 
We bring analytical and\nempirical evidence that residual networks classify inputs based on the integration of the transient dynamics of the residuals, and will show how the network responds to input perturbations. We compare the network dynamics for a ResNet and a\nMulti-Layer Perceptron and show that the internal dynamics, and the noise evolution are fundamentally different in these networks, and ResNets are more robust to noisy inputs. Based on these findings, we also develop a new method to adjust the depth for residual networks during training. As it turns out, after pruning the depth of a ResNet using this algorithm,the network is still capable of classifying inputs with a high accuracy.", "keywords": "Residual Networks;Dynamical Systems;Classification", "primary_area": "", "supplementary_material": "", "author": "Fereshteh Lagzi", "authorids": "lagzi@informatik.uni-freiburg.de", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nlagzi2019residual,\ntitle={{RESIDUAL} {NETWORKS} {CLASSIFY} {INPUTS} {BASED} {ON} {THEIR} {NEURAL} {TRANSIENT} {DYNAMICS}},\nauthor={Fereshteh Lagzi},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkl84iCcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=Hkl84iCcFm", "pdf_size": 0, "rating": "2;4;5", "confidence": "4;4;5", "wc_review": "323;464;395", "wc_reply_reviewers": "184;0;0", "wc_reply_authors": "282;193;203", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 394.0, 57.56735185849702 ], "wc_reply_reviewers_avg": [ 61.333333333333336, 86.73843182554982 ], "wc_reply_authors_avg": [ 226.0, 39.80787191833629 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_SyNqSX3dAQJ:scholar.google.com/&scioq=RESIDUAL+NETWORKS+CLASSIFY+INPUTS+BASED+ON+THEIR+NEURAL+TRANSIENT+DYNAMICS&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "id": "HklAhi09Y7", "title": "Question Generation using a Scratchpad Encoder", "track": "main", "status": "Reject", "tldr": "In this paper we introduce the Scratchpad Encoder, a novel addition to the sequence to sequence (seq2seq) framework and explore its effectiveness in generating natural language questions from a given logical form.", "abstract": "In this paper we introduce the Scratchpad Encoder, a novel addition to the sequence to sequence (seq2seq) framework and explore its effectiveness in generating natural language questions from a given logical form. The Scratchpad encoder enables the decoder at each time step to modify all the encoder outputs, thus using the encoder as a \"scratchpad\" memory to keep track of what has been generated so far and to guide future generation. 
Experiments on a knowledge based question generation dataset show that our approach generates more fluent and expressive questions according to quantitative metrics and human judgments.", "keywords": "Question Generation;Natural Language Generation;Scratchpad Encoder;Sequence to Sequence", "primary_area": "", "supplementary_material": "", "author": "Ryan Y Benmalek;Madian Khabsa;Suma Desu;Claire Cardie;Michele Banko", "authorids": "ryanai3@cs.cornell.edu;me@madiankhabsa.com;desuma24@gmail.com;cardie@cs.cornell.edu;mbanko@apple.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nbenmalek2019question,\ntitle={Question Generation using a Scratchpad Encoder},\nauthor={Ryan Y Benmalek and Madian Khabsa and Suma Desu and Claire Cardie and Michele Banko},\nyear={2019},\nurl={https://openreview.net/forum?id=HklAhi09Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HklAhi09Y7", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;5;4", "wc_review": "432;414;269", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "95;230;118", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 371.6666666666667, 72.96726815649755 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 147.66666666666666, 58.97080256835204 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rlUBLNOJLDEJ:scholar.google.com/&scioq=Question+Generation+using+a+Scratchpad+Encoder&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HklJV3A9Ym", "title": "Approximation capability of neural networks on sets of probability measures and tree-structured data", "track": "main", "status": "Reject", "tldr": "This paper extends the proof of density of neural networks in the space of continuous (or even measurable) functions on Euclidean spaces to functions on compact sets of probability measures. ", "abstract": "This paper extends the proof of density of neural networks in the space of continuous (or even measurable) functions on Euclidean spaces to functions on compact sets of probability measures.\nBy doing so the work parallels a more then a decade old results on mean-map embedding of probability measures in reproducing kernel Hilbert spaces. \nThe work has wide practical consequences for multi-instance learning, where it theoretically justifies some recently proposed constructions.\nThe result is then extended to Cartesian products, yielding universal approximation theorem for tree-structured domains, which naturally occur in data-exchange formats like JSON, XML, YAML, AVRO, and ProtoBuffer. 
This has important practical implications, as it enables to automatically create an architecture of neural networks for processing structured data (AutoML paradigms), as demonstrated by an accompanied library for JSON format.", "keywords": "multi-instance learning;hierarchical models;universal approximation theorem", "primary_area": "", "supplementary_material": "", "author": "Tom\u00e1\u0161 Pevn\u00fd;Vojt\u011bch Kova\u0159\u00edk", "authorids": "pevnak@gmail.com;vojta.kovarik@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npevn\u00fd2019approximation,\ntitle={Approximation capability of neural networks on sets of probability measures and tree-structured data},\nauthor={Tom\u00e1\u0161 Pevn\u00fd and Vojt\u011bch Kova\u0159\u00edk},\nyear={2019},\nurl={https://openreview.net/forum?id=HklJV3A9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HklJV3A9Ym", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;5;3", "wc_review": "272;111;133", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "139;105;99", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 172.0, 71.27879535083815 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 114.33333333333333, 17.613126418163876 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RjaEq9H9_gYJ:scholar.google.com/&scioq=Approximation+capability+of+neural+networks+on+sets+of+probability+measures+and+tree-structured+data&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HklKWhC5F7", "title": "How Training Data Affect the Accuracy and Robustness of Neural Networks for Image Classification", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent work has demonstrated the lack of robustness of well-trained deep neural networks (DNNs) to adversarial examples. For example, visually indistinguishable perturbations, when mixed with an original image, can easily lead deep learning models to misclassifications. In light of a recent study on the mutual influence between robustness and accuracy over 18 different ImageNet models, this paper investigates how training data affect the accuracy and robustness of deep neural\nnetworks. We conduct extensive experiments on four different datasets, including CIFAR-10, MNIST, STL-10, and Tiny ImageNet, with several representative neural networks. Our results reveal previously unknown phenomena that exist between the size of training data and characteristics of the resulting models. In particular, besides confirming that the model accuracy improves as the amount of training data increases, we also observe that the model robustness improves initially, but there exists a turning point after which robustness starts to decrease. 
How and when such turning points occur vary for different neural networks and different datasets.", "keywords": "Adversarial attacks;Robustness;CW;I-FGSM", "primary_area": "", "supplementary_material": "", "author": "Suhua Lei;Huan Zhang;Ke Wang;Zhendong Su", "authorids": "sulei@ucdavis.edu;huan@huan-zhang.com;kewang@visa.com;zhendong.su@inf.ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlei2019how,\ntitle={How Training Data Affect the Accuracy and Robustness of Neural Networks for Image Classification},\nauthor={Suhua Lei and Huan Zhang and Ke Wang and Zhendong Su},\nyear={2019},\nurl={https://openreview.net/forum?id=HklKWhC5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HklKWhC5F7", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "wc_review": "126;138;350", "wc_reply_reviewers": "0;27;0", "wc_reply_authors": "262;137;741", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 204.66666666666666, 102.88288919392228 ], "wc_reply_reviewers_avg": [ 9.0, 12.727922061357855 ], "wc_reply_authors_avg": [ 380.0, 260.3164740593009 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6074015553926515934&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Off-Policy Evaluation and Learning from Logged Bandit Feedback: Error Reduction via Surrogate Policy", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/873", "id": "HklKui0ct7", "author_site": "Yuan Xie, Boyi Liu, Qiang Liu, Zhaoran Wang, Yuan Zhou, Jian Peng", "tldr": "", "abstract": " When learning from a batch of logged bandit feedback, the discrepancy between the policy to be learned and the off-policy training data imposes statistical and computational challenges. Unlike classical supervised learning and online learning settings, in batch contextual bandit learning, one only has access to a collection of logged feedback from the actions taken by a historical policy, and expect to learn a policy that takes good actions in possibly unseen contexts. Such a batch learning setting is ubiquitous in online and interactive systems, such as ad platforms and recommendation systems. Existing approaches based on inverse propensity weights, such as Inverse Propensity Scoring (IPS) and Policy Optimizer for Exponential Models (POEM), enjoy unbiasedness but often suffer from large mean squared error. In this work, we introduce a new approach named Maximum Likelihood Inverse Propensity Scoring (MLIPS) for batch learning from logged bandit feedback. Instead of using the given historical policy as the proposal in inverse propensity weights, we estimate a maximum likelihood surrogate policy based on the logged action-context pairs, and then use this surrogate policy as the proposal. We prove that MLIPS is asymptotically unbiased, and moreover, has a smaller nonasymptotic mean squared error than IPS. 
Such an error reduction phenomenon is somewhat surprising as the estimated surrogate policy is less accurate than the given historical policy. Results on multi-label classification problems and a large-scale ad placement dataset demonstrate the empirical effectiveness of MLIPS. Furthermore, the proposed surrogate policy technique is complementary to existing error reduction techniques, and when combined, is able to consistently boost the performance of several widely used approaches.", "keywords": "Causal inference;Policy Optimization;Non-asymptotic analysis", "primary_area": "", "supplementary_material": "", "author": "Yuan Xie;Boyi Liu;Qiang Liu;Zhaoran Wang;Yuan Zhou;Jian Peng", "authorids": "xieyuan@umail.iu.edu;boyiliu2018@u.northwestern.edu;lqiang@cs.utexas.edu;zhaoranwang@gmail.com;yzhoucs@iu.edu;jianpeng@illinois.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nxie2018offpolicy,\ntitle={Off-Policy Evaluation and Learning from Logged Bandit Feedback: Error Reduction via Surrogate Policy},\nauthor={Yuan Xie and Boyi Liu and Qiang Liu and Zhaoran Wang and Yuan Zhou and Jian Peng},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HklKui0ct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;8", "confidence": "3;4;4", "wc_review": "356;422;416", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "551;415;514", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 398.0, 29.79932885150268 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 493.3333333333333, 57.41273571449302 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11720089646814691493&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HklKui0ct7", "pdf": "https://openreview.net/pdf?id=HklKui0ct7", "email": ";;;;;", "author_num": 6 }, { "id": "HklQxnC5tX", "title": "Overlapping Community Detection with Graph Neural Networks", "track": "main", "status": "Reject", "tldr": "Detecting overlapping communities in graphs using graph neural networks", "abstract": "Community detection in graphs is of central importance in graph mining, machine learning and network science. Detecting overlapping communities is especially challenging, and remains an open problem. Motivated by the success of graph-based deep learning in other graph-related tasks, we study the applicability of this framework for overlapping community detection. We propose a probabilistic model for overlapping community detection based on the graph neural network architecture. Despite its simplicity, our model outperforms the existing approaches in the community recovery task by a large margin. 
Moreover, due to the inductive formulation, the proposed model is able to perform out-of-sample community detection for nodes that were not present at training time", "keywords": "community detection;deep learning for graphs", "primary_area": "", "supplementary_material": "", "author": "Oleksandr Shchur;Stephan G\u00fcnnemann", "authorids": "shchur@in.tum.de;guennemann@in.tum.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nshchur2019overlapping,\ntitle={Overlapping Community Detection with Graph Neural Networks},\nauthor={Oleksandr Shchur and Stephan G\u00fcnnemann},\nyear={2019},\nurl={https://openreview.net/forum?id=HklQxnC5tX},\n}", "github": "[![github](/images/github_icon.svg) shchur/overlapping-community-detection](https://github.com/shchur/overlapping-community-detection)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HklQxnC5tX", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;5;4", "wc_review": "335;387;251", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 324.3333333333333, 56.031737038535184 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 176, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3644572692099612175&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Subgradient Descent Learns Orthogonal Dictionaries", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/904", "id": "HklSf3CqKm", "author_site": "Yu Bai, Qijia Jiang, Ju Sun", "tldr": "Efficient dictionary learning by L1 minimization via a novel analysis of the non-convex non-smooth geometry.", "abstract": "This paper concerns dictionary learning, i.e., sparse coding, a fundamental representation learning problem. We show that a subgradient descent algorithm, with random initialization, can recover orthogonal dictionaries on a natural nonsmooth, nonconvex L1 minimization formulation of the problem, under mild statistical assumption on the data. This is in contrast to previous provable methods that require either expensive computation or delicate initialization schemes. Our analysis develops several tools for characterizing landscapes of nonsmooth functions, which might be of independent interest for provable training of deep networks with nonsmooth activations (e.g., ReLU), among other applications. 
Preliminary synthetic and real experiments corroborate our analysis and show that our algorithm works well empirically in recovering orthogonal dictionaries.", "keywords": "Dictionary learning;Sparse coding;Non-convex optimization;Theory", "primary_area": "", "supplementary_material": "", "author": "Yu Bai;Qijia Jiang;Ju Sun", "authorids": "yub@stanford.edu;qjiang2@stanford.edu;sunju@stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbai2018subgradient,\ntitle={Subgradient Descent Learns Orthogonal Dictionaries},\nauthor={Yu Bai and Qijia Jiang and Ju Sun},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HklSf3CqKm},\n}", "github": "[![github](/images/github_icon.svg) sunju/ODL_L1](https://github.com/sunju/ODL_L1)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3;AnonReviewer4;AnonReviewer5", "pdf_size": 0, "rating": "6;7;7;7;7", "confidence": "1;3;4;3;2", "wc_review": "119;238;180;75;211", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "131;354;702;62;42", "reply_reviewers": "0;0;0;0;0", "reply_authors": "2;1;1;2;1", "rating_avg": [ 6.8, 0.39999999999999997 ], "confidence_avg": [ 2.6, 1.019803902718557 ], "wc_review_avg": [ 164.6, 59.808360619565555 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 258.2, 248.01322545380518 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.7844645405527363, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3757427846147866582&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HklSf3CqKm", "pdf": "https://openreview.net/pdf?id=HklSf3CqKm", "email": ";;", "author_num": 3 }, { "id": "HklUN3RcFX", "title": "Confidence-based Graph Convolutional Networks for Semi-Supervised Learning", "track": "main", "status": "Withdraw", "tldr": "We propose a confidence based Graph Convolutional Network for Semi-Supervised Learning.", "abstract": "Predicting properties of nodes in a graph is an important problem with applications in a variety of domains. Graph-based Semi Supervised Learning (SSL) methods aim to address this problem by labeling a small subset of the nodes as seeds, and then utilizing the graph structure to predict label scores for the rest of the nodes in the graph. Recently, Graph Convolutional Networks (GCNs) have achieved impressive performance on the graph-based SSL task. In addition to label scores, it is also desirable to have a confidence score associated with them. Unfortunately, confidence estimation in the context of GCN has not been previously explored. We fill this important gap in this paper and propose ConfGCN, which estimates labels scores along with their confidences jointly in GCN-based setting. ConfGCN uses these estimated confidences to determine the influence of one node on another during neighborhood aggregation, thereby acquiring anisotropic capabilities. Through extensive analysis and experiments on standard benchmarks, we find that ConfGCN is able to significantly outperform state-of-the-art baselines. 
We have made ConfGCN\u2019s source code available to encourage reproducible research.", "keywords": "Graph Convolutional Networks;GCN;Confidence;Semi-Supervised Learning;Deep Learning;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Shikhar Vashishth;Prateek Yadav;Manik Bhandari;Partha Talukdar", "authorids": "shikhar@iisc.ac.in;prateekyadav@iisc.ac.in;mbbhandarimanik@gmail.com;ppt@iisc.ac.in", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HklUN3RcFX", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 3, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15761652077324197534&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "HklVMnR5tQ", "title": "Exploring the interpretability of LSTM neural networks over multi-variable data", "track": "main", "status": "Reject", "tldr": "", "abstract": "In learning a predictive model over multivariate time series consisting of target and exogenous variables, the forecasting performance and interpretability of the model are both essential for deployment and uncovering knowledge behind the data.\nTo this end, we propose the interpretable multi-variable LSTM recurrent neural network (IMV-LSTM) capable of providing accurate forecasting as well as both temporal and variable level importance interpretation.\nIn particular, IMV-LSTM is equipped with tensorized hidden states and update process, so as to learn variables-wise hidden states. \nOn top of it, we develop a mixture attention mechanism and associated summarization methods to quantify the temporal and variable importance in data. \nExtensive experiments using real datasets demonstrate the prediction performance and interpretability of IMV-LSTM in comparison to a variety of baselines. \nIt also exhibits the prospect as an end-to-end framework for both forecasting and knowledge extraction over multi-variate data. 
", "keywords": "Interpretability;recurrent neural network;attention", "primary_area": "", "supplementary_material": "", "author": "Tian Guo;Tao Lin", "authorids": "tian.guo@gess.ethz.ch;tao.lin@epfl.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nguo2019exploring,\ntitle={Exploring the interpretability of {LSTM} neural networks over multi-variable data},\nauthor={Tian Guo and Tao Lin},\nyear={2019},\nurl={https://openreview.net/forum?id=HklVMnR5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HklVMnR5tQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;5;5", "wc_review": "688;595;431", "wc_reply_reviewers": "0;29;199", "wc_reply_authors": "0;33;289", "reply_reviewers": "0;1;1", "reply_authors": "0;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 571.3333333333334, 106.24604567804971 ], "wc_reply_reviewers_avg": [ 76.0, 87.7762306474063 ], "wc_reply_authors_avg": [ 107.33333333333333, 129.16225798927832 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9783644020520548002&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HklVTi09tm", "title": "Detecting Topological Defects in 2D Active Nematics Using Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "An interesting application of CNN in soft condensed matter physics experiments.", "abstract": "Active matter consists of active agents which transform energy extracted from surroundings into momentum, producing a variety of collective phenomena. A model, synthetic active system composed of microtubule polymers driven by protein motors spontaneously forms a liquid-crystalline nematic phase. Extensile stress created by the protein motors precipitates continuous buckling and folding of the microtubules creating motile topological defects and turbulent fluid flows. Defect motion is determined by the rheological properties of the material; however, these remain largely unquantified. Measuring defects dynamics can yield fundamental insights into active nematics, a class of materials that include bacterial films and animal cells. Current methods for defect detection lack robustness and precision, and require fine-tuning for datasets with different visual quality. In this study, we applied Deep Learning to train a defect detector to automatically analyze microscopy videos of the microtubule active nematic. Experimental results indicate that our method is robust and accurate. It is expected to significantly increase the amount of video data that can be processed.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruoshi Liu;Michael M. 
Norton;Seth Fraden;Pengyu Hong", "authorids": "ruoshiliu@brandeis.edu;mmnorton@brandeis.edu;fraden@brandeis.edu;hongpeng@brandeis.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nliu2019detecting,\ntitle={Detecting Topological Defects in 2D Active Nematics Using Convolutional Neural Networks},\nauthor={Ruoshi Liu and Michael M. Norton and Seth Fraden and Pengyu Hong},\nyear={2019},\nurl={https://openreview.net/forum?id=HklVTi09tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HklVTi09tm", "pdf_size": 0, "rating": "2;4;4", "confidence": "5;4;4", "wc_review": "214;325;219", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 252.66666666666666, 51.188106257432715 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1JrPhEV7JVQJ:scholar.google.com/&scioq=Detecting+Topological+Defects+in+2D+Active+Nematics+Using+Convolutional+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/680", "id": "HklY120cYm", "author_site": "Wei Ping, Kainan Peng, Jitong Chen", "tldr": "", "abstract": "In this work, we propose a new solution for parallel wave generation by WaveNet. In contrast to parallel WaveNet (van Oord et al., 2018), we distill a Gaussian inverse autoregressive flow from the autoregressive WaveNet by minimizing a regularized KL divergence between their highly-peaked output distributions. Our method computes the KL divergence in closed-form, which simplifies the training algorithm and provides very efficient distillation. In addition, we introduce the first text-to-wave neural architecture for speech synthesis, which is fully convolutional and enables fast end-to-end training from scratch. It significantly outperforms the previous pipeline that connects a text-to-spectrogram model to a separately trained WaveNet (Ping et al., 2018). 
We also successfully distill a parallel waveform synthesizer conditioned on the hidden representation in this end-to-end model.", "keywords": "text-to-speech;deep generative models;end-to-end training;text to waveform", "primary_area": "", "supplementary_material": "", "author": "Wei Ping;Kainan Peng;Jitong Chen", "authorids": "weiping.thu@gmail.com;pengkainan@baidu.com;jitongc@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nping2018clarinet,\ntitle={ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech},\nauthor={Wei Ping and Kainan Peng and Jitong Chen},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HklY120cYm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=HklY120cYm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;9", "confidence": "3;4;4", "wc_review": "490;651;1022", "wc_reply_reviewers": "0;44;328", "wc_reply_authors": "531;449;1334", "reply_reviewers": "0;1;2", "reply_authors": "1;1;2", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 721.0, 222.75696771743563 ], "wc_reply_reviewers_avg": [ 124.0, 145.36391115633435 ], "wc_reply_authors_avg": [ 771.3333333333334, 399.27128065458896 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 448, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1675505652651694755&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HklY120cYm", "pdf": "https://openreview.net/pdf?id=HklY120cYm", "email": ";;", "author_num": 3 }, { "id": "HklbTjRcKX", "title": "What Information Does a ResNet Compress?", "track": "main", "status": "Reject", "tldr": "The Information Bottleneck Principle applied to ResNets, using PixelCNN++ models to decode mutual information and conditionally generate images for information illustration", "abstract": "The information bottleneck principle (Shwartz-Ziv & Tishby, 2017) suggests that SGD-based training of deep neural networks results in optimally compressed hidden layers, from an information theoretic perspective. However, this claim was established on toy data. The goal of the work we present here is to test these claims in a realistic setting using a larger and deeper convolutional architecture, a ResNet model. We trained PixelCNN++ models as inverse representation decoders to measure the mutual information between hidden layers of a ResNet and input image data, when trained for (1) classification and (2) autoencoding. We find that two stages of learning happen for both training regimes, and that compression does occur, even for an autoencoder. 
Sampling images by conditioning on hidden layers\u2019 activations offers an intuitive visualisation to understand what a ResNets learns to forget.", "keywords": "Deep Learning;Information Bottleneck;Residual Neural Networks;Information Theory", "primary_area": "", "supplementary_material": "", "author": "Luke Nicholas Darlow;Amos Storkey", "authorids": "l.n.darlow@sms.ed.ac.uk;a.storkey@ed.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndarlow2019what,\ntitle={What Information Does a ResNet Compress?},\nauthor={Luke Nicholas Darlow and Amos Storkey},\nyear={2019},\nurl={https://openreview.net/forum?id=HklbTjRcKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HklbTjRcKX", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;4;5", "wc_review": "351;1719;486", "wc_reply_reviewers": "50;893;280", "wc_reply_authors": "0;414;114", "reply_reviewers": "1;2;2", "reply_authors": "0;2;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 852.0, 615.5339145814794 ], "wc_reply_reviewers_avg": [ 407.6666666666667, 355.7961338869338 ], "wc_reply_authors_avg": [ 176.0, 174.6081326857372 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6956633072774250162&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "Hklc6oAcFX", "title": "Co-manifold learning with missing data", "track": "main", "status": "Reject", "tldr": "Nonlinear representations of observations and features of a data matrix with missing entries and coupled geometries", "abstract": " Representation learning is typically applied to only one mode of a data matrix, either its rows or columns. Yet in many applications, there is an underlying geometry to both the rows and the columns. We propose utilizing this coupled structure to perform co-manifold learning: uncovering the underlying geometry of both the rows and the columns of a given matrix, where we focus on a missing data setting. Our unsupervised approach consists of three components. We first solve a family of optimization problems to estimate a complete matrix at multiple scales of smoothness. We then use this collection of smooth matrix estimates to compute pairwise distances on the rows and columns based on a new multi-scale metric that implicitly introduces a coupling between the rows and the columns. Finally, we construct row and column representations from these multi-scale metrics. We demonstrate that our approach outperforms competing methods in both data visualization and clustering. ", "keywords": "nonlinear dimensionality reduction;missing data;manifold learning;co-clustering;optimization", "primary_area": "", "supplementary_material": "", "author": "Gal Mishne;Eric C. Chi;Ronald R. 
Coifman", "authorids": "gal.mishne@yale.edu;eric_chi@ncsu.edu;coifman.ronald@yale.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmishne2019comanifold,\ntitle={Co-manifold learning with missing data},\nauthor={Gal Mishne and Eric C. Chi and Ronald R. Coifman},\nyear={2019},\nurl={https://openreview.net/forum?id=Hklc6oAcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hklc6oAcFX", "pdf_size": 0, "rating": "4;4;7", "confidence": "3;4;4", "wc_review": "227;72;962", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1248;276;2214", "reply_reviewers": "0;0;0", "reply_authors": "2;1;4", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 420.3333333333333, 388.20813432430236 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1246.0, 791.1864508445528 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17186109268561397667&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "Hklgis0cF7", "title": "Radial Basis Feature Transformation to Arm CNNs Against Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "A new nonlinear defense against adversarial attacks.", "abstract": "The linear and non-flexible nature of deep convolutional models makes them vulnerable to carefully crafted adversarial perturbations. To tackle this problem, in this paper, we propose a nonlinear radial basis convolutional feature transformation by learning the Mahalanobis distance function that maps the input convolutional features from the same class into tight clusters. In such a space, the clusters become compact and well-separated, which prevent small adversarial perturbations from forcing a sample to cross the decision boundary. We test the proposed method on three publicly available image classification and segmentation data-sets namely, MNIST, ISBI ISIC skin lesion, and NIH ChestX-ray14. We evaluate the robustness of our method to different gradient (targeted and untargeted) and non-gradient based attacks and compare it to several non-gradient masking defense strategies. 
Our results demonstrate that the proposed method can boost the performance of deep convolutional neural networks against adversarial perturbations without accuracy drop on clean data.", "keywords": "Radial basis feature transformation;convolutional neural networks;adversarial defense", "primary_area": "", "supplementary_material": "", "author": "Saeid Asgari Taghanaki;Shekoofeh Azizi;Ghassan Hamarneh", "authorids": "sasgarit@sfu.ca;shazizi@ece.ubc.ca;hamarneh@sfu.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntaghanaki2019radial,\ntitle={Radial Basis Feature Transformation to Arm {CNN}s Against Adversarial Attacks},\nauthor={Saeid Asgari Taghanaki and Shekoofeh Azizi and Ghassan Hamarneh},\nyear={2019},\nurl={https://openreview.net/forum?id=Hklgis0cF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Hklgis0cF7", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;4", "wc_review": "352;204;667", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 407.6666666666667, 193.07396395049113 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jEXHm7aeKP4J:scholar.google.com/&scioq=Radial+Basis+Feature+Transformation+to+Arm+CNNs+Against+Adversarial+Attacks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "MARGINALIZED AVERAGE ATTENTIONAL NETWORK FOR WEAKLY-SUPERVISED LEARNING", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1079", "id": "HkljioCcFQ", "author_site": "Yuan Yuan, YUEMING LYU, Xi SHEN, Ivor Wai-Hung Tsang, Dit-Yan Yeung", "tldr": "A novel marginalized average attentional network for weakly-supervised temporal action localization ", "abstract": "In weakly-supervised temporal action localization, previous works have failed to locate dense and integral regions for each entire action due to the overestimation of the most salient regions. To alleviate this issue, we propose a marginalized average attentional network (MAAN) to suppress the dominant response of the most salient regions in a principled manner. The MAAN employs a novel marginalized average aggregation (MAA) module and learns a set of latent discriminative probabilities in an end-to-end fashion. MAA samples multiple subsets from the video snippet features according to a set of latent discriminative probabilities and takes the expectation over all the averaged subset features. Theoretically, we prove that the MAA module with learned latent discriminative probabilities successfully reduces the difference in responses between the most salient regions and the others. Therefore, MAAN is able to generate better class activation sequences and identify dense and integral action regions in the videos. Moreover, we propose a fast algorithm to reduce the complexity of constructing MAA from $O(2^T)$ to $O(T^2)$. 
Extensive experiments on two large-scale video datasets show that our MAAN achieves a superior performance on weakly-supervised temporal action localization.\n\n\n", "keywords": "feature aggregation;weakly supervised learning;temporal action localization", "primary_area": "", "supplementary_material": "", "author": "Yuan Yuan;Yueming Lyu;Xi Shen;Ivor W. Tsang;Dit-Yan Yeung", "authorids": "yuanyuan910115@gmail.com;lv_yueming@outlook.com;shenxiluc@gmail.com;ivor.tsang@uts.edu.au;dyyeung@cse.ust.hk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyuan2018marginalized,\ntitle={{MARGINALIZED} {AVERAGE} {ATTENTIONAL} {NETWORK} {FOR} {WEAKLY}-{SUPERVISED} {LEARNING}},\nauthor={Yuan Yuan and Yueming Lyu and Xi Shen and Ivor W. Tsang and Dit-Yan Yeung},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkljioCcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "3;5;6", "confidence": "3;3;4", "wc_review": "637;186;326", "wc_reply_reviewers": "0;0;82", "wc_reply_authors": "553;107;153", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 383.0, 188.47988398411823 ], "wc_reply_reviewers_avg": [ 27.333333333333332, 38.6551707048646 ], "wc_reply_authors_avg": [ 271.0, 200.28646151616604 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3372820484309967388&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 15, "openreview": "https://openreview.net/forum?id=HkljioCcFQ", "pdf": "https://openreview.net/pdf?id=HkljioCcFQ", "email": ";;;;", "author_num": 5 }, { "id": "HklmIsC9Y7", "title": "UNSUPERVISED CONVOLUTIONAL NEURAL NETWORKS FOR ACCURATE VIDEO FRAME INTERPOLATION WITH INTEGRATION OF MOTION COMPONENTS", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Optical flow and video frame interpolation are considered as a chicken-egg problem such that one problem affects the other and vice versa. This paper presents a deep neural network that integrates the flow network into the frame interpolation problem, with end-to-end learning. The proposed approach exploits the relationship between the two problems for quality enhancement of interpolation frames. Unlike recent convolutional neural networks, the proposed approach learns motions from natural video frames without graphical ground truth flows for training. This makes the network learn from extensive data and improve the performance. The motion information from the flow network guides interpolator networks to be trained to synthesize the interpolated frame accurately from motion scenarios. In addition, diverse datasets to cover various challenging cases that previous interpolations usually fail in is used for comparison. In all experimental datasets, the proposed network achieves better performance than state-of-art CNN based interpolations. 
On the Middlebury benchmark, compared with the top-ranked algorithm, the proposed network reduces the average interpolation error by about 9.3%. The proposed interpolation is ranked 1st in Standard Deviation (SD) interpolation error and 2nd in Average Interpolation Error among over 150 algorithms listed in the Middlebury interpolation benchmark.", "keywords": "Frame Interpolation;Frame Rate Up Conversion;Convolutional Neural Networks;CNN;Unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Thang Van Nguyen;Kyu-Joong Lee;Hyuk-Jae Lee", "authorids": "itmanhieu@snu.ac.kr;kyujoonglee@sunmoon.ac.kr;hjlee@capp.snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HklmIsC9Y7", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;4", "wc_review": "152;1131;100", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 461.0, 474.236930939237 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_HMqUpfVJY4J:scholar.google.com/&scioq=UNSUPERVISED+CONVOLUTIONAL+NEURAL+NETWORKS+FOR+ACCURATE+VIDEO+FRAME+INTERPOLATION+WITH+INTEGRATION+OF+MOTION+COMPONENTS&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HklnzhR9YQ", "title": "Approximation and non-parametric estimation of ResNet-type convolutional neural networks via block-sparse fully-connected neural networks", "track": "main", "status": "Reject", "tldr": "It is shown that ResNet-type CNNs are a universal approximator and its expression ability is not worse than fully connected neural networks (FNNs) with a \\textit{block-sparse} structure even if the size of each layer in the CNN is fixed.", "abstract": "We develop new approximation and statistical learning theories of convolutional neural networks (CNNs) via the ResNet-type structure where the channel size, filter size, and width are fixed. It is shown that a ResNet-type CNN is a universal approximator and its expression ability is no worse than fully-connected neural networks (FNNs) with a \\textit{block-sparse} structure even if the size of each layer in the CNN is fixed. Our result is general in the sense that we can automatically translate any approximation rate achieved by block-sparse FNNs into that by CNNs. Thanks to the general theory, it is shown that learning on CNNs satisfies optimality in approximation and estimation of several important function classes.\n\nAs applications, we consider two types of function classes to be estimated: the Barron class and H\\\"older class. We prove the clipped empirical risk minimization (ERM) estimator can achieve the same rate as FNNs even when the channel size, filter size, and width of CNNs are constant with respect to the sample size. This is minimax optimal (up to logarithmic factors) for the H\\\"older class. 
Our proof is based on sophisticated evaluations of the covering number of CNNs and the non-trivial parameter rescaling technique to control the Lipschitz constant of CNNs to be constructed.", "keywords": "CNN;ResNet;learning theory;approximation theory;non-parametric estimation;block-sparse", "primary_area": "", "supplementary_material": "", "author": "Kenta Oono;Taiji Suzuki", "authorids": "k.oono.delta@gmail.com;taiji@mist.i.u-tokyo.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\noono2019approximation,\ntitle={Approximation and non-parametric estimation of ResNet-type convolutional neural networks via block-sparse fully-connected neural networks},\nauthor={Kenta Oono and Taiji Suzuki},\nyear={2019},\nurl={https://openreview.net/forum?id=HklnzhR9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HklnzhR9YQ", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;4;3", "wc_review": "499;246;215", "wc_reply_reviewers": "0;0;6", "wc_reply_authors": "1050;618;499", "reply_reviewers": "0;0;1", "reply_authors": "2;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 320.0, 127.20324943438617 ], "wc_reply_reviewers_avg": [ 2.0, 2.8284271247461903 ], "wc_reply_authors_avg": [ 722.3333333333334, 236.7337932972158 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2445772080503397178&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HklyMhCqYQ", "title": "Super-Resolution via Conditional Implicit Maximum Likelihood Estimation", "track": "main", "status": "Reject", "tldr": "We propose a new method for image super-resolution based on IMLE. ", "abstract": "Single-image super-resolution (SISR) is a canonical problem with diverse applications. Leading methods like SRGAN produce images that contain various artifacts, such as high-frequency noise, hallucinated colours and shape distortions, which adversely affect the realism of the result. In this paper, we propose an alternative approach based on an extension of the method of Implicit Maximum Likelihood Estimation (IMLE). We demonstrate greater effectiveness at noise reduction and preservation of the original colours and shapes, yielding more realistic super-resolved images. 
", "keywords": "super-resolution", "primary_area": "", "supplementary_material": "", "author": "Ke Li*;Shichong Peng*;Jitendra Malik", "authorids": "ke.li@eecs.berkeley.edu;shichong.peng@mail.utoronto.ca;malik@eecs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli*2019superresolution,\ntitle={Super-Resolution via Conditional Implicit Maximum Likelihood Estimation},\nauthor={Ke Li* and Shichong Peng* and Jitendra Malik},\nyear={2019},\nurl={https://openreview.net/forum?id=HklyMhCqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HklyMhCqYQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;5;5", "wc_review": "188;572;344", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "206;300;336", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 368.0, 157.6832267554162 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 280.6666666666667, 54.804703772172296 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "Hkx-ii05FQ", "title": "The Cakewalk Method", "track": "main", "status": "Reject", "tldr": "A new policy gradient algorithm designed to approach black-box combinatorial optimization problems. The algorithm relies only on function evaluations, and returns locally optimal solutions with high probability.", "abstract": "Combinatorial optimization is a common theme in computer science. While in general such problems are NP-Hard, from a practical point of view, locally optimal solutions can be useful. In some combinatorial problems however, it can be hard to define meaningful solution neighborhoods that connect large portions of the search space, thus hindering methods that search this space directly. We suggest to circumvent such cases by utilizing a policy gradient algorithm that transforms the problem to the continuous domain, and to optimize a new surrogate objective that renders the former as generic stochastic optimizer. This is achieved by producing a surrogate objective whose distribution is fixed and predetermined, thus removing the need to fine-tune various hyper-parameters in a case by case manner. Since we are interested in methods which can successfully recover locally optimal solutions, we use the problem of finding locally maximal cliques as a challenging experimental benchmark, and we report results on a large dataset of graphs that is designed to test clique finding algorithms. 
Notably, we show in this benchmark that fixing the distribution of the surrogate is key to consistently recovering locally optimal solutions, and that our surrogate objective leads to an algorithm that outperforms other methods we have tested in a number of measures.", "keywords": "policy gradient;combinatorial optimization;blackbox optimization;stochastic optimization;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Uri Patish;Shimon Ullman", "authorids": "uri.patish@gmail.com;shimon.ullman@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npatish2019the,\ntitle={The Cakewalk Method},\nauthor={Uri Patish and Shimon Ullman},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkx-ii05FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hkx-ii05FQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "wc_review": "389;365;158", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "930;791;325", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 304.0, 103.70149468546728 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 682.0, 258.7366743750616 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wELWE2IcCZoJ:scholar.google.com/&scioq=The+Cakewalk+Method&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HkxAisC9FQ", "title": "Improved robustness to adversarial examples using Lipschitz regularization of the loss", "track": "main", "status": "Reject", "tldr": "Improvements to adversarial robustness, as well as provable robustness guarantees, are obtained by augmenting adversarial training with a tractable Lipschitz regularization", "abstract": "We augment adversarial training (AT) with worst case adversarial training (WCAT) which improves adversarial robustness by 11% over the current state-of-the-art result in the $\\ell_2$-norm on CIFAR-10. We interpret adversarial training as Total Variation Regularization, which is a fundamental tool in mathematical image processing, and WCAT as Lipschitz regularization, which appears in Image Inpainting. We obtain verifiable worst and average case robustness guarantees, based on the expected and maximum values of the norm of the gradient of the loss.", "keywords": "Adversarial training;adversarial examples;deep neural networks;regularization;Lipschitz constant", "primary_area": "", "supplementary_material": "", "author": "Chris Finlay;Adam M. Oberman;Bilal Abbasi", "authorids": "christopher.finlay@mail.mcgill.ca;adam.oberman@mcgill.ca;bilal.abbasi@mail.mcgill.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfinlay2019improved,\ntitle={Improved robustness to adversarial examples using Lipschitz regularization of the loss},\nauthor={Chris Finlay and Adam M. 
Oberman and Bilal Abbasi},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxAisC9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkxAisC9FQ", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;3;1", "wc_review": "479;243;91", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "224;188;89", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 2.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 271.0, 159.6329122288592 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 167.0, 57.07889277132134 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12654222286589186689&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "HkxCEhAqtQ", "title": "Accelerated Gradient Flow for Probability Distributions", "track": "main", "status": "Reject", "tldr": "Methodology and numerical algorithms for constructing accelerated gradient flows on the space of probability distributions.", "abstract": "This paper presents a methodology and numerical algorithms for constructing accelerated gradient flows on the space of probability distributions. In particular, we extend the recent variational formulation of accelerated gradient methods in wibisono2016 from vector valued variables to probability distributions. The variational problem is modeled as a mean-field optimal control problem. The maximum principle of optimal control theory is used to derive Hamilton's equations for the optimal gradient flow. The Hamilton's equation are shown to achieve the accelerated form of density transport from any initial probability distribution to a target probability distribution. A quantitative estimate on the asymptotic convergence rate is provided based on a Lyapunov function construction, when the objective functional is displacement convex. Two numerical approximations are presented to implement the Hamilton's equations as a system of N interacting particles. The continuous limit of the Nesterov's algorithm is shown to be a special case with N=1. The algorithm is illustrated with numerical examples. ", "keywords": "Optimal transportation;Mean-field optimal control;Wasserstein gradient flow;Markov-chain Monte-Carlo", "primary_area": "", "supplementary_material": "", "author": "Amirhossein Taghvaei;Prashant G. Mehta", "authorids": "amirhoseintghv@gmail.com;mehtapg@illinois.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntaghvaei2019accelerated,\ntitle={Accelerated Gradient Flow for Probability Distributions},\nauthor={Amirhossein Taghvaei and Prashant G. 
Mehta},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxCEhAqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HkxCEhAqtQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;4", "wc_review": "691;136;245", "wc_reply_reviewers": "171;0;0", "wc_reply_authors": "1665;262;133", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 357.3333333333333, 240.09766531328222 ], "wc_reply_reviewers_avg": [ 57.0, 80.61017305526642 ], "wc_reply_authors_avg": [ 686.6666666666666, 693.7878318013047 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11541742841445721259&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HkxCenR5F7", "title": "Variational recurrent models for representation learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study the problem of learning representations of sequence data. Recent work has built on variational autoencoders to develop variational recurrent models for generation. Our main goal is not generation but rather representation learning for downstream prediction tasks. Existing variational recurrent models typically use stochastic recurrent connections to model the dependence among neighboring latent variables, while generation assumes independence of generated data per time step given the latent sequence. In contrast, our models assume independence among all latent variables given non-stochastic hidden states, which speeds up inference, while assuming dependence of observations at each time step on all latent variables, which improves representation quality. In addition, we propose and study extensions for improving downstream performance, including hierarchical auxiliary latent variables and prior updating during training. 
Experiments show improved performance on several speech and language tasks with different levels of supervision, as well as in a multi-view learning setting.", "keywords": "Representation learning;variational model", "primary_area": "", "supplementary_material": "", "author": "Qingming Tang;Mingda Chen;Weiran Wang;Karen Livescu", "authorids": "qmtang@ttic.edu;mchen@ttic.edu;weiranw@amazon.com;klivescu@ttic.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ntang2019variational,\ntitle={Variational recurrent models for representation learning},\nauthor={Qingming Tang and Mingda Chen and Weiran Wang and Karen Livescu},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxCenR5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkxCenR5F7", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;3;3", "wc_review": "378;156;770", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "561;79;700", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 434.6666666666667, 253.84684796589897 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 446.6666666666667, 266.1006492956294 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7147623001749570297&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Towards GAN Benchmarks Which Require Generalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1123", "id": "HkxKH2AcFm", "author_site": "Ishaan Gulrajani, Colin Raffel, Luke Metz", "tldr": "We argue that GAN benchmarks must require a large sample from the model to penalize memorization and investigate whether neural network divergences have this property.", "abstract": "For many evaluation metrics commonly used as benchmarks for unconditional image generation, trivially memorizing the training set attains a better score than models which are considered state-of-the-art; we consider this problematic.\nWe clarify a necessary condition for an evaluation metric not to behave this way: estimating the function must require a large sample from the model. In search of such a metric, we turn to neural network divergences (NNDs), which are defined in terms of a neural network trained to distinguish between distributions. The resulting benchmarks cannot be ``won'' by training set memorization, while still being perceptually correlated and computable only from samples. 
We survey past work on using NNDs for evaluation, implement an example black-box metric based on these ideas, and validate experimentally that it can measure a notion of generalization.\n", "keywords": "evaluation;generative adversarial networks;adversarial divergences", "primary_area": "", "supplementary_material": "", "author": "Ishaan Gulrajani;Colin Raffel;Luke Metz", "authorids": "igul222@gmail.com;craffel@gmail.com;lmetz@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngulrajani2018towards,\ntitle={Towards {GAN} Benchmarks Which Require Generalization},\nauthor={Ishaan Gulrajani and Colin Raffel and Luke Metz},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxKH2AcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "3;6;7", "confidence": "4;4;4", "wc_review": "290;364;566", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "503;335;753", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.699673171197595 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 406.6666666666667, 116.64571240393803 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 530.3333333333334, 171.73881978813708 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9003774771707079711&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HkxKH2AcFm", "pdf": "https://openreview.net/pdf?id=HkxKH2AcFm", "email": ";;", "author_num": 3 }, { "title": "A Closer Look at Few-shot Classification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/980", "id": "HkxLXnAcFQ", "author_site": "Wei-Yu Chen, Yen-Cheng Liu, Zsolt Kira, Yu-Chiang Frank Wang, Jia-Bin Huang", "tldr": " A detailed empirical study in few-shot classification that revealing challenges in standard evaluation setting and showing a new direction.", "abstract": "Few-shot classi\ufb01cation aims to learn a classi\ufb01er to recognize unseen classes during training with limited labeled examples. While signi\ufb01cant progress has been made, the growing complexity of network designs, meta-learning algorithms, and differences in implementation details make a fair comparison dif\ufb01cult. In this paper, we present 1) a consistent comparative analysis of several representative few-shot classi\ufb01cation algorithms, with results showing that deeper backbones signi\ufb01cantly reduce the gap across methods including the baseline, 2) a slightly modi\ufb01ed baseline method that surprisingly achieves competitive performance when compared with the state-of-the-art on both the mini-ImageNet and the CUB datasets, and 3) a new experimental setting for evaluating the cross-domain generalization ability for few-shot classi\ufb01cation algorithms. Our results reveal that reducing intra-class variation is an important factor when the feature backbone is shallow, but not as critical when using deeper backbones. 
In a realistic, cross-domain evaluation setting, we show that a baseline method with a standard \ufb01ne-tuning practice compares favorably against other state-of-the-art few-shot learning algorithms.", "keywords": "few shot classification;meta-learning", "primary_area": "", "supplementary_material": "", "author": "Wei-Yu Chen;Yen-Cheng Liu;Zsolt Kira;Yu-Chiang Frank Wang;Jia-Bin Huang", "authorids": "weiyuc@andrew.cmu.edu;ycliu@gatech.edu;zkira@gatech.edu;ycwang@ntu.edu.tw;jbhuang@vt.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nchen2018a,\ntitle={A Closer Look at Few-shot Classification},\nauthor={Wei-Yu Chen and Yen-Cheng Liu and Zsolt Kira and Yu-Chiang Frank Wang and Jia-Bin Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxLXnAcFQ},\n}", "github": "[![github](/images/github_icon.svg) wyharveychen/CloserLookFewShot](https://github.com/wyharveychen/CloserLookFewShot) + [![Papers with Code](/images/pwc_icon.svg) 12 community implementations](https://paperswithcode.com/paper/?openreview=HkxLXnAcFQ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;5;2", "wc_review": "309;450;74", "wc_reply_reviewers": "0;120;0", "wc_reply_authors": "660;1365;342", "reply_reviewers": "0;2;0", "reply_authors": "2;4;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 277.6666666666667, 155.0920873402494 ], "wc_reply_reviewers_avg": [ 40.0, 56.568542494923804 ], "wc_reply_authors_avg": [ 789.0, 427.4833330084344 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2355, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10436738309048088927&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HkxLXnAcFQ", "pdf": "https://openreview.net/pdf?id=HkxLXnAcFQ", "email": ";;;;", "author_num": 5 }, { "id": "HkxMG209K7", "title": "An Alarm System for Segmentation Algorithm Based on Shape Model", "track": "main", "status": "Reject", "tldr": "We use VAE to capture the shape feature for automatic segmentation evaluation", "abstract": "It is usually hard for a learning system to predict correctly on the rare events, and there is no exception for segmentation algorithms. Therefore, we hope to build an alarm system to set off alarms when the segmentation result is possibly unsatisfactory. One plausible solution is to project the segmentation results into a low dimensional feature space, and then learn classifiers/regressors in the feature space to predict the qualities of segmentation results. In this paper, we form the feature space using shape feature which is a strong prior information shared among different data, so it is capable to predict the qualities of segmentation results given different segmentation algorithms on different datasets. The shape feature of a segmentation result is captured using the value of loss function when the segmentation result is tested using a Variational Auto-Encoder(VAE). 
The VAE is trained using only the ground truth masks, therefore the bad segmentation results with bad shapes become the rare events for VAE and will result in large loss value. By utilizing this fact, the VAE is able to detect all kinds of shapes that are out of the distribution of normal shapes in ground truth (GT). Finally, we learn the representation in the one-dimensional feature space to predict the qualities of segmentation results. We evaluate our alarm system on several recent segmentation algorithms for the medical segmentation task. The segmentation algorithms perform differently on different datasets, but our system consistently provides reliable prediction on the qualities of segmentation results.\n", "keywords": "segmentation evaluation;shape feature;variational auto-encoder", "primary_area": "", "supplementary_material": "", "author": "Fengze Liu;Yingda Xia;Dong Yang;Alan Yuille;Daguang Xu", "authorids": "liufz13@gmail.com;yxia25@jhu.edu;don.yang.mech@gmail.com;alan.l.yuille@gmail.com;cathy.xudg@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nliu2019an,\ntitle={An Alarm System for Segmentation Algorithm Based on Shape Model},\nauthor={Fengze Liu and Yingda Xia and Dong Yang and Alan Yuille and Daguang Xu},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxMG209K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkxMG209K7", "pdf_size": 0, "rating": "3;5;6;7", "confidence": "5;4;4;3", "wc_review": "346;703;222;358", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "417;696;215;173", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "rating_avg": [ 5.25, 1.479019945774904 ], "confidence_avg": [ 4.0, 0.7071067811865476 ], "wc_review_avg": [ 407.25, 178.85940707717893 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 375.25, 206.88689542839586 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9561828874675149, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5240131424427788701&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "HkxOoiAcYX", "title": "Estimating Information Flow in DNNs", "track": "main", "status": "Reject", "tldr": "Deterministic deep neural networks do not discard information, but they do cluster their inputs.", "abstract": "We study the evolution of internal representations during deep neural network (DNN) training, aiming to demystify the compression aspect of the information bottleneck theory. The theory suggests that DNN training comprises a rapid fitting phase followed by a slower compression phase, in which the mutual information I(X;T) between the input X and internal representations T decreases. Several papers observe compression of estimated mutual information on different DNN models, but the true I(X;T) over these networks is provably either constant (discrete X) or infinite (continuous X). This work explains the discrepancy between theory and experiments, and clarifies what was actually measured by these past works. To this end, we introduce an auxiliary (noisy) DNN framework for which I(X;T) is a meaningful quantity that depends on the network's parameters. 
This noisy framework is shown to be a good proxy for the original (deterministic) DNN both in terms of performance and the learned representations. We then develop a rigorous estimator for I(X;T) in noisy DNNs and observe compression in various models. By relating I(X;T) in the noisy DNN to an information-theoretic communication problem, we show that compression is driven by the progressive clustering of hidden representations of inputs from the same class. Several methods to directly monitor clustering of hidden representations, both in noisy and deterministic DNNs, are used to show that meaningful clusters form in the T space. Finally, we return to the estimator of I(X;T) employed in past works, and demonstrate that while it fails to capture the true (vacuous) mutual information, it does serve as a measure for clustering. This clarifies the past observations of compression and isolates the geometric clustering of hidden representations as the true phenomenon of interest.", "keywords": "information theory;representation learning;deep learning;differential entropy estimation", "primary_area": "", "supplementary_material": "", "author": "Ziv Goldfeld;Ewout van den Berg;Kristjan Greenewald;Brian Kingsbury;Igor Melnyk;Nam Nguyen;Yury Polyanskiy", "authorids": "zivg@mit.edu;evandenberg@us.ibm.com;kristjan.h.greenewald@ibm.com;bedk@us.ibm.com;igor.melnyk@ibm.com;nnguyen@us.ibm.com;yp@mit.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\ngoldfeld2019estimating,\ntitle={Estimating Information Flow in {DNN}s},\nauthor={Ziv Goldfeld and Ewout van den Berg and Kristjan Greenewald and Brian Kingsbury and Igor Melnyk and Nam Nguyen and Yury Polyanskiy},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxOoiAcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkxOoiAcYX", "pdf_size": 0, "rating": "4;7;7", "confidence": "5;4;4", "wc_review": "403;652;325", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1090;1473;746", "reply_reviewers": "0;0;0", "reply_authors": "2;3;1", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 460.0, 139.44891537763928 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1103.0, 296.93882647216526 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12275190315768944080&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Meta-Learning Probabilistic Inference for Prediction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1071", "id": "HkxStoC5F7", "author_site": "Jonathan Gordon, John Bronskill, Matthias Bauer, Sebastian Nowozin, Richard E Turner", "tldr": "Novel framework for meta-learning that unifies and extends a broad class of existing few-shot learning methods. Achieves strong performance on few-shot learning benchmarks without requiring iterative test-time inference. ", "abstract": "This paper introduces a new framework for data efficient and versatile learning. 
Specifically:\n1) We develop ML-PIP, a general framework for Meta-Learning approximate Probabilistic Inference for Prediction. ML-PIP extends existing probabilistic interpretations of meta-learning to cover a broad class of methods. \n2) We introduce \\Versa{}, an instance of the framework employing a flexible and versatile amortization network that takes few-shot learning datasets as inputs, with arbitrary numbers of shots, and outputs a distribution over task-specific parameters in a single forward pass. \\Versa{} substitutes optimization at test time with forward passes through inference networks, amortizing the cost of inference and relieving the need for second derivatives during training.\n3) We evaluate \\Versa{} on benchmark datasets where the method sets new state-of-the-art results, and can handle arbitrary number of shots, and for classification, arbitrary numbers of classes at train and test time. The power of the approach is then demonstrated through a challenging few-shot ShapeNet view reconstruction task.", "keywords": "probabilistic models;approximate inference;few-shot learning;meta-learning", "primary_area": "", "supplementary_material": "", "author": "Jonathan Gordon;John Bronskill;Matthias Bauer;Sebastian Nowozin;Richard Turner", "authorids": "jg801@cam.ac.uk;jfb54@cam.ac.uk;bauer@tue.mpg.de;nowozin@google.com;ret26@cam.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngordon2018metalearning,\ntitle={Meta-Learning Probabilistic Inference for Prediction},\nauthor={Jonathan Gordon and John Bronskill and Matthias Bauer and Sebastian Nowozin and Richard Turner},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxStoC5F7},\n}", "github": "[![github](/images/github_icon.svg) Gordonjo/versa](https://github.com/Gordonjo/versa)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "2;4;4", "wc_review": "126;322;783", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "283;287;640", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 410.3333333333333, 275.3958766737238 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 403.3333333333333, 167.3565720915143 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844387, "gs_citation": 330, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18291407046711557858&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=HkxStoC5F7", "pdf": "https://openreview.net/pdf?id=HkxStoC5F7", "email": ";;;;", "author_num": 5 }, { "id": "HkxWrsC5FQ", "title": "Provable Guarantees on Learning Hierarchical Generative Models with Deep CNNs", "track": "main", "status": "Reject", "tldr": "A generative model for deep CNNs with provable theoretical guarantees that actually works", "abstract": "Learning deep networks is computationally hard in the general case. To show any positive theoretical results, one must make assumptions on the data distribution. 
Current theoretical works often make assumptions that are very far from describing real data, like sampling from Gaussian distribution or linear separability of the data. We describe an algorithm that learns convolutional neural network,\nassuming the data is sampled from a deep generative model that generates images level by level,\nwhere lower resolution images correspond to latent semantic classes. We analyze the convergence rate of our algorithm assuming the data is indeed generated according to this model (as well as\nadditional assumptions). While we do not pretend to claim that the assumptions are realistic for natural images, we do believe that they capture some true properties of real data. Furthermore, we show that on CIFAR-10, the algorithm we analyze achieves results in the same ballpark with vanilla convolutional neural networks that are trained with SGD.", "keywords": "deep learning;theory", "primary_area": "", "supplementary_material": "", "author": "Eran Malach;Shai Shalev-Shwartz", "authorids": "eran.malach@mail.huji.ac.il;shais@cs.huji.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmalach2019provable,\ntitle={Provable Guarantees on Learning Hierarchical Generative Models with Deep {CNN}s},\nauthor={Eran Malach and Shai Shalev-Shwartz},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxWrsC5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkxWrsC5FQ", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;3", "wc_review": "597;361;320", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "262;133;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 426.0, 122.06828689986055 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 131.66666666666666, 106.96520721971027 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:scG66Nuv5_EJ:scholar.google.com/&scioq=Provable+Guarantees+on+Learning+Hierarchical+Generative+Models+with+Deep+CNNs&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Deep reinforcement learning with relational inductive biases", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/995", "id": "HkxaFoC9KQ", "author_site": "Vinicius Zambaldi, David Raposo, Adam Santoro, Victor Bapst, Yujia Li, Igor Babuschkin, Karl Tuyls, David P Reichert, Timothy Lillicrap, Edward Lockhart, Murray Shanahan, Victoria Langston, Razvan Pascanu, Matthew Botvinick, Oriol Vinyals, Peter Battaglia", "tldr": "Relational inductive biases improve out-of-distribution generalization capacities in model-free reinforcement learning agents", "abstract": "We introduce an approach for augmenting model-free deep reinforcement learning agents with a mechanism for relational reasoning over structured representations, which improves performance, learning efficiency, generalization, and interpretability. 
Our architecture encodes an image as a set of vectors, and applies an iterative message-passing procedure to discover and reason about relevant entities and relations in a scene. In six of seven StarCraft II Learning Environment mini-games, our agent achieved state-of-the-art performance, and surpassed human grandmaster-level on four. In a novel navigation and planning task, our agent's performance and learning efficiency far exceeded non-relational baselines, it was able to generalize to more complex scenes than it had experienced during training. Moreover, when we examined its learned internal representations, they reflected important structure about the problem and the agent's intentions. The main contribution of this work is to introduce techniques for representing and reasoning about states in model-free deep reinforcement learning agents via relational inductive biases. Our experiments show this approach can offer advantages in efficiency, generalization, and interpretability, and can scale up to meet some of the most challenging test environments in modern artificial intelligence.", "keywords": "relational reasoning;reinforcement learning;graph neural networks;starcraft;generalization;inductive bias", "primary_area": "", "supplementary_material": "", "author": "Vinicius Zambaldi;David Raposo;Adam Santoro;Victor Bapst;Yujia Li;Igor Babuschkin;Karl Tuyls;David Reichert;Timothy Lillicrap;Edward Lockhart;Murray Shanahan;Victoria Langston;Razvan Pascanu;Matthew Botvinick;Oriol Vinyals;Peter Battaglia", "authorids": "vzambaldi@google.com;draposo@google.com;adamsantoro@google.com;vbapst@google.com;yujiali@google.com;ibab@google.com;karltuyls@google.com;reichert@google.com;countzero@google.com;locked@google.com;mshanahan@google.com;vlangston@google.com;razp@google.com;botvinick@google.com;vinyals@google.com;peterbattaglia@google.com", "gender": ";;;;;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;;;;", "aff": ";;;;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;;;", "position": ";;;;;;;;;;;;;;;", "bibtex": "@inproceedings{\nzambaldi2018deep,\ntitle={Deep reinforcement learning with relational inductive biases},\nauthor={Vinicius Zambaldi and David Raposo and Adam Santoro and Victor Bapst and Yujia Li and Igor Babuschkin and Karl Tuyls and David Reichert and Timothy Lillicrap and Edward Lockhart and Murray Shanahan and Victoria Langston and Razvan Pascanu and Matthew Botvinick and Oriol Vinyals and Peter Battaglia},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxaFoC9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;3", "wc_review": "504;783;558", "wc_reply_reviewers": "64;33;21", "wc_reply_authors": "506;307;513", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 615.0, 120.82218339361361 ], "wc_reply_reviewers_avg": [ 39.333333333333336, 18.116904322268255 ], "wc_reply_authors_avg": [ 442.0, 95.50218147595722 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 16, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 265, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=11171574201032061032&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HkxaFoC9KQ", "pdf": "https://openreview.net/pdf?id=HkxaFoC9KQ", "email": ";;;;;;;;;;;;;;;", "author_num": 16 }, { "id": "Hkxarj09Y7", "title": "Unified recurrent network for many feature types", "track": "main", "status": "Reject", "tldr": "We introduce a unified RNN that handles five different feature types, each in a different manner.", "abstract": "There are time series that are amenable to recurrent neural network (RNN) solutions when treated as sequences, but some series, e.g. asynchronous time series, provide a richer variation of feature types than current RNN cells take into account. In order to address such situations, we introduce a unified RNN that handles five different feature types, each in a different manner. Our RNN framework separates sequential features into two groups dependent on their frequency, which we call sparse and dense features, and which affect cell updates differently. Further, we also incorporate time features at the sequential level that relate to the time between specified events in the sequence and are used to modify the cell's memory state. We also include two types of static (whole sequence level) features, one related to time and one not, which are combined with the encoder output. The experiments show that the proposed modeling framework does increase performance compared to standard cells.", "keywords": "sparse;recurrent;asynchronous;time;series", "primary_area": "", "supplementary_material": "", "author": "Alexander Stec;Diego Klabjan;Jean Utke", "authorids": "stec@u.northwestern.edu;d-klabjan@northwestern.edu;jutke@allstate.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nstec2019unified,\ntitle={Unified recurrent network for many feature types},\nauthor={Alexander Stec and Diego Klabjan and Jean Utke},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkxarj09Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hkxarj09Y7", "pdf_size": 0, "rating": "4;4;6;7", "confidence": "4;4;3;2", "wc_review": "184;423;207;452", "wc_reply_reviewers": "198;37;0;121", "wc_reply_authors": "396;354;186;525", "reply_reviewers": "1;1;0;1", "reply_authors": "2;1;1;1", "rating_avg": [ 5.25, 1.299038105676658 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "wc_review_avg": [ 316.5, 121.70558738200971 ], "wc_reply_reviewers_avg": [ 89.0, 76.69745758498127 ], "wc_reply_authors_avg": [ 365.25, 121.16388694656507 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.986440050415621, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:q51Ajl0mtW4J:scholar.google.com/&scioq=Unified+recurrent+network+for+many+feature+types&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "title": "Relaxed Quantization for Discretized Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/855", "id": "HkxjYoCqKX", "author_site": "Christos Louizos, Matthias Reisser, Tijmen Blankevoort, Efstratios Gavves, Max Welling", "tldr": "We introduce a technique that allows for gradient based 
training of quantized neural networks.", "abstract": "Neural network quantization has become an important research area due to its great impact on deployment of large models on resource constrained devices. In order to train networks that can be effectively discretized without loss of performance, we introduce a differentiable quantization procedure. Differentiability can be achieved by transforming continuous distributions over the weights and activations of the network to categorical distributions over the quantization grid. These are subsequently relaxed to continuous surrogates that can allow for efficient gradient-based optimization. We further show that stochastic rounding can be seen as a special case of the proposed approach and that under this formulation the quantization grid itself can also be optimized with gradient descent. We experimentally validate the performance of our method on MNIST, CIFAR 10 and Imagenet classification.", "keywords": "Quantization;Compression;Neural Networks;Efficiency", "primary_area": "", "supplementary_material": "", "author": "Christos Louizos;Matthias Reisser;Tijmen Blankevoort;Efstratios Gavves;Max Welling", "authorids": "c.louizos@uva.nl;m.reisser@uva.nl;tijmen@qti.qualcomm.com;egavves@uva.nl;m.welling@uva.nl", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nlouizos2018relaxed,\ntitle={Relaxed Quantization for Discretized Neural Networks},\nauthor={Christos Louizos and Matthias Reisser and Tijmen Blankevoort and Efstratios Gavves and Max Welling},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkxjYoCqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;4", "wc_review": "226;167;225", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "164;129;195", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 206.0, 27.58018612458347 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 162.66666666666666, 26.96087700518826 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 224, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3712050618324227952&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HkxjYoCqKX", "pdf": "https://openreview.net/pdf?id=HkxjYoCqKX", "email": ";;;;", "author_num": 5 }, { "id": "Hkxr1nCcFm", "title": "An investigation of model-free planning", "track": "main", "status": "Reject", "tldr": "", "abstract": "The field of reinforcement learning (RL) is facing increasingly challenging domains with combinatorial complexity. For an RL agent to address these challenges, it is essential that it can plan effectively. Prior work has typically utilized an explicit model of the environment, combined with a specific planning algorithm (such as tree search). 
More recently, a new family of methods have been proposed that learn how to plan, by providing the structure for planning via an inductive bias in the function approximator (such as a tree structured neural network), trained end-to-end by a model-free RL algorithm. In this paper, we go even further, and demonstrate empirically that an entirely model-free approach, without special structure beyond standard neural network components such as convolutional networks and LSTMs, can learn to exhibit many of the hallmarks that we would typically associate with a model-based planner. We measure our agent's effectiveness at planning in terms of its ability to generalize across a combinatorial and irreversible state space, its data efficiency, and its ability to utilize additional thinking time. We find that our agent has the characteristics that one might expect to find in a planning algorithm. Furthermore, it exceeds the state-of-the-art in challenging combinatorial domains such as Sokoban and outperforms other model-free approaches that utilize strong inductive biases toward planning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Arthur Guez;Mehdi Mirza;Karol Gregor;Rishabh Kabra;S\u00e9bastien Racani\u00e8re;Th\u00e9ophane Weber;David Raposo;Adam Santoro;Laurent Orseau;Tom Eccles;Greg Wayne;David Silver;Timothy Lillicrap", "authorids": "aguez@google.com;mmirza@google.com;karolg@google.com;rkabra@google.com;sracaniere@google.com;theophane@google.com;draposo@google.com;adamsantoro@google.com;lorseau@google.com;eccles@google.com;gregwayne@google.com;davidsilver@google.com;countzero@google.com", "gender": ";;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;", "aff": ";;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;", "position": ";;;;;;;;;;;;", "bibtex": "@misc{\nguez2019an,\ntitle={An investigation of model-free planning},\nauthor={Arthur Guez and Mehdi Mirza and Karol Gregor and Rishabh Kabra and S\u00e9bastien Racani\u00e8re and Th\u00e9ophane Weber and David Raposo and Adam Santoro and Laurent Orseau and Tom Eccles and Greg Wayne and David Silver and Timothy Lillicrap},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkxr1nCcFm},\n}", "github": "[![github](/images/github_icon.svg) deepmind/boxoban-levels](https://github.com/deepmind/boxoban-levels)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hkxr1nCcFm", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;3;4", "wc_review": "585;403;643", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "403;427;709", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 543.6666666666666, 102.24589097964878 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 513.0, 138.93883546366726 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 13, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7566080617462830679&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "Hkxx3o0qFX", "title": "High Resolution and Fast Face Completion via Progressively Attentive GANs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Face completion is a 
challenging task with the difficulty level increasing significantly with respect to high resolution, the complexity of \"holes\" and the controllable attributes of filled-in fragments. Our system addresses the challenges by learning a fully end-to-end framework that trains generative adversarial networks (GANs) progressively from low resolution to high resolution with conditional vectors encoding controllable attributes. We design a novel coarse-to-fine attentive module network architecture. Our model is encouraged to attend on finer details while the network is growing to a higher resolution, thus being capable of showing progressive attention to different frequency components in a coarse-to-fine way. We term the module Frequency-oriented Attentive Module (FAM). Our system can complete faces with large structural and appearance variations using a single feed-forward pass of computation with mean inference time of 0.54 seconds for images at 1024x1024 resolution. A pilot human study shows our approach outperforms state-of-the-art face completion methods. The code will be released upon publication. ", "keywords": "Face Completion;progressive GANs;Attribute Control;Frequency-oriented Attention", "primary_area": "", "supplementary_material": "", "author": "Zeyuan Chen;Shaoliang Nie;Tianfu Wu;Christopher G. Healey", "authorids": "zchen23@ncsu.edu;snie@ncsu.edu;tianfu_wu@ncsu.edu;healey@ncsu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2019high,\ntitle={High Resolution and Fast Face Completion via Progressively Attentive {GAN}s},\nauthor={Zeyuan Chen and Shaoliang Nie and Tianfu Wu and Christopher G. Healey},\nyear={2019},\nurl={https://openreview.net/forum?id=Hkxx3o0qFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hkxx3o0qFX", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;2;5", "wc_review": "187;210;252", "wc_reply_reviewers": "147;0;0", "wc_reply_authors": "443;681;576", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "wc_review_avg": [ 216.33333333333334, 26.911377189252544 ], "wc_reply_reviewers_avg": [ 49.0, 69.29646455628166 ], "wc_reply_authors_avg": [ 566.6666666666666, 97.38697152197629 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Fqlv-L5vF3QJ:scholar.google.com/&scioq=High+Resolution+and+Fast+Face+Completion+via+Progressively+Attentive+GANs&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HkxzDiAcK7", "title": "Classification of Building Noise Type/Position via Supervised Learning", "track": "main", "status": "Withdraw", "tldr": "This paper presents noise type/position classification of various impact noises generated in a building which is a serious conflict issue in apartment complexes", "abstract": "This paper presents noise type/position classification of various impact noises generated in a building which is a serious conflict issue in apartment complexes. For this study, a collection of floor impact noise dataset is recorded with a single microphone. 
Noise types/positions are selected based on a report by the Floor Management Center under Korea Environmental Corporation. Using a convolutional neural networks based classifier, the impact noise signals converted to log-scaled Mel-spectrograms are classified into noise types or positions. Also, our model is evaluated on a standard environmental sound dataset ESC-50 to show extensibility on environmental sound classification.\n", "keywords": "impact noise;noise type classification;noise position classification;convolutional neural networks;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Hwiyong Choi;Haesang Yang;Seungjun Lee;Woojae Seong", "authorids": "its_me_chy@snu.ac.kr;coupon3@snu.ac.kr;tl7qns7ch@snu.ac.kr;wseong@snu.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=HkxzDiAcK7", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;2", "wc_review": "181;65;126", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 124.0, 47.37791327893902 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZNdHPt_vgkUJ:scholar.google.com/&scioq=Classification+of+Building+Noise+Type/Position+via+Supervised+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HkzL4hR9Ym", "title": "Shaping representations through communication", "track": "main", "status": "Withdraw", "tldr": "Motivated by theories of language and communication, we introduce community-based autoencoders, in which multiple encoders and decoders collectively learn structured and reusable representations.", "abstract": "Good representations facilitate transfer learning and few-shot learning. Motivated by theories of language and communication that explain why communities with large number of speakers have, on average, simpler languages with more regularity, we cast the representation learning problem in terms of learning to communicate. Our starting point sees traditional autoencoders as a single encoder with a fixed decoder partner that must learn to communicate. Generalizing from there, we introduce community-based autoencoders in which multiple encoders and decoders collectively learn representations by being randomly paired up on successive training iterations. 
Our experiments show that increasing community sizes reduce idiosyncrasies in the learned codes, resulting in more invariant representations with increased reusability and structure.", "keywords": "communication;language;representation learning;autoencoders", "primary_area": "", "supplementary_material": "", "author": "Olivier Tieleman;Angeliki Lazaridou;Shibl Mourad;Charles Blundell;Doina Precup", "authorids": "tieleman@google.com;angeliki@google.com;shibl@google.com;cblundell@google.com;doinap@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HkzL4hR9Ym", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "wc_review": "253;306;161", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 240.0, 59.90548110704618 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16541944653768233385&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HkzNXhC9KQ", "title": "Adaptive Sample-space & Adaptive Probability coding: a neural-network based approach for compression", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose Adaptive Sample-space & Adaptive Probability (ASAP) coding, an efficient neural-network based method for lossy data compression.\nOur ASAP coding distinguishes itself from the conventional method based on adaptive arithmetic coding in that it models the probability distribution for the quantization process in such a way that one can conduct back-propagation for the quantization width that determines the support of the distribution. \nOur ASAP also trains the model with a novel, hyper-parameter free multiplicative loss for the rate-distortion tradeoff. \nWith our ASAP encoder, we are able to compress the image files in the Kodak dataset to as low as one fifth the size of the JPEG-compressed image without compromising their visual quality, and achieved the state-of-the-art result in terms of MS-SSIM based rate-distortion tradeoff. 
", "keywords": "Data compression;Image compression;Deep Learning;Convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Ken Nakanishi;Shin-ichi Maeda;Takeru Miyato;Masanori Koyama", "authorids": "ikyhn1.ken.n@gmail.com;ichi@preferred.jp;miyato@preferred.jp;masomatics@preferred.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nnakanishi2019adaptive,\ntitle={Adaptive Sample-space & Adaptive Probability coding: a neural-network based approach for compression},\nauthor={Ken Nakanishi and Shin-ichi Maeda and Takeru Miyato and Masanori Koyama},\nyear={2019},\nurl={https://openreview.net/forum?id=HkzNXhC9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HkzNXhC9KQ", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;3", "wc_review": "465;318;79", "wc_reply_reviewers": "39;0;0", "wc_reply_authors": "816;669;204", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 287.3333333333333, 159.06881809106676 ], "wc_reply_reviewers_avg": [ 13.0, 18.384776310850235 ], "wc_reply_authors_avg": [ 563.0, 260.84861510078986 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:BEwkD2sbFSIJ:scholar.google.com/&scioq=Adaptive+Sample-space+%26+Adaptive+Probability+coding:+a+neural-network+based+approach+for+compression&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HkzOWnActX", "title": "Model-Agnostic Meta-Learning for Multimodal Task Distributions", "track": "main", "status": "Reject", "tldr": "We proposed a meta-learner that generalizes across a multimodal task distribution by identifying the modes of a task distribution and modulating its meta-learned prior parameters accordingly, allowing faster adaptation through gradient updates.", "abstract": "Gradient-based meta-learners such as MAML (Finn et al., 2017) are able to learn a meta-prior from similar tasks to adapt to novel tasks from the same distribution with few gradient updates. One important limitation of such frameworks is that they seek a common initialization shared across the entire task distribution, substantially limiting the diversity of the task distributions that they are able to learn from. In this paper, we augment MAML with the capability to identify tasks sampled from a multimodal task distribution and adapt quickly through gradient updates. Specifically, we propose a multimodal MAML algorithm that is able to modulate its meta-learned prior according to the identified task, allowing faster adaptation. We evaluate the proposed model on a diverse set of problems including regression, few-shot image classification, and reinforcement learning. 
The results demonstrate the effectiveness of our model in modulating the meta-learned prior in response to the characteristics of tasks sampled from a multimodal distribution.", "keywords": "Meta-learning;gradient-based meta-learning;model-based meta-learning", "primary_area": "", "supplementary_material": "", "author": "Risto Vuorio;Shao-Hua Sun;Hexiang Hu;Joseph J. Lim", "authorids": "vuoristo@gmail.com;shaohuas@usc.edu;hexiangh@usc.edu;limjj@usc.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nvuorio2019modelagnostic,\ntitle={Model-Agnostic Meta-Learning for Multimodal Task Distributions},\nauthor={Risto Vuorio and Shao-Hua Sun and Hexiang Hu and Joseph J. Lim},\nyear={2019},\nurl={https://openreview.net/forum?id=HkzOWnActX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HkzOWnActX", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;3;4", "wc_review": "778;235;172", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1765;766;511", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 395.0, 272.04043817050433 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1014.0, 541.1450822099375 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:U7zR3Z601MMJ:scholar.google.com/&scioq=Model-Agnostic+Meta-Learning+for+Multimodal+Task+Distributions&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Tree-Structured Recurrent Switching Linear Dynamical Systems for Multi-Scale Modeling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/888", "id": "HkzRQhR9YX", "author_site": "Josue Nassar, Scott W Linderman, Monica Bugallo, Il Memming Park", "tldr": "", "abstract": "Many real-world systems studied are governed by complex, nonlinear dynamics. By modeling these dynamics, we can gain insight into how these systems work, make predictions about how they will behave, and develop strategies for controlling them. While there are many methods for modeling nonlinear dynamical systems, existing techniques face a trade-off between offering interpretable descriptions and making accurate predictions. Here, we develop a class of models that aims to achieve both simultaneously, smoothly interpolating between simple descriptions and more complex, yet also more accurate models. Our probabilistic model achieves this multi-scale property through a hierarchy of locally linear dynamics that jointly approximate global nonlinear dynamics. We call it the tree-structured recurrent switching linear dynamical system. To fit this model, we present a fully-Bayesian sampling procedure using Polya-Gamma data augmentation to allow for fast and conjugate Gibbs sampling. 
Through a variety of synthetic and real examples, we show how these models outperform existing methods in both interpretability and predictive capability.", "keywords": "machine learning;bayesian statistics;dynamical systems", "primary_area": "", "supplementary_material": "", "author": "Josue Nassar;Scott Linderman;Monica Bugallo;Il Memming Park", "authorids": "josue.nassar@stonybrook.edu;scott.linderman@columbia.edu;monica.bugallo@stonybrook.edu;memming.park@stonybrook.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nnassar2018treestructured,\ntitle={Tree-Structured Recurrent Switching Linear Dynamical Systems for Multi-Scale Modeling},\nauthor={Josue Nassar and Scott Linderman and Monica Bugallo and Il Memming Park},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkzRQhR9YX},\n}", "github": "[![github](/images/github_icon.svg) catniplab/tree_structured_rslds](https://github.com/catniplab/tree_structured_rslds)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;2;2", "wc_review": "381;434;159", "wc_reply_reviewers": "47;32;43", "wc_reply_authors": "515;718;319", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 324.6666666666667, 119.12551737091802 ], "wc_reply_reviewers_avg": [ 40.666666666666664, 6.342099196813483 ], "wc_reply_authors_avg": [ 517.3333333333334, 162.89942363993256 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10945679458649765039&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HkzRQhR9YX", "pdf": "https://openreview.net/pdf?id=HkzRQhR9YX", "email": ";;;", "author_num": 4 }, { "title": "STCN: Stochastic Temporal Convolutional Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1126", "id": "HkzSQhCcK7", "author_site": "Emre Aksan, Otmar Hilliges", "tldr": "We combine the computational advantages of temporal convolutional architectures with the expressiveness of stochastic latent variables.", "abstract": "Convolutional architectures have recently been shown to be competitive on many\nsequence modelling tasks when compared to the de-facto standard of recurrent neural networks (RNNs) while providing computational and modelling advantages due to inherent parallelism. However, currently, there remains a performance\ngap to more expressive stochastic RNN variants, especially those with several layers of dependent random variables. In this work, we propose stochastic temporal convolutional networks (STCNs), a novel architecture that combines the computational advantages of temporal convolutional networks (TCN) with the representational power and robustness of stochastic latent spaces. In particular, we propose a hierarchy of stochastic latent variables that captures temporal dependencies at different time-scales. The architecture is modular and flexible due to the decoupling of the deterministic and stochastic layers. 
We show that the proposed architecture achieves state of the art log-likelihoods across several tasks. Finally, the model is capable of predicting high-quality synthetic samples over a long-range temporal horizon in modelling of handwritten text.", "keywords": "latent variables;variational inference;temporal convolutional networks;sequence modeling;auto-regressive modeling", "primary_area": "", "supplementary_material": "", "author": "Emre Aksan;Otmar Hilliges", "authorids": "eaksan@inf.ethz.ch;otmar.hilliges@inf.ethz.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\naksan2018stcn,\ntitle={{STCN}: Stochastic Temporal Convolutional Networks},\nauthor={Emre Aksan and Otmar Hilliges},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HkzSQhCcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;5", "wc_review": "469;565;586", "wc_reply_reviewers": "325;0;0", "wc_reply_authors": "498;511;1066", "reply_reviewers": "1;0;0", "reply_authors": "2;2;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 540.0, 50.93132631298737 ], "wc_reply_reviewers_avg": [ 108.33333333333333, 153.2064692570853 ], "wc_reply_authors_avg": [ 691.6666666666666, 264.74683924249007 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15721878191744318436&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=HkzSQhCcK7", "pdf": "https://openreview.net/pdf?id=HkzSQhCcK7", "email": ";", "author_num": 2 }, { "id": "HkzZBi0cFQ", "title": "Quantization for Rapid Deployment of Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper aims at rapid deployment of the state-of-the-art deep neural networks (DNNs) to energy efficient accelerators without time-consuming fine tuning or the availability of the full datasets. Converting DNNs in full precision to limited precision is essential in taking advantage of the accelerators with reduced memory footprint and computation power. However, such a task is not trivial since it often requires the full training and validation datasets for profiling the network statistics and fine tuning the networks to recover the accuracy lost after quantization. To address these issues, we propose a simple method recognizing channel-level distribution to reduce the quantization-induced accuracy loss and minimize the required image samples for profiling. We evaluated our method on eleven networks trained on the ImageNet classification benchmark and a network trained on the Pascal VOC object detection benchmark. 
The results prove that the networks can be quantized into 8-bit integer precision without fine tuning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jun Haeng Lee;Sangwon Ha;Saerom Choi;Won-Jo Lee;Seungwon Lee", "authorids": "junhaeng2.lee@samsung.com;sw815.ha@samsung.com;sincere.choi@samsung.com;w-j.lee@samsung.com;seungw.lee@samsung.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nlee2019quantization,\ntitle={Quantization for Rapid Deployment of Deep Neural Networks},\nauthor={Jun Haeng Lee and Sangwon Ha and Saerom Choi and Won-Jo Lee and Seungwon Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=HkzZBi0cFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=HkzZBi0cFQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;3", "wc_review": "366;59;337", "wc_reply_reviewers": "0;0;319", "wc_reply_authors": "202;118;230", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 254.0, 138.39315975389343 ], "wc_reply_reviewers_avg": [ 106.33333333333333, 150.37804213233912 ], "wc_reply_authors_avg": [ 183.33333333333334, 47.59084879353266 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15657651216355540564&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "HkzyX3CcFQ", "title": "Contextual Recurrent Convolutional Model for Robust Visual Learning", "track": "main", "status": "Reject", "tldr": "we proposed a novel contextual recurrent convolutional network with robust property of visual learning ", "abstract": "Feedforward convolutional neural network has achieved a great success in many computer vision tasks. While it validly imitates the hierarchical structure of biological visual system, it still lacks one essential architectural feature: contextual recurrent connections with feedback, which widely exists in biological visual system. In this work, we designed a Contextual Recurrent Convolutional Network with this feature embedded in a standard CNN structure. We found that such feedback connections could enable lower layers to ``rethink\" about their representations given the top-down contextual information. We carefully studied the components of this network, and showed its robustness and superiority over feedforward baselines in such tasks as noise image classification, partially occluded object recognition and fine-grained image classification. 
We believe this work could be an important step toward bridging the gap between computer vision models and the real biological visual system.", "keywords": "contextual modulation;recurrent convolutional network;robust visual learning", "primary_area": "", "supplementary_material": "", "author": "Siming Yan*;Bowen Xiao*;Yimeng Zhang;Tai Sing Lee", "authorids": "simingyan@pku.edu.cn;mike.xiao@pku.edu.cn;zym1010@gmail.com;taislee@andrew.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyan*2019contextual,\ntitle={Contextual Recurrent Convolutional Model for Robust Visual Learning},\nauthor={Siming Yan* and Bowen Xiao* and Yimeng Zhang and Tai Sing Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=HkzyX3CcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HkzyX3CcFQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;5;4", "wc_review": "310;460;91", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "756;825;374", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 287.0, 151.51897570931504 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 651.6666666666666, 198.3504194320972 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EUR3GaPVf5cJ:scholar.google.com/&scioq=Contextual+Recurrent+Convolutional+Model+for+Robust+Visual+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "Hy4R2oRqKQ", "title": "Canonical Correlation Analysis with Implicit Distributions", "track": "main", "status": "Reject", "tldr": "This paper presents a theoretical study for CCA based on implicit distributions and proposes a generative nonlinear CCA variant which achieves consistent encoding for the multi-view input.", "abstract": "Canonical Correlation Analysis (CCA) is a ubiquitous technique that shows promising performance in multi-view learning problems. Due to the conjugacy of the prior and the likelihood, probabilistic CCA (PCCA) presents the posterior with an analytic solution, which provides a probabilistic interpretation for classic linear CCA. As the multi-view data are usually complex in practice, nonlinear mappings are adopted to capture nonlinear dependency among the views. However, the interpretation provided in PCCA cannot be generalized to this nonlinear setting, as the distribution assumptions on the prior and the likelihood make it restrictive to capture nonlinear dependency. To overcome this bottleneck, in this paper, we provide a novel perspective for CCA based on implicit distributions. Specifically, we present minimum Conditional Mutual Information (CMI) as a new criterion to capture nonlinear dependency for the multi-view learning problem. To eliminate the explicit distribution requirement in direct estimation of CMI, we derive an objective whose minimization implicitly leads to the proposed criterion. 
Based on this objective, we present an implicit probabilistic formulation for CCA, named Implicit CCA (ICCA), which provides a flexible framework to design CCA extensions with implicit distributions. As an instantiation, we present adversarial CCA (ACCA), a nonlinear CCA variant which benefits from consistent encoding achieved by adversarial learning. Quantitative correlation analysis and superior performance on cross-view generation task demonstrate the superiority of the proposed ACCA.", "keywords": "Canonical Correlation Analysis;implicit probabilistic model;cross-view structure output prediction", "primary_area": "", "supplementary_material": "", "author": "Yaxin Shi;Donna Xu;Yuangang Pan;Ivor Tsang", "authorids": "yaxin.shi@student.uts.edu.au;donna.xu@student.uts.edu.au;yuangang.pan@student.uts.edu.au;ivor.tsang@uts.edu.au", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshi2019canonical,\ntitle={Canonical Correlation Analysis with Implicit Distributions},\nauthor={Yaxin Shi and Donna Xu and Yuangang Pan and Ivor Tsang},\nyear={2019},\nurl={https://openreview.net/forum?id=Hy4R2oRqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Hy4R2oRqKQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;5;4", "wc_review": "484;186;331", "wc_reply_reviewers": "0;0;119", "wc_reply_authors": "1152;132;589", "reply_reviewers": "0;0;1", "reply_authors": "2;1;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 333.6666666666667, 121.67260259492365 ], "wc_reply_reviewers_avg": [ 39.666666666666664, 56.09713797413277 ], "wc_reply_authors_avg": [ 624.3333333333334, 417.16210544849616 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rzByNkr45hIJ:scholar.google.com/&scioq=Canonical+Correlation+Analysis+with+Implicit+Distributions&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HyEl3o05Fm", "title": "Stochastic Adversarial Video Prediction", "track": "main", "status": "Reject", "tldr": "", "abstract": "Being able to predict what may happen in the future requires an in-depth understanding of the physical and causal rules that govern the world. A model that is able to do so has a number of appealing applications, from robotic planning to representation learning. However, learning to predict raw future observations, such as frames in a video, is exceedingly challenging\u2014the ambiguous nature of the problem can cause a naively designed model to average together possible futures into a single, blurry prediction. Recently, this has been addressed by two distinct approaches: (a) latent variational variable models that explicitly model underlying stochasticity and (b) adversarially-trained models that aim to produce naturalistic images. However, a standard latent variable model can struggle to produce realistic results, and a standard adversarially-trained model underutilizes latent variables and fails to produce diverse predictions. We show that these distinct methods are in fact complementary. 
Combining the two produces predictions that look more realistic to human raters and better cover the range of possible futures. Our method outperforms prior works in these aspects.", "keywords": "video prediction;GANs;variational autoencoder", "primary_area": "", "supplementary_material": "", "author": "Alex X. Lee;Richard Zhang;Frederik Ebert;Pieter Abbeel;Chelsea Finn;Sergey Levine", "authorids": "alexlee_gk@cs.berkeley.edu;rich.zhang@eecs.berkeley.edu;febert@berkeley.edu;pabbeel@cs.berkeley.edu;cbfinn@eecs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nlee2019stochastic,\ntitle={Stochastic Adversarial Video Prediction},\nauthor={Alex X. Lee and Richard Zhang and Frederik Ebert and Pieter Abbeel and Chelsea Finn and Sergey Levine},\nyear={2019},\nurl={https://openreview.net/forum?id=HyEl3o05Fm},\n}", "github": "[![github](/images/github_icon.svg) alexlee-gk/video_prediction](https://github.com/alexlee-gk/video_prediction) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=HyEl3o05Fm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyEl3o05Fm", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;4;3", "wc_review": "487;351;480", "wc_reply_reviewers": "474;0;0", "wc_reply_authors": "1244;350;647", "reply_reviewers": "1;0;0", "reply_authors": "3;1;2", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 439.3333333333333, 62.52643885234114 ], "wc_reply_reviewers_avg": [ 158.0, 223.44574285494903 ], "wc_reply_authors_avg": [ 747.0, 371.7606757041417 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 542, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=439155926470794521&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Soft Q-Learning with Mutual-Information Regularization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/822", "id": "HyEtjoCqFX", "author_site": "Jordi Grau-Moya, Felix Leibfried, Peter Vrancx", "tldr": "", "abstract": "We propose a reinforcement learning (RL) algorithm that uses mutual-information regularization to optimize a prior action distribution for better performance and exploration. Entropy-based regularization has previously been shown to improve both exploration and robustness in challenging sequential decision-making tasks. It does so by encouraging policies to put probability mass on all actions. However, entropy regularization might be undesirable when actions have significantly different importance. In this paper, we propose a theoretically motivated framework that dynamically weights the importance of actions by using the mutual-information. In particular, we express the RL problem as an inference problem where the prior probability distribution over actions is subject to optimization. We show that the prior optimization introduces a mutual-information regularizer in the RL objective. 
This regularizer encourages the policy to be close to a non-uniform distribution that assigns higher probability mass to more important actions. We empirically demonstrate that our method significantly improves over entropy regularization methods and unregularized methods.", "keywords": "reinforcement learning;regularization;entropy;mutual information", "primary_area": "", "supplementary_material": "", "author": "Jordi Grau-Moya;Felix Leibfried;Peter Vrancx", "authorids": "jordi@prowler.io;felix@prowler.io;peter@prowler.io", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngrau-moya2018soft,\ntitle={Soft Q-Learning with Mutual-Information Regularization},\nauthor={Jordi Grau-Moya and Felix Leibfried and Peter Vrancx},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyEtjoCqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;4", "wc_review": "715;328;219", "wc_reply_reviewers": "44;81;143", "wc_reply_authors": "1523;425;195", "reply_reviewers": "2;1;1", "reply_authors": "4;2;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 420.6666666666667, 212.82909157871774 ], "wc_reply_reviewers_avg": [ 89.33333333333333, 40.84387618997764 ], "wc_reply_authors_avg": [ 714.3333333333334, 579.4717901292138 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13711644038639649091&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=HyEtjoCqFX", "pdf": "https://openreview.net/pdf?id=HyEtjoCqFX", "email": ";;", "author_num": 3 }, { "id": "HyG1_j0cYQ", "title": "Pumpout: A Meta Approach for Robustly Training Deep Neural Networks with Noisy Labels", "track": "main", "status": "Reject", "tldr": "Starting from tomorrow, never worry about your DNNs memorizing noisy labels---forget bad labels by Pumpout in an active manner!", "abstract": "It is challenging to train deep neural networks robustly on the industrial-level data, since labels of such data are heavily noisy, and their label generation processes are normally agnostic. To handle these issues, by using the memorization effects of deep neural networks, we may train deep neural networks on the whole dataset only the first few iterations. Then, we may employ early stopping or the small-loss trick to train them on selected instances. However, in such training procedures, deep neural networks inevitably memorize some noisy labels, which will degrade their generalization. In this paper, we propose a meta algorithm called Pumpout to overcome the problem of memorizing noisy labels. By using scaled stochastic gradient ascent, Pumpout actively squeezes out the negative effects of noisy labels from the training model, instead of passively forgetting these effects. We leverage Pumpout to upgrade two representative methods: MentorNet and Backward Correction. 
Empirical results on benchmark vision and text datasets demonstrate that Pumpout can significantly improve the robustness of representative methods.", "keywords": "Noisy Labels;Deep Learning;Meta Approach", "primary_area": "", "supplementary_material": "", "author": "Bo Han;Gang Niu;Jiangchao Yao;Xingrui Yu;Miao Xu;Ivor Tsang;Masashi Sugiyama", "authorids": "bo.han@riken.jp;gang.niu@riken.jp;jiangchao.yao@student.uts.edu.au;xingrui.yu@student.uts.edu.au;miao.xu@riken.jp;ivor.tsang@uts.edu.au;sugi@k.u-tokyo.ac.jp", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nhan2019pumpout,\ntitle={Pumpout: A Meta Approach for Robustly Training Deep Neural Networks with Noisy Labels},\nauthor={Bo Han and Gang Niu and Jiangchao Yao and Xingrui Yu and Miao Xu and Ivor Tsang and Masashi Sugiyama},\nyear={2019},\nurl={https://openreview.net/forum?id=HyG1_j0cYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyG1_j0cYQ", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;5;3", "wc_review": "149;640;135", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "389;596;350", "reply_reviewers": "0;0;0", "reply_authors": "2;6;2", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 308.0, 234.8290158107951 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 445.0, 107.95369377654477 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.3273268353539886, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12037164275271041058&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "On the Turing Completeness of Modern Neural Network Architectures", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/707", "id": "HyGBdo0qFm", "author_site": "Jorge P\u00e9rez, Javier Marinkovi\u0107, Pablo Barcel\u00f3", "tldr": "We show that the Transformer architecture and the Neural GPU are Turing complete.", "abstract": "Alternatives to recurrent neural networks, in particular, architectures based on attention or convolutions, have been gaining momentum for processing input sequences. In spite of their relevance, the computational properties of these alternatives have not yet been fully explored. We study the computational power of two of the most paradigmatic architectures exemplifying these mechanisms: the Transformer (Vaswani et al., 2017) and the Neural GPU (Kaiser & Sutskever, 2016). We show both models to be Turing complete exclusively based on their capacity to compute and access internal dense representations of the data. In particular, neither the Transformer nor the Neural GPU requires access to an external memory to become Turing complete. 
Our study also reveals some minimal sets of elements needed to obtain these completeness results.", "keywords": "Transformer;NeuralGPU;Turing completeness", "primary_area": "", "supplementary_material": "", "author": "Jorge P\u00e9rez;Javier Marinkovi\u0107;Pablo Barcel\u00f3", "authorids": "jperez@dcc.uchile.cl;javier.marinkovic95@gmail.com;pbarcelo@dcc.uchile.cl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\np\u00e9rez2018on,\ntitle={On the Turing Completeness of Modern Neural Network Architectures},\nauthor={Jorge P\u00e9rez and Javier Marinkovi\u0107 and Pablo Barcel\u00f3},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyGBdo0qFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;2;2", "wc_review": "210;371;276", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.0, 0.0 ], "wc_review_avg": [ 285.6666666666667, 66.08244009484585 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 186, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11242133264938493225&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HyGBdo0qFm", "pdf": "https://openreview.net/pdf?id=HyGBdo0qFm", "email": ";;", "author_num": 3 }, { "id": "HyGDdsCcFQ", "title": "Better Generalization with On-the-fly Dataset Denoising", "track": "main", "status": "Reject", "tldr": "We introduce a fast and easy-to-implement algorithm that is robust to dataset noise.", "abstract": "Memorization in over-parameterized neural networks can severely hurt generalization in the presence of mislabeled examples. However, mislabeled examples are hard to avoid in extremely large datasets. We address this problem using the implicit regularization effect of stochastic gradient descent with large learning rates, which we find to be able to separate clean and mislabeled examples with remarkable success using loss statistics. We leverage this to identify and discard mislabeled examples on the fly using a threshold on their losses. This leads to On-the-fly Data Denoising (ODD), a simple yet effective algorithm that is robust to mislabeled examples, while introducing almost zero computational overhead. 
Empirical results demonstrate the effectiveness of ODD on several datasets containing artificial and real-world mislabeled examples.", "keywords": "dataset denoising;supervised learning;implicit regularization", "primary_area": "", "supplementary_material": "", "author": "Jiaming Song;Tengyu Ma;Michael Auli;Yann Dauphin", "authorids": "jiaming.tsong@gmail.com;tengyuma@cs.stanford.edu;michael.auli@gmail.com;yann@dauphin.io", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsong2019better,\ntitle={Better Generalization with On-the-fly Dataset Denoising},\nauthor={Jiaming Song and Tengyu Ma and Michael Auli and Yann Dauphin},\nyear={2019},\nurl={https://openreview.net/forum?id=HyGDdsCcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyGDdsCcFQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;5;3", "wc_review": "419;346;207", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "698;849;387", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 324.0, 87.93558248323978 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 644.6666666666666, 192.34402050030621 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10384396863120584847&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Improving Differentiable Neural Computers Through Memory Masking, De-allocation, and Link Distribution Sharpness Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/691", "id": "HyGEM3C9KQ", "author_site": "Robert Csordas, J\u00fcrgen Schmidhuber", "tldr": "", "abstract": "The Differentiable Neural Computer (DNC) can learn algorithmic and question answering tasks. An analysis of its internal activation patterns reveals three problems: Most importantly, the lack of key-value separation makes the address distribution resulting from content-based look-up noisy and flat, since the value influences the score calculation, although only the key should. Second, DNC's de-allocation of memory results in aliasing, which is a problem for content-based look-up. Thirdly, chaining memory reads with the temporal linkage matrix exponentially degrades the quality of the address distribution. 
Our proposed fixes of these problems yield improved performance on arithmetic tasks, and also improve the mean error rate on the bAbI question answering dataset by 43%.", "keywords": "rnn;dnc;memory augmented neural networks;mann", "primary_area": "", "supplementary_material": "", "author": "Robert Csordas;Juergen Schmidhuber", "authorids": "robert@idsia.ch;juergen@idsia.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ncsordas2018improving,\ntitle={Improving Differentiable Neural Computers Through Memory Masking, De-allocation, and Link Distribution Sharpness Control},\nauthor={Robert Csordas and Juergen Schmidhuber},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyGEM3C9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;5;5", "wc_review": "200;547;176", "wc_reply_reviewers": "0;15;0", "wc_reply_authors": "6;207;47", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 307.6666666666667, 169.51761625926144 ], "wc_reply_reviewers_avg": [ 5.0, 7.0710678118654755 ], "wc_reply_authors_avg": [ 86.66666666666667, 86.719214838594 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9465849868631633208&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HyGEM3C9KQ", "pdf": "https://openreview.net/pdf?id=HyGEM3C9KQ", "email": ";", "author_num": 2 }, { "title": "Evaluating Robustness of Neural Networks with Mixed Integer Programming", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/817", "id": "HyGIdiRqtm", "author_site": "Vincent Tjeng, Kai Xiao, Russ Tedrake", "tldr": "We efficiently verify the robustness of deep neural models with over 100,000 ReLUs, certifying more samples than the state-of-the-art and finding more adversarial examples than a strong first-order attack.", "abstract": "Neural networks trained only to optimize for training accuracy can often be fooled by adversarial examples --- slightly perturbed inputs misclassified with high confidence. Verification of networks enables us to gauge their vulnerability to such adversarial examples. We formulate verification of piecewise-linear neural networks as a mixed integer program. On a representative task of finding minimum adversarial distortions, our verifier is two to three orders of magnitude quicker than the state-of-the-art. We achieve this computational speedup via tight formulations for non-linearities, as well as a novel presolve algorithm that makes full use of all information available. The computational speedup allows us to verify properties on convolutional and residual networks with over 100,000 ReLUs --- several orders of magnitude more than networks previously verified by any complete verifier. 
In particular, we determine for the first time the exact adversarial accuracy of an MNIST classifier to perturbations with bounded l-\u221e norm \u03b5=0.1: for this classifier, we find an adversarial example for 4.38% of samples, and a certificate of robustness to norm-bounded perturbations for the remainder. Across all robust training procedures and network architectures considered, and for both the MNIST and CIFAR-10 datasets, we are able to certify more samples than the state-of-the-art and find more adversarial examples than a strong first-order attack.", "keywords": "verification;adversarial robustness;adversarial examples;deep learning", "primary_area": "", "supplementary_material": "", "author": "Vincent Tjeng;Kai Y. Xiao;Russ Tedrake", "authorids": "vtjeng@mit.edu;kaix@mit.edu;russt@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ntjeng2018evaluating,\ntitle={Evaluating Robustness of Neural Networks with Mixed Integer Programming},\nauthor={Vincent Tjeng and Kai Y. Xiao and Russ Tedrake},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyGIdiRqtm},\n}", "github": "[![github](/images/github_icon.svg) vtjeng/MIPVerify.jl](https://github.com/vtjeng/MIPVerify.jl) + [![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=HyGIdiRqtm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "1;5;5", "wc_review": "98;579;227", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "6;1299;65", "reply_reviewers": "0;0;0", "reply_authors": "1;4;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.8856180831641267 ], "wc_review_avg": [ 301.3333333333333, 203.2803209582494 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 456.6666666666667, 596.106440905388 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1123, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18154476008132424293&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HyGIdiRqtm", "pdf": "https://openreview.net/pdf?id=HyGIdiRqtm", "email": ";;", "author_num": 3 }, { "id": "HyGLy2RqtQ", "title": "Over-parameterization Improves Generalization in the XOR Detection Problem", "track": "main", "status": "Reject", "tldr": "We show in a simplified learning task that over-parameterization improves generalization of a convnet that is trained with gradient descent.", "abstract": "Empirical evidence suggests that neural networks with ReLU activations generalize better with over-parameterization. However, there is currently no theoretical analysis that explains this observation. In this work, we study a simplified learning task with over-parameterized convolutional networks that empirically exhibits the same qualitative phenomenon. For this setting, we provide a theoretical analysis of the optimization and generalization performance of gradient descent. 
Specifically, we prove data-dependent sample complexity bounds which show that over-parameterization improves the generalization performance of gradient descent.", "keywords": "deep learning;theory;non convex optimization;over-parameterization", "primary_area": "", "supplementary_material": "", "author": "Alon Brutzkus;Amir Globerson", "authorids": "brutzkus@gmail.com;amir.globerson@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbrutzkus2019overparameterization,\ntitle={Over-parameterization Improves Generalization in the {XOR} Detection Problem},\nauthor={Alon Brutzkus and Amir Globerson},\nyear={2019},\nurl={https://openreview.net/forum?id=HyGLy2RqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyGLy2RqtQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "wc_review": "391;297;291", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "224;278;368", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 326.3333333333333, 45.79179936286506 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 290.0, 59.39696961966999 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12618861705302875267&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Random mesh projectors for inverse problems", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/704", "id": "HyGcghRct7", "author_site": "Konik Kothari, Sidharth Gupta, Maarten V de Hoop, Ivan Dokmanic", "tldr": "We solve ill-posed inverse problems with scarce ground truth examples by estimating an ensemble of random projections of the model instead of the model itself.", "abstract": "We propose a new learning-based approach to solve ill-posed inverse problems in imaging. We address the case where ground truth training samples are rare and the problem is severely ill-posed---both because of the underlying physics and because we can only get few measurements. This setting is common in geophysical imaging and remote sensing. We show that in this case the common approach to directly learn the mapping from the measured data to the reconstruction becomes unstable. Instead, we propose to first learn an ensemble of simpler mappings from the data to projections of the unknown image into random piecewise-constant subspaces. We then combine the projections to form a final reconstruction by solving a deconvolution-like problem. We show experimentally that the proposed method is more robust to measurement noise and corruptions not seen during training than a directly learned inverse.", "keywords": "imaging;inverse problems;subspace projections;random Delaunay triangulations;CNN;geophysics;regularization", "primary_area": "", "supplementary_material": "", "author": "Konik Kothari*;Sidharth Gupta*;Maarten v. 
de Hoop;Ivan Dokmanic", "authorids": "kkothar3@illinois.edu;gupta67@illinois.edu;mdehoop@rice.edu;dokmanic@illinois.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nkothari2018random,\ntitle={Random mesh projectors for inverse problems},\nauthor={Konik Kothari and Sidharth Gupta and Maarten v. de Hoop and Ivan Dokmanic},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyGcghRct7},\n}", "github": "[![github](/images/github_icon.svg) swing-research/deepmesh](https://github.com/swing-research/deepmesh)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;4;4", "wc_review": "590;439;272", "wc_reply_reviewers": "419;171;54", "wc_reply_authors": "5648;1258;141", "reply_reviewers": "1;1;1", "reply_authors": "9;3;2", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 433.6666666666667, 129.877720268806 ], "wc_reply_reviewers_avg": [ 214.66666666666666, 152.1760676175973 ], "wc_reply_authors_avg": [ 2349.0, 2376.898960129914 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 4.666666666666667, 3.0912061651652345 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9449111825230683, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1149610136001098856&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=HyGcghRct7", "pdf": "https://openreview.net/pdf?id=HyGcghRct7", "email": ";;;", "author_num": 4 }, { "id": "HyGh4sR9YQ", "title": "Deep Neuroevolution: Genetic Algorithms are a Competitive Alternative for Training Deep Neural Networks for Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep artificial neural networks (DNNs) are typically trained via gradient-based learning algorithms, namely backpropagation. \nEvolution strategies (ES) can rival backprop-based algorithms such as Q-learning and policy gradients on challenging deep reinforcement learning (RL) problems. However, ES can be considered a gradient-based algorithm because it performs stochastic gradient descent via an operation similar to a finite-difference approximation of the gradient.\nThat raises the question of whether non-gradient-based evolutionary algorithms can work at DNN scales. \nHere we demonstrate they can: we evolve the weights of a DNN with a simple, gradient-free, population-based genetic algorithm (GA) and it performs well on hard deep RL problems, including Atari and humanoid locomotion. The Deep GA successfully evolves networks with over four million free parameters, the largest neural networks ever evolved with a traditional evolutionary algorithm. These results (1) expand our sense of the scale at which GAs can operate, (2) suggest intriguingly that in some cases following the gradient is not the best choice for optimizing performance, and (3) make immediately available the multitude of neuroevolution techniques that improve performance. 
We demonstrate the latter by showing that combining DNNs with novelty search, which encourages exploration on tasks with deceptive or sparse reward functions, can solve a high-dimensional problem on which reward-maximizing algorithms (e.g.\\ DQN, A3C, ES, and the GA) fail. Additionally, the Deep GA is faster than ES, A3C, and DQN (it can train Atari in {\\raise.17ex\\hbox{$\\scriptstyle\\sim$}}4 hours on one workstation or {\\raise.17ex\\hbox{$\\scriptstyle\\sim$}}1 hour distributed on 720 cores), and enables a state-of-the-art, up to 10,000-fold compact encoding technique. ", "keywords": "Neuroevolution;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Felipe Petroski Such;Vashisht Madhavan;Edoardo Conti;Joel Lehman;Kenneth O. Stanley;Jeff Clune", "authorids": "felipe.such@uber.com;vashisht@uber.com;edoardo@uber.com;joel.lehman@uber.com;kstanley@uber.com;jeffclune@uber.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nsuch2019deep,\ntitle={Deep Neuroevolution: Genetic Algorithms are a Competitive Alternative for Training Deep Neural Networks for Reinforcement Learning},\nauthor={Felipe Petroski Such and Vashisht Madhavan and Edoardo Conti and Joel Lehman and Kenneth O. Stanley and Jeff Clune},\nyear={2019},\nurl={https://openreview.net/forum?id=HyGh4sR9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer6;AnonReviewer3;AnonReviewer5;AnonReviewer2", "site": "https://openreview.net/forum?id=HyGh4sR9YQ", "pdf_size": 0, "rating": "3;4;6;6;7", "confidence": "4;4;2;4;5", "wc_review": "550;329;299;488;763", "wc_reply_reviewers": "300;0;0;97;29", "wc_reply_authors": "634;762;441;685;343", "reply_reviewers": "1;0;0;1;1", "reply_authors": "1;1;1;2;2", "rating_avg": [ 5.2, 1.469693845669907 ], "confidence_avg": [ 3.8, 0.9797958971132712 ], "wc_review_avg": [ 485.8, 167.5988066783293 ], "wc_reply_reviewers_avg": [ 85.2, 113.09712640027598 ], "wc_reply_authors_avg": [ 573.0, 156.4033247728449 ], "reply_reviewers_avg": [ 0.6, 0.48989794855663565 ], "reply_authors_avg": [ 1.4, 0.4898979485566356 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0277777777777778, "gs_citation": 988, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7576282196045483976&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Multi-Agent Dual Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1045", "id": "HyGhN2A5tm", "author_site": "Yiren Wang, Yingce Xia, Tianyu He, Fei Tian, Tao Qin, ChengXiang Zhai, Tie-Yan Liu", "tldr": "", "abstract": "Dual learning has attracted much attention in machine learning, computer vision and natural language processing communities. The core idea of dual learning is to leverage the duality between the primal task (mapping from domain X to domain Y) and dual task (mapping from domain Y to X) to boost the performances of both tasks. Existing dual learning framework forms a system with two agents (one primal model and one dual model) to utilize such duality. In this paper, we extend this framework by introducing multiple primal and dual models, and propose the multi-agent dual learning framework. Experiments on neural machine translation and image translation tasks demonstrate the effectiveness of the new framework. 
\nIn particular, we set a new record on IWSLT 2014 German-to-English translation with a 35.44 BLEU score, achieve a 31.03 BLEU score on WMT 2014 English-to-German translation with over 2.6 BLEU improvement over the strong Transformer baseline, and set a new record of 49.61 BLEU score on the recent WMT 2018 English-to-German translation.", "keywords": "Dual Learning;Machine Learning;Neural Machine Translation", "primary_area": "", "supplementary_material": "", "author": "Yiren Wang;Yingce Xia;Tianyu He;Fei Tian;Tao Qin;ChengXiang Zhai;Tie-Yan Liu", "authorids": "yiren@illinois.edu;yingce.xia@gmail.com;hetianyu@mail.ustc.edu.cn;fetia@microsoft.com;taoqin@microsoft.com;czhai@illinois.edu;tie-yan.liu@microsoft.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nwang2018multiagent,\ntitle={Multi-Agent Dual Learning},\nauthor={Yiren Wang and Yingce Xia and Tianyu He and Fei Tian and Tao Qin and ChengXiang Zhai and Tie-Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyGhN2A5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;2;4", "wc_review": "471;270;508", "wc_reply_reviewers": "28;0;0", "wc_reply_authors": "1670;1031;625", "reply_reviewers": "1;0;0", "reply_authors": "3;2;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 416.3333333333333, 104.57001907281499 ], "wc_reply_reviewers_avg": [ 9.333333333333334, 13.199326582148887 ], "wc_reply_authors_avg": [ 1108.6666666666667, 430.13977056559446 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16564645746537397303&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HyGhN2A5tm", "pdf": "https://openreview.net/pdf?id=HyGhN2A5tm", "email": ";;;;;;", "author_num": 7 }, { "id": "HyGySsAct7", "title": "Targeted Adversarial Examples for Black Box Audio Systems", "track": "main", "status": "Reject", "tldr": "We present a novel black-box targeted attack on speech to text systems that supports arbitrarily long adversarial transcriptions and achieves state of the art performance.", "abstract": "The application of deep recurrent networks to audio transcription has led to impressive gains in automatic speech recognition (ASR) systems. Many have demonstrated that small adversarial perturbations can fool deep neural networks into incorrectly predicting a specified target with high confidence. Current work on fooling ASR systems have focused on white-box attacks, in which the model architecture and parameters are known. In this paper, we adopt a black-box approach to adversarial generation, combining the approaches of both genetic algorithms and gradient estimation to solve the task. 
We achieve a 89.25% targeted attack similarity after 3000 generations while maintaining 94.6% audio file similarity.", "keywords": "adversarial attack;adversarial examples;audio processing;speech to text;deep learning;adversarial audio;black box;machine learning", "primary_area": "", "supplementary_material": "", "author": "Rohan Taori;Amog Kamsetty;Brenton Chu;Nikita Vemuri", "authorids": "rohantaori@berkeley.edu;amogkamsetty@berkeley.edu;brentonlongchu@berkeley.edu;nikitavemuri@berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ntaori2019targeted,\ntitle={Targeted Adversarial Examples for Black Box Audio Systems},\nauthor={Rohan Taori and Amog Kamsetty and Brenton Chu and Nikita Vemuri},\nyear={2019},\nurl={https://openreview.net/forum?id=HyGySsAct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyGySsAct7", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;3;4", "wc_review": "569;275;319", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "593;259;60", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 387.6666666666667, 129.47415014417186 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 304.0, 219.91058789123062 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.18898223650461363, "gs_citation": 223, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5869651379538606422&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "Complement Objective Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/678", "id": "HyM7AiA5YX", "author_site": "Hao-Yun Chen, Pei-Hsin Wang, Chun-Hao Liu, Shih-Chieh Chang, Jia-Yu Pan, Yu-Ting Chen, Wei Wei, Da-Cheng Juan", "tldr": "We propose Complement Objective Training (COT), a new training paradigm that optimizes both the primary and complement objectives for effectively learning the parameters of neural networks.", "abstract": "Learning with a primary objective, such as softmax cross entropy for classification and sequence generation, has been the norm for training deep neural networks for years. Although being a widely-adopted approach, using cross entropy as the primary objective exploits mostly the information from the ground-truth class for maximizing data likelihood, and largely ignores information from the complement (incorrect) classes. We argue that, in addition to the primary objective, training also using a complement objective that leverages information from the complement classes can be effective in improving model performance. This motivates us to study a new training paradigm that maximizes the likelihood of the ground-truth class while neutralizing the probabilities of the complement classes. We conduct extensive experiments on multiple tasks ranging from computer vision to natural language understanding. The experimental results confirm that, compared to the conventional training with just one primary objective, training also with the complement objective further improves the performance of the state-of-the-art models across all tasks. 
In addition to the accuracy improvement, we also show that models trained with both primary and complement objectives are more robust to single-step adversarial attacks.\n", "keywords": "optimization;entropy;image recognition;natural language understanding;adversarial attacks;deep learning", "primary_area": "", "supplementary_material": "", "author": "Hao-Yun Chen;Pei-Hsin Wang;Chun-Hao Liu;Shih-Chieh Chang;Jia-Yu Pan;Yu-Ting Chen;Wei Wei;Da-Cheng Juan", "authorids": "haoyunchen@gapp.nthu.edu.tw;peihsin@gapp.nthu.edu.tw;newgod1992@gapp.nthu.edu.tw;scchang@cs.nthu.edu.tw;jypan@google.com;yutingchen@google.com;wewei@google.com;dacheng@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nchen2018complement,\ntitle={Complement Objective Training},\nauthor={Hao-Yun Chen and Pei-Hsin Wang and Chun-Hao Liu and Shih-Chieh Chang and Jia-Yu Pan and Yu-Ting Chen and Wei Wei and Da-Cheng Juan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyM7AiA5YX},\n}", "github": "[![github](/images/github_icon.svg) henry8527/COT](https://github.com/henry8527/COT)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;4;4", "wc_review": "180;1019;197", "wc_reply_reviewers": "132;0;0", "wc_reply_authors": "1022;1334;108", "reply_reviewers": "2;0;0", "reply_authors": "6;2;1", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 465.3333333333333, 391.56296499484677 ], "wc_reply_reviewers_avg": [ 44.0, 62.22539674441618 ], "wc_reply_authors_avg": [ 821.3333333333334, 520.236698265532 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.0, 2.160246899469287 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=63949908447902569&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HyM7AiA5YX", "pdf": "https://openreview.net/pdf?id=HyM7AiA5YX", "email": ";;;;;;;", "author_num": 8 }, { "id": "HyM8V2A9Km", "title": "ACTRCE: Augmenting Experience via Teacher\u2019s Advice", "track": "main", "status": "Reject", "tldr": "Combine language goal representation with hindsight experience replays.", "abstract": "Sparse reward is one of the most challenging problems in reinforcement learning (RL). Hindsight Experience Replay (HER) attempts to address this issue by converting a failure experience to a successful one by relabeling the goals. Despite its effectiveness, HER has limited applicability because it lacks a compact and universal goal representation. We present Augmenting experienCe via TeacheR's adviCE (ACTRCE), an efficient reinforcement learning technique that extends the HER framework using natural language as the goal representation. We first analyze the differences among goal representation, and show that ACTRCE can efficiently solve difficult reinforcement learning problems in challenging 3D navigation tasks, whereas HER with non-language goal representation failed to learn. 
We also show that with language goal representations, the agent can generalize to unseen instructions, and even generalize to instructions with unseen lexicons. We further demonstrate that it is crucial to use hindsight advice to solve challenging tasks, but we also found that a small amount of hindsight advice is sufficient for the learning to take off, showing the practical aspect of the method.", "keywords": "language goals;task generalization;hindsight experience replays;language grounding", "primary_area": "", "supplementary_material": "", "author": "Yuhuai Wu;Harris Chan;Jamie Kiros;Sanja Fidler;Jimmy Ba", "authorids": "ywu@cs.toronto.edu;hchan@cs.toronto.edu;kirosjamie@gmail.com;fidler@cs.toronto.edu;jba@cs.toronto.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nwu2019actrce,\ntitle={{ACTRCE}: Augmenting Experience via Teacher\u2019s Advice},\nauthor={Yuhuai Wu and Harris Chan and Jamie Kiros and Sanja Fidler and Jimmy Ba},\nyear={2019},\nurl={https://openreview.net/forum?id=HyM8V2A9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyM8V2A9Km", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;5", "wc_review": "541;421;239", "wc_reply_reviewers": "80;140;55", "wc_reply_authors": "2669;772;687", "reply_reviewers": "1;1;1", "reply_authors": "6;2;2", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 400.3333333333333, 124.1540262022228 ], "wc_reply_reviewers_avg": [ 91.66666666666667, 35.6682242650545 ], "wc_reply_authors_avg": [ 1376.0, 914.9473573198989 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z1T5L4jcpy4J:scholar.google.com/&scioq=ACTRCE:+Augmenting+Experience+via+Teacher%E2%80%99s+Advice&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HyMRUiC9YX", "title": "Exploring and Enhancing the Transferability of Adversarial Examples", "track": "main", "status": "Reject", "tldr": "", "abstract": " State-of-the-art deep neural networks are vulnerable to adversarial examples, formed by applying small but malicious perturbations to the original inputs. Moreover, the perturbations can \textit{transfer across models}: adversarial examples generated for a specific model will often mislead other unseen models. Consequently, the adversary can leverage it to attack deployed systems without any query, which severely hinders the application of deep learning, especially in safety-critical areas. In this work, we empirically study how two classes of factors might influence the transferability of adversarial examples. One is about model-specific factors, including network architecture, model capacity and test accuracy. The other is the local smoothness of the loss surface for constructing adversarial examples. 
Inspired by these understandings on the transferability of adversarial examples, we then propose a simple but effective strategy to enhance the transferability, whose effectiveness is confirmed by a variety of experiments on both CIFAR-10 and ImageNet datasets.", "keywords": "Deep learning;Adversarial example;Transferability;Smoothed gradient", "primary_area": "", "supplementary_material": "", "author": "Lei Wu;Zhanxing Zhu;Cheng Tai", "authorids": "leiwu@pku.edu.cn;zhanxing.zhu@pku.edu.cn;chengtai@pku.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwu2019exploring,\ntitle={Exploring and Enhancing the Transferability of Adversarial Examples},\nauthor={Lei Wu and Zhanxing Zhu and Cheng Tai},\nyear={2019},\nurl={https://openreview.net/forum?id=HyMRUiC9YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyMRUiC9YX", "pdf_size": 0, "rating": "4;6;6", "confidence": "2;3;3", "wc_review": "198;201;310", "wc_reply_reviewers": "78;0;0", "wc_reply_authors": "363;97;41", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 236.33333333333334, 52.10459565996928 ], "wc_reply_reviewers_avg": [ 26.0, 36.76955262170047 ], "wc_reply_authors_avg": [ 167.0, 140.4658914707292 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Mdft7McIM50J:scholar.google.com/&scioq=Exploring+and+Enhancing+the+Transferability+of+Adversarial+Examples&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HyMRaoAqKX", "title": "Implicit Autoencoders", "track": "main", "status": "Reject", "tldr": "We propose a generative autoencoder that can learn expressive posterior and conditional likelihood distributions using implicit distributions, and train the model using a new formulation of the ELBO.", "abstract": "In this paper, we describe the \"implicit autoencoder\" (IAE), a generative autoencoder in which both the generative path and the recognition path are parametrized by implicit distributions. We use two generative adversarial networks to define the reconstruction and the regularization cost functions of the implicit autoencoder, and derive the learning rules based on maximum-likelihood learning. Using implicit distributions allows us to learn more expressive posterior and conditional likelihood distributions for the autoencoder. Learning an expressive conditional likelihood distribution enables the latent code to only capture the abstract and high-level information of the data, while the remaining information is captured by the implicit conditional likelihood distribution. For example, we show that implicit autoencoders can disentangle the global and local information, and perform deterministic or stochastic reconstructions of the images. 
We further show that implicit autoencoders can disentangle discrete underlying factors of variation from the continuous factors in an unsupervised fashion, and perform clustering and semi-supervised learning.", "keywords": "Unsupervised Learning;Generative Models;Variational Inference;Generative Adversarial Networks.", "primary_area": "", "supplementary_material": "", "author": "Alireza Makhzani", "authorids": "a.makhzani@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nmakhzani2019implicit,\ntitle={Implicit Autoencoders},\nauthor={Alireza Makhzani},\nyear={2019},\nurl={https://openreview.net/forum?id=HyMRaoAqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyMRaoAqKX", "pdf_size": 0, "rating": "3;6;6", "confidence": "3;3;4", "wc_review": "1011;119;373", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "2046;169;341", "reply_reviewers": "0;0;0", "reply_authors": "3;2;2", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 501.0, 375.23681411432256 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 852.0, 847.2004878815088 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6820598437661316522&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HyMS8iRcK7", "title": "SEQUENCE MODELLING WITH AUTO-ADDRESSING AND RECURRENT MEMORY INTEGRATING NETWORKS", "track": "main", "status": "Reject", "tldr": "We propose a light-weight Memory-Augmented RNN (MARNN) for sequence modelling.", "abstract": "Processing sequential data with long-term dependencies and learning complex transitions are two major challenges in many deep learning applications. In this paper, we introduce a novel architecture, the Auto-addressing and Recurrent Memory Integrating Network (ARMIN), to address these issues. The ARMIN explicitly stores previous hidden states and recurrently integrates useful past states into the current time-step via an efficient memory addressing mechanism. Compared to existing memory networks, the ARMIN is more light-weight and inference-time efficient. Our network can be trained on small slices of long sequential data, which boosts its training speed. Experiments on various tasks demonstrate the efficiency of the ARMIN architecture. 
Codes and models will be available.", "keywords": "Memory Network;RNN;Sequence Modelling", "primary_area": "", "supplementary_material": "", "author": "Zhangheng Li;Jia-Xing Zhong;Jingjia Huang;Tao Zhang;Thomas Li;Ge Li", "authorids": "zhanghengli@pku.edu.cn;jxzhong@pku.edu.cn;jjhuang@pku.edu.cn;t_zhang@pku.edu.cn;thomasli@pkusz.edu.cn;geli@ece.pku.edu.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nli2019sequence,\ntitle={{SEQUENCE} {MODELLING} {WITH} {AUTO}-{ADDRESSING} {AND} {RECURRENT} {MEMORY} {INTEGRATING} {NETWORKS}},\nauthor={Zhangheng Li and Jia-Xing Zhong and Jingjia Huang and Tao Zhang and Thomas Li and Ge Li},\nyear={2019},\nurl={https://openreview.net/forum?id=HyMS8iRcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyMS8iRcK7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "wc_review": "428;190;363", "wc_reply_reviewers": "0;0;46", "wc_reply_authors": "643;342;753", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 327.0, 100.44235494385158 ], "wc_reply_reviewers_avg": [ 15.333333333333334, 21.684607956387456 ], "wc_reply_authors_avg": [ 579.3333333333334, 173.72455848906978 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8LVqkzYDjywJ:scholar.google.com/&scioq=SEQUENCE+MODELLING+WITH+AUTO-ADDRESSING+AND+RECURRENT+MEMORY+INTEGRATING+NETWORKS&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HyMnYiR9Y7", "title": "DOMAIN ADAPTATION VIA DISTRIBUTION AND REPRESENTATION MATCHING: A CASE STUDY ON TRAINING DATA SELECTION VIA REINFORCEMENT LEARNING", "track": "main", "status": "Reject", "tldr": "Training data selection via reinforcement learning", "abstract": "Supervised models suffer from domain shifting where distribution mismatch across domains greatly affect model performance. Particularly, noise scattered in each domain has played a crucial role in representing such distribution, especially in various natural language processing (NLP) tasks. In addressing this issue, training data selection (TDS) has been proven to be a prospective way to train supervised models with higher performance and efficiency. Following the TDS methodology, in this paper, we propose a general data selection framework with representation learning and distribution matching simultaneously for domain adaptation on neural models. In doing so, we formulate TDS as a novel selection process based on a learned distribution from the input data, which is produced by a trainable selection distribution generator (SDG) that is optimized by reinforcement learning (RL). Then, the model trained by the selected data not only predicts the target domain data in a specific task, but also provides input for the value function of the RL. Experiments are conducted on three typical NLP tasks, namely, part-of-speech tagging, dependency parsing, and sentiment analysis. 
Results demonstrate the validity and effectiveness of our approach.", "keywords": "domain adaptation;training data selection;reinforcement learning;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Miaofeng Liu;Yan Song;Hongbin Zou;Tong Zhang", "authorids": "water3er@gmail.com;clksong@gmail.com;hbzou@xdu.edu.cn;bradymzhang@tencent.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nliu2019domain,\ntitle={{DOMAIN} {ADAPTATION} {VIA} {DISTRIBUTION} {AND} {REPRESENTATION} {MATCHING}: A {CASE} {STUDY} {ON} {TRAINING} {DATA} {SELECTION} {VIA} {REINFORCEMENT} {LEARNING}},\nauthor={Miaofeng Liu and Yan Song and Hongbin Zou and Tong Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=HyMnYiR9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyMnYiR9Y7", "pdf_size": 0, "rating": "4;5;7", "confidence": "2;4;3", "wc_review": "632;523;416", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 523.6666666666666, 88.18289075678015 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.3273268353539886, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MQ3SNYWfGXoJ:scholar.google.com/&scioq=DOMAIN+ADAPTATION+VIA+DISTRIBUTION+AND+REPRESENTATION+MATCHING:+A+CASE+STUDY+ON+TRAINING+DATA+SELECTION+VIA+REINFORCEMENT+LEARNING&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "HyMuaiAqY7", "title": "Deli-Fisher GAN: Stable and Efficient Image Generation With Structured Latent Generative Space", "track": "main", "status": "Reject", "tldr": "This paper proposes a new Generative Adversarial Network that is more stable, more efficient, and produces better images than those of status-quo ", "abstract": "Generative Adversarial Networks (GANs) are powerful tools for realistic image generation. However, a major drawback of GANs is that they are especially hard to train, often requiring large amounts of data and long training time. In this paper we propose the Deli-Fisher GAN, a GAN that generates photo-realistic images by enforcing structure on the latent generative space using similar approaches in \\cite{deligan}. The structure of the latent space we consider in this paper is modeled as a mixture of Gaussians, whose parameters are learned in the training process. Furthermore, to improve stability and efficiency, we use the Fisher Integral Probability Metric as the divergence measure in our GAN model, instead of the Jensen-Shannon divergence. 
We show by experiments that the Deli-Fisher GAN performs better than DCGAN, WGAN, and the Fisher GAN as measured by inception score.", "keywords": "Generative Adversarial Networks;Structured Latent Space;Stable Training", "primary_area": "", "supplementary_material": "", "author": "Boli Fang;Chuck Jia;Miao Jiang;Dhawal Chaturvedi", "authorids": "bfang@iu.edu;jiac@iu.edu;miajiang@iu.edu;dhchat@iu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nfang2019delifisher,\ntitle={Deli-Fisher {GAN}: Stable and Efficient Image Generation With Structured Latent Generative Space},\nauthor={Boli Fang and Chuck Jia and Miao Jiang and Dhawal Chaturvedi},\nyear={2019},\nurl={https://openreview.net/forum?id=HyMuaiAqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyMuaiAqY7", "pdf_size": 0, "rating": "2;2;3", "confidence": "5;4;5", "wc_review": "108;59;251", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 139.33333333333334, 81.45482729027721 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:_y98ykxE8nkJ:scholar.google.com/&scioq=Deli-Fisher+GAN:+Stable+and+Efficient+Image+Generation+With+Structured+Latent+Generative+Space&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HyMxAi05Km", "title": "Dual Learning: Theoretical Study and Algorithmic Extensions", "track": "main", "status": "Reject", "tldr": "", "abstract": "Dual learning has been successfully applied in many machine learning applications, including machine translation, image-to-image transformation, etc. The high-level idea of dual learning is very intuitive: if we map an x from one domain to another and then map it back, we should recover the original x. Although its effectiveness has been empirically verified, theoretical understanding of dual learning is still missing. In this paper, we conduct a theoretical study to understand why and when dual learning can improve a mapping function. Based on the theoretical discoveries, we extend dual learning by introducing more related mappings and propose highly symmetric frameworks, cycle dual learning and multipath dual learning, in both of which we can leverage the feedback signals from additional domains to improve the qualities of the mappings. We prove that both cycle dual learning and multipath dual learning can boost the performance of standard dual learning under mild conditions. 
Experiments on WMT 14 English\u2194German and MultiUN English\u2194French translations verify our theoretical findings on dual learning, and the results on the translations among English, French, and Spanish of MultiUN demonstrate the efficacy of cycle dual learning and multipath dual learning.", "keywords": "machine translation;dual learning", "primary_area": "", "supplementary_material": "", "author": "Zhibing Zhao;Yingce Xia;Tao Qin;Tie-Yan Liu", "authorids": "zhaoz6@rpi.edu;yingce.xia@gmail.com;taoqin@microsoft.com;tyliu@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhao2019dual,\ntitle={Dual Learning: Theoretical Study and Algorithmic Extensions},\nauthor={Zhibing Zhao and Yingce Xia and Tao Qin and Tie-Yan Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=HyMxAi05Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyMxAi05Km", "pdf_size": 0, "rating": "2;5;6", "confidence": "4;3;3", "wc_review": "490;254;180", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.699673171197595 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 308.0, 132.19177987555304 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9707253433941508, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Mode Normalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/833", "id": "HyN-M2Rctm", "author_site": "Lucas Deecke, Iain Murray, Hakan Bilen", "tldr": "We present a novel normalization method for deep neural networks that is robust to multi-modalities in intermediate feature distributions.", "abstract": "Normalization methods are a central building block in the deep learning toolbox. They accelerate and stabilize training, while decreasing the dependence on manually tuned learning rate schedules. When learning from multi-modal distributions, the effectiveness of batch normalization (BN), arguably the most prominent normalization method, is reduced. As a remedy, we propose a more flexible approach: by extending the normalization to more than a single mean and variance, we detect modes of data on-the-fly, jointly normalizing samples that share common features. 
We demonstrate that our method outperforms BN and other widely used normalization techniques in several experiments, including single and multi-task datasets.", "keywords": "Deep Learning;Expert Models;Normalization;Computer Vision", "primary_area": "", "supplementary_material": "", "author": "Lucas Deecke;Iain Murray;Hakan Bilen", "authorids": "l.deecke@ed.ac.uk;i.murray@ed.ac.uk;hbilen@ed.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ndeecke2018mode,\ntitle={Mode Normalization},\nauthor={Lucas Deecke and Iain Murray and Hakan Bilen},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyN-M2Rctm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HyN-M2Rctm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "wc_review": "184;668;435", "wc_reply_reviewers": "0;470;0", "wc_reply_authors": "158;1223;306", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 429.0, 197.63771569886822 ], "wc_reply_reviewers_avg": [ 156.66666666666666, 221.5601247717849 ], "wc_reply_authors_avg": [ 562.3333333333334, 471.0529576267289 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10555295858855595157&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=HyN-M2Rctm", "pdf": "https://openreview.net/pdf?id=HyN-M2Rctm", "email": ";;", "author_num": 3 }, { "title": "Detecting Egregious Responses in Neural Sequence-to-sequence Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/861", "id": "HyNA5iRcFQ", "author_site": "Tianxing He, James R Glass", "tldr": "This paper aims to provide an empirical answer to the question of whether well-trained dialogue response model can output malicious responses.", "abstract": "In this work, we attempt to answer a critical question: whether there exists some input sequence that will cause a well-trained discrete-space neural network sequence-to-sequence (seq2seq) model to generate egregious outputs (aggressive, malicious, attacking, etc.). And if such inputs exist, how to find them efficiently. We adopt an empirical methodology, in which we first create lists of egregious output sequences, and then design a discrete optimization algorithm to find input sequences that will cause the model to generate them. Moreover, the optimization algorithm is enhanced for large vocabulary search and constrained to search for input sequences that are likely to be input by real-world users. In our experiments, we apply this approach to dialogue response generation models trained on three real-world dialogue data-sets: Ubuntu, Switchboard and OpenSubtitles, testing whether the model can generate malicious responses. 
We demonstrate that given the trigger inputs our algorithm finds, a significant number of malicious sentences are assigned large probability by the model, which reveals an undesirable consequence of standard seq2seq training. ", "keywords": "Deep Learning;Natural Language Processing;Adversarial Attacks;Dialogue Response Generation", "primary_area": "", "supplementary_material": "", "author": "Tianxing He;James Glass", "authorids": "tianxing@mit.edu;glass@mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nhe2018detecting,\ntitle={Detecting Egregious Responses in Neural Sequence-to-sequence Models},\nauthor={Tianxing He and James Glass},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyNA5iRcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;2", "wc_review": "371;644;183", "wc_reply_reviewers": "0;283;0", "wc_reply_authors": "416;1978;321", "reply_reviewers": "0;2;0", "reply_authors": "1;4;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 399.3333333333333, 189.26583303796687 ], "wc_reply_reviewers_avg": [ 94.33333333333333, 133.40747938386198 ], "wc_reply_authors_avg": [ 905.0, 759.7161750724191 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12410178054097232012&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HyNA5iRcFQ", "pdf": "https://openreview.net/pdf?id=HyNA5iRcFQ", "email": ";", "author_num": 2 }, { "id": "HyNbtiR9YX", "title": "Unsupervised Document Representation using Partition Word-Vectors Averaging", "track": "main", "status": "Reject", "tldr": "A simple unsupervised method for multi-sentence-document embedding using partition based word vectors averaging that achieve results comparable to sophisticated models.", "abstract": "Learning effective document-level representation is essential in many important NLP tasks such as document classification, summarization, etc. Recent research has shown that simple weighted averaging of word vectors is an effective way to represent sentences, often outperforming complicated seq2seq neural models in many tasks. While it is desirable to use the same method to represent documents as well, unfortunately, the effectiveness is lost when representing long documents involving multiple sentences. One reason for this degradation is due to the fact that a longer document is likely to contain words from many different themes (or topics), and hence creating a single vector while ignoring all the thematic structure is unlikely to yield an effective representation of the document. This problem is less acute in single sentences and other short text fragments where presence of a single theme/topic is most likely. To overcome this problem, in this paper we present PSIF, a partitioned word averaging model to represent long documents. P-SIF retains the simplicity of simple weighted word averaging while taking a document's thematic structure into account. 
In particular, P-SIF learns topic-specific vectors from a document and finally concatenates them all to represent the overall document. Through our experiments over multiple real-world datasets and tasks, we demonstrate PSIF's effectiveness compared to simple weighted averaging and many other state-of-the-art baselines. We also show that PSIF is particularly effective in representing long multi-sentence documents. We will release PSIF's embedding source code and data-sets for reproducing results.", "keywords": "Unsupervised Learning;Natural Language Processing;Representation Learning;Document Embedding", "primary_area": "", "supplementary_material": "", "author": "Vivek Gupta;Ankit Kumar Saw;Partha Pratim Talukdar;Praneeth Netrapalli", "authorids": "vgupta@cs.utah.edu;ankit.kgpian@gmail.com;ppt@iisc.ac.in;praneeth@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngupta2019unsupervised,\ntitle={Unsupervised Document Representation using Partition Word-Vectors Averaging},\nauthor={Vivek Gupta and Ankit Kumar Saw and Partha Pratim Talukdar and Praneeth Netrapalli},\nyear={2019},\nurl={https://openreview.net/forum?id=HyNbtiR9YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyNbtiR9YX", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;3;4", "wc_review": "228;302;718", "wc_reply_reviewers": "0;0;614", "wc_reply_authors": "772;664;1906", "reply_reviewers": "0;0;3", "reply_authors": "2;2;4", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 416.0, 215.672591366327 ], "wc_reply_reviewers_avg": [ 204.66666666666666, 289.44237576569344 ], "wc_reply_authors_avg": [ 1114.0, 561.7615152357805 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8077362532255109001&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HyNmRiCqtm", "title": "CDeepEx: Contrastive Deep Explanations", "track": "main", "status": "Reject", "tldr": "A method to answer \"why not class B?\" for explaining deep networks", "abstract": "We propose a method which can visually explain the classification decision of deep neural networks (DNNs). There are many proposed methods in machine learning and computer vision seeking to clarify the decision of machine learning black boxes, specifically DNNs. All of these methods try to gain insight into why the network \"chose class A\" as an answer. Humans, when searching for explanations, ask two types of questions. The first question is, \"Why did you choose this answer?\" The second question asks, \"Why did you not choose answer B over A?\" The previously proposed methods are either not able to provide the latter directly or efficiently.\n\nWe introduce a method capable of answering the second question both directly and efficiently. In this work, we limit the inputs to be images. In general, the proposed method generates explanations in the input space of any model capable of efficient evaluation and gradient evaluation. 
We provide results, showing the superiority of this approach for gaining insight into the inner representation of machine learning models.", "keywords": "Deep learning;Explanation;Network interpretation;Contrastive explanation", "primary_area": "", "supplementary_material": "", "author": "Amir Feghahati;Christian R. Shelton;Michael J. Pazzani;Kevin Tang", "authorids": "sfegh001@ucr.edu;cshelton@cs.ucr.edu;pazzani@ucr.edu;ktang012@ucr.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nfeghahati2019cdeepex,\ntitle={{CD}eepEx: Contrastive Deep Explanations},\nauthor={Amir Feghahati and Christian R. Shelton and Michael J. Pazzani and Kevin Tang},\nyear={2019},\nurl={https://openreview.net/forum?id=HyNmRiCqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyNmRiCqtm", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;5", "wc_review": "822;880;484", "wc_reply_reviewers": "451;338;0", "wc_reply_authors": "1183;444;586", "reply_reviewers": "2;1;0", "reply_authors": "4;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 728.6666666666666, 174.6183139179724 ], "wc_reply_reviewers_avg": [ 263.0, 191.60549748550187 ], "wc_reply_authors_avg": [ 737.6666666666666, 320.1898742239604 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2605791621865254906&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "HyVbhi0cYX", "title": "Complexity of Training ReLU Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we explore some basic questions on complexity of training Neural networks with ReLU activation function. We show that it is NP-hard to train a two-hidden layer feedforward ReLU neural network. If dimension d of the data is fixed then we show that there exists a polynomial time algorithm for the same training problem. We also show that if sufficient over-parameterization is provided in the first hidden layer of ReLU neural network then there is a polynomial time algorithm which finds weights such that output of the over-parameterized ReLU neural network matches with the output of the given data.", "keywords": "NP-hardness;ReLU activation;Two hidden layer networks", "primary_area": "", "supplementary_material": "", "author": "Digvijay Boob;Santanu S. Dey;Guanghui Lan", "authorids": "digvijaybb40@gatech.edu;santanu.dey@isye.gatech.edu;george.lan@isye.gatech.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nboob2019complexity,\ntitle={Complexity of Training Re{LU} Neural Networks},\nauthor={Digvijay Boob and Santanu S. 
Dey and Guanghui Lan},\nyear={2019},\nurl={https://openreview.net/forum?id=HyVbhi0cYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyVbhi0cYX", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;3;5", "wc_review": "269;101;428", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 266.0, 133.51404420509476 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13496973912537108906&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "HyVxPsC9tm", "title": "DynCNN: An Effective Dynamic Architecture on Convolutional Neural Network for Surveillance Videos", "track": "main", "status": "Reject", "tldr": "An optimizing architecture on CNN for surveillance videos with 75.7% reduction on FLOPs and 2.2 times improvement on FPS", "abstract": "Large-scale surveillance video analysis is becoming important with the development of intelligent cities. The heavy computational resources necessary for state-of-the-art deep learning models make real-time processing hard to implement. This paper exploits the high scene similarity generally present in surveillance videos and proposes dynamic convolution, which reuses the previous feature map to reduce the amount of computation. We tested the proposed method on 45 surveillance videos with various scenes. The experimental results show that dynamic convolution can reduce FLOPs by up to 75.7% while preserving the precision within 0.7% mAP. 
Furthermore, the dynamic convolution can enhance the processing time up to 2.2 times.", "keywords": "CNN optimization;Reduction on convolution calculation;dynamic convolution;surveillance video", "primary_area": "", "supplementary_material": "", "author": "De-Qin Gao;Ping-Chen Tsai;Shanq-Jang Ruan", "authorids": "b10113120@gmail.com;pctsainb@gmail.com;sjruan@mail.ntust.edu.tw", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngao2019dyncnn,\ntitle={Dyn{CNN}: An Effective Dynamic Architecture on Convolutional Neural Network for Surveillance Videos},\nauthor={De-Qin Gao and Ping-Chen Tsai and Shanq-Jang Ruan},\nyear={2019},\nurl={https://openreview.net/forum?id=HyVxPsC9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyVxPsC9tm", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;4", "wc_review": "126;396;274", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 265.3333333333333, 110.39726244593004 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2504976500388654022&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Hye-LiR5Y7", "title": "SOSELETO: A Unified Approach to Transfer Learning and Training with Noisy Labels", "track": "main", "status": "Reject", "tldr": "Learning with limited training data by exploiting \"helpful\" instances from a rich data source. ", "abstract": "We present SOSELETO (SOurce SELEction for Target Optimization), a new method for exploiting a source dataset to solve a classification problem on a target dataset. SOSELETO is based on the following simple intuition: some source examples are more informative than others for the target problem. To capture this intuition, source samples are each given weights; these weights are solved for jointly with the source and target classification problems via a bilevel optimization scheme. The target therefore gets to choose the source samples which are most informative for its own classification task. Furthermore, the bilevel nature of the optimization acts as a kind of regularization on the target, mitigating overfitting. 
SOSELETO may be applied to both classic transfer learning, as well as the problem of training on datasets with noisy labels; we show state of the art results on both of these problems.", "keywords": "transfer learning", "primary_area": "", "supplementary_material": "", "author": "Or Litany;Daniel Freedman", "authorids": "orlitany@gmail.com;danielfreedman@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlitany2019soseleto,\ntitle={{SOSELETO}: A Unified Approach to Transfer Learning and Training with Noisy Labels},\nauthor={Or Litany and Daniel Freedman},\nyear={2019},\nurl={https://openreview.net/forum?id=Hye-LiR5Y7},\n}", "github": "[![github](/images/github_icon.svg) orlitany/SOSELETO](https://github.com/orlitany/SOSELETO)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hye-LiR5Y7", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "wc_review": "368;200;608", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "555;307;798", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 392.0, 167.4275962916508 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 553.3333333333334, 200.45337501662465 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4140015565291541094&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "Hye64hA9tm", "title": "Measuring Density and Similarity of Task Relevant Information in Neural Representations", "track": "main", "status": "Reject", "tldr": "Measuring information density and cross-task similarity in neural models and its application in transfer learning.", "abstract": "Neural models achieve state-of-the-art performance due to their ability to extract salient features useful to downstream tasks. However, our understanding of how this task-relevant information is included in these networks is still incomplete. In this paper, we examine two questions (1) how densely is information included in extracted representations, and (2) how similar is the encoding of relevant information between related tasks. We propose metrics to measure information density and cross-task similarity, and perform an extensive analysis in the domain of natural language processing, using four varieties of sentence representation and 13 tasks. 
We also demonstrate how the proposed analysis tools can find immediate use in choosing tasks for transfer learning.", "keywords": "Neural Networks;Representation;Information density;Transfer Learning", "primary_area": "", "supplementary_material": "", "author": "Danish Pruthi;Mansi Gupta;Nitish Kumar Kulkarni;Graham Neubig;Eduard Hovy", "authorids": "ddanish@cs.cmu.edu;mansig1@cs.cmu.edu;nitishkk@andrew.cmu.edu;gneubig@cs.cmu.edu;hovy@cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\npruthi2019measuring,\ntitle={Measuring Density and Similarity of Task Relevant Information in Neural Representations},\nauthor={Danish Pruthi and Mansi Gupta and Nitish Kumar Kulkarni and Graham Neubig and Eduard Hovy},\nyear={2019},\nurl={https://openreview.net/forum?id=Hye64hA9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hye64hA9tm", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "wc_review": "1025;343;444", "wc_reply_reviewers": "0;0;52", "wc_reply_authors": "772;426;372", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 604.0, 300.5339692391971 ], "wc_reply_reviewers_avg": [ 17.333333333333332, 24.51303508113365 ], "wc_reply_authors_avg": [ 523.3333333333334, 177.21048376311023 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Hc7vlfnhnrIJ:scholar.google.com/&scioq=Measuring+Density+and+Similarity+of+Task+Relevant+Information+in+Neural+Representations&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "Hye6uoC9tm", "title": "Incremental Hierarchical Reinforcement Learning with Multitask LMDPs", "track": "main", "status": "Reject", "tldr": "We develop an agent capable of incrementally growing a hierarchical representation, and using its experience to date to improve exploration.", "abstract": "Exploration is a well known challenge in Reinforcement Learning. One principled way of overcoming this challenge is to find a hierarchical abstraction of the base problem and explore at these higher levels, rather than in the space of primitives. However, discovering a deep abstraction autonomously remains a largely unsolved problem, with practitioners typically hand-crafting these hierarchical control architectures. Recent work with multitask linear Markov decision processes, allows for the autonomous discovery of deep hierarchical abstractions, but operates exclusively in the offline setting. 
By extending this work, we develop an agent that is capable of incrementally growing a hierarchical representation, and using its experience to date to improve exploration.", "keywords": "Reinforcement learning;hierarchy;linear markov decision process;lmdl;subtask discovery;incremental", "primary_area": "", "supplementary_material": "", "author": "Adam C Earle;Andrew M Saxe;Benjamin Rosman", "authorids": "adamchristopherearle@gmail.com;andrew.saxe@psy.ox.ac.uk;benjros@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nearle2019incremental,\ntitle={Incremental Hierarchical Reinforcement Learning with Multitask {LMDP}s},\nauthor={Adam C Earle and Andrew M Saxe and Benjamin Rosman},\nyear={2019},\nurl={https://openreview.net/forum?id=Hye6uoC9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Hye6uoC9tm", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "wc_review": "369;264;638", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 423.6666666666667, 157.50202820563578 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zTAjuciP4gQJ:scholar.google.com/&scioq=Incremental+Hierarchical+Reinforcement+Learning+with+Multitask+LMDPs&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Learning Actionable Representations with Goal Conditioned Policies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/910", "id": "Hye9lnCct7", "author_site": "Dibya Ghosh, Abhishek Gupta, Sergey Levine", "tldr": "Learning state representations which capture factors necessary for control", "abstract": "Representation learning is a central challenge across a range of machine learning areas. In reinforcement learning, effective and functional representations have the potential to tremendously accelerate learning progress and solve more challenging problems. Most prior work on representation learning has focused on generative approaches, learning representations that capture all the underlying factors of variation in the observation space in a more disentangled or well-ordered manner. In this paper, we instead aim to learn functionally salient representations: representations that are not necessarily complete in terms of capturing all factors of variation in the observation space, but rather aim to capture those factors of variation that are important for decision making -- that are \"actionable\". These representations are aware of the dynamics of the environment, and capture only the elements of the observation that are necessary for decision making rather than all factors of variation, eliminating the need for explicit reconstruction. We show how these learned representations can be useful to improve exploration for sparse reward problems, to enable long horizon hierarchical reinforcement learning, and as a state representation for learning policies for downstream tasks. 
We evaluate our method on a number of simulated environments, and compare it to prior methods for representation learning, exploration, and hierarchical reinforcement learning.", "keywords": "Representation Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Dibya Ghosh;Abhishek Gupta;Sergey Levine", "authorids": "dibya.ghosh@berkeley.edu;abhigupta@berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nghosh2018learning,\ntitle={Learning Actionable Representations with Goal Conditioned Policies},\nauthor={Dibya Ghosh and Abhishek Gupta and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hye9lnCct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "wc_review": "239;1197;356", "wc_reply_reviewers": "96;349;0", "wc_reply_authors": "886;1383;875", "reply_reviewers": "1;1;0", "reply_authors": "2;3;2", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 597.3333333333334, 426.71015403380727 ], "wc_reply_reviewers_avg": [ 148.33333333333334, 147.20582717934627 ], "wc_reply_authors_avg": [ 1048.0, 236.92333499819443 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 150, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17735393609212194781&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Hye9lnCct7", "pdf": "https://openreview.net/pdf?id=Hye9lnCct7", "email": ";;", "author_num": 3 }, { "title": "Verification of Non-Linear Specifications for Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/878", "id": "HyeFAsRctQ", "author_site": "Chongli Qin, Krishnamurthy Dvijotham, Brendan ODonoghue, Rudy R Bunel, Robert Stanforth, Sven Gowal, Jonathan Uesato, Grzegorz Swirszcz, Pushmeet Kohli", "tldr": "", "abstract": "Prior work on neural network verification has focused on specifications that are linear functions of the output of the network, e.g., invariance of the classifier output under adversarial perturbations of the input. In this paper, we extend verification algorithms to be able to certify richer properties of neural networks. To do this we introduce the class of convex-relaxable specifications, which constitute nonlinear specifications that can be verified using a convex relaxation. We show that a number of important properties of interest can be modeled within this class, including conservation of energy in a learned dynamics model of a physical system; semantic consistency of a classifier's output labels under adversarial perturbations and bounding errors in a system that predicts the summation of handwritten digits. Our experimental evaluation shows that our method is able to effectively verify these specifications. Moreover, our evaluation exposes the failure modes in models which cannot be verified to satisfy these specifications. 
Thus, emphasizing the importance of training models not just to fit training data but also to be consistent with specifications.", "keywords": "Verification;Convex Optimization;Adversarial Robustness", "primary_area": "", "supplementary_material": "", "author": "Chongli Qin;Krishnamurthy (Dj) Dvijotham;Brendan O'Donoghue;Rudy Bunel;Robert Stanforth;Sven Gowal;Jonathan Uesato;Grzegorz Swirszcz;Pushmeet Kohli", "authorids": "chongliqin@google.com;dvij@google.com;bodonoghue@google.com;rbunel@google.com;stanforth@google.com;sgowal@google.com;juesato@google.com;swirszcz@google.com;pushmeet@google.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nqin2018verification,\ntitle={Verification of Non-Linear Specifications for Neural Networks},\nauthor={Chongli Qin and Krishnamurthy (Dj) Dvijotham and Brendan O'Donoghue and Rudy Bunel and Robert Stanforth and Sven Gowal and Jonathan Uesato and Grzegorz Swirszcz and Pushmeet Kohli},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyeFAsRctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;3;5", "wc_review": "164;641;617", "wc_reply_reviewers": "36;0;60", "wc_reply_authors": "685;1849;1036", "reply_reviewers": "1;0;1", "reply_authors": "1;3;5", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 474.0, 219.4219679065886 ], "wc_reply_reviewers_avg": [ 32.0, 24.657656011875904 ], "wc_reply_authors_avg": [ 1190.0, 487.518204788293 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17519355431971639254&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HyeFAsRctQ", "pdf": "https://openreview.net/pdf?id=HyeFAsRctQ", "email": ";;;;;;;;", "author_num": 9 }, { "title": "Generating Liquid Simulations with Deformation-aware Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/912", "id": "HyeGBj09Fm", "author_site": "Lukas Prantl, Boris Bonev, Nils Thuerey", "tldr": "Learning weighting and deformations of space-time data sets for highly efficient approximations of liquid behavior.", "abstract": "We propose a novel approach for deformation-aware neural networks that learn the weighting and synthesis of dense volumetric deformation fields. Our method specifically targets the space-time representation of physical surfaces from liquid simulations. Liquids exhibit highly complex, non-linear behavior under changing simulation conditions such as different initial conditions. Our algorithm captures these complex phenomena in two stages: a first neural network computes a weighting function for a set of pre-computed deformations, while a second network directly generates a deformation field for refining the surface. 
Key for successful training runs in this setting is a suitable loss function that encodes the effect of the deformations, and a robust calculation of the corresponding gradients. To demonstrate the effectiveness of our approach, we showcase our method with several complex examples of flowing liquids with topology changes. Our representation makes it possible to rapidly generate the desired implicit surfaces. We have implemented a mobile application to demonstrate that real-time interactions with complex liquid effects are possible with our approach.", "keywords": "deformation learning;spatial transformer networks;fluid simulation", "primary_area": "", "supplementary_material": "", "author": "Lukas Prantl;Boris Bonev;Nils Thuerey", "authorids": "lukas.prantl@tum.de;boris.bonev@tum.de;nils.thuerey@tum.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nprantl2018generating,\ntitle={Generating Liquid Simulations with Deformation-aware Neural Networks},\nauthor={Lukas Prantl and Boris Bonev and Nils Thuerey},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyeGBj09Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;3", "wc_review": "243;442;264", "wc_reply_reviewers": "0;28;0", "wc_reply_authors": "302;165;195", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 316.3333333333333, 89.27236725636638 ], "wc_reply_reviewers_avg": [ 9.333333333333334, 13.199326582148887 ], "wc_reply_authors_avg": [ 220.66666666666666, 58.80098260705815 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3505652891247833080&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HyeGBj09Fm", "pdf": "https://openreview.net/pdf?id=HyeGBj09Fm", "email": ";;", "author_num": 3 }, { "title": "DyRep: Learning Representations over Dynamic Graphs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/800", "id": "HyePrhR5KX", "author_site": "Rakshit Trivedi, Mehrdad Farajtabar, Prasenjeet Biswal, Hongyuan Zha", "tldr": "Models Representation Learning over dynamic graphs as latent hidden process bridging two observed processes of Topological Evolution of and Interactions on dynamic graphs.", "abstract": "Representation Learning over graph structured data has received significant attention recently due to its ubiquitous applicability. However, most advancements have been made in static graph settings while efforts for jointly learning dynamic of the graph and dynamic on the graph are still in an infant stage. Two fundamental questions arise in learning over dynamic graphs: (i) How to elegantly model dynamical processes over graphs? (ii) How to leverage such a model to effectively encode evolving graph information into low-dimensional representations? 
We present DyRep - a novel modeling framework for dynamic graphs that posits representation learning as a latent mediation process bridging two observed processes namely -- dynamics of the network (realized as topological evolution) and dynamics on the network (realized as activities between nodes). Concretely, we propose a two-time scale deep temporal point process model that captures the interleaved dynamics of the observed processes. This model is further parameterized by a temporal-attentive representation network that encodes temporally evolving structural information into node representations which in turn drives the nonlinear evolution of the observed graph dynamics. Our unified framework is trained using an efficient unsupervised procedure and has capability to generalize over unseen nodes. We demonstrate that DyRep outperforms state-of-the-art baselines for dynamic link prediction and time prediction tasks and present extensive qualitative insights into our framework.", "keywords": "Dynamic Graphs;Representation Learning;Dynamic Processes;Temporal Point Process;Attention;Latent Representation", "primary_area": "", "supplementary_material": "", "author": "Rakshit Trivedi;Mehrdad Farajtabar;Prasenjeet Biswal;Hongyuan Zha", "authorids": "rstrivedi@gatech.edu;farajtabar@google.com;bprasenjeet1108@gmail.com;zha@cc.gatech.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ntrivedi2018dyrep,\ntitle={DyRep: Learning Representations over Dynamic Graphs},\nauthor={Rakshit Trivedi and Mehrdad Farajtabar and Prasenjeet Biswal and Hongyuan Zha},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyePrhR5KX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HyePrhR5KX)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;5;4", "wc_review": "405;559;127", "wc_reply_reviewers": "0;470;0", "wc_reply_authors": "1367;2432;49", "reply_reviewers": "0;1;0", "reply_authors": "2;4;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 363.6666666666667, 178.76862762303182 ], "wc_reply_reviewers_avg": [ 156.66666666666666, 221.5601247717849 ], "wc_reply_authors_avg": [ 1282.6666666666667, 974.6816004327887 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 729, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7348553015191648875&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HyePrhR5KX", "pdf": "https://openreview.net/pdf?id=HyePrhR5KX", "email": ";;;", "author_num": 4 }, { "id": "HyeS73ActX", "title": "Multi-Objective Value Iteration with Parameterized Threshold-Based Safety Constraints", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider an environment with multiple reward functions. One of them represents goal achievement and the others represent instantaneous safety conditions. We consider a scenario where the safety rewards should always be above some thresholds. 
The thresholds are parameters with values that differ between users.\nWe efficiently compute a family of policies that cover all threshold-based constraints and maximize the goal achievement reward. We introduce a new parameterized threshold-based scalarization method of the reward vector that encodes our objective. We present novel data structures to store the value functions of the Bellman equation that allow their efficient computation using the value iteration algorithm. We present results for both discrete and continuous state spaces. ", "keywords": "reinforcement learning;Markov decision processes;safety constraints;multi-objective optimization;geometric analysis", "primary_area": "", "supplementary_material": "", "author": "Hussein Sibai;Sayan Mitra", "authorids": "sibai2@illinois.edu;mitras@illinois.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsibai2019multiobjective,\ntitle={Multi-Objective Value Iteration with Parameterized Threshold-Based Safety Constraints},\nauthor={Hussein Sibai and Sayan Mitra},\nyear={2019},\nurl={https://openreview.net/forum?id=HyeS73ActX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=HyeS73ActX", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;2;4", "wc_review": "334;98;496", "wc_reply_reviewers": "0;79;0", "wc_reply_authors": "272;22;261", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 309.3333333333333, 163.41630545600057 ], "wc_reply_reviewers_avg": [ 26.333333333333332, 37.2409571424915 ], "wc_reply_authors_avg": [ 185.0, 115.34585673818833 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8TbV39E2JZAJ:scholar.google.com/&scioq=Multi-Objective+Value+Iteration+with+Parameterized+Threshold-Based+Safety+Constraints&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HyeU1hRcFX", "title": "Unsupervised Conditional Generation using noise engineered mode matching GAN", "track": "main", "status": "Reject", "tldr": "A GAN model where an inversion mapping from the generated data space to an engineered latent space is learned such that properties of the data generating distribution are matched to those of the latent distribution.", "abstract": "Conditional generation refers to the process of sampling from an unknown distribution conditioned on semantics of the data. This can be achieved by augmenting the generative model with the desired semantic labels, albeit it is not straightforward in an unsupervised setting where the semantic label of every data sample is unknown. In this paper, we address this issue by proposing a method that can generate samples conditioned on the properties of a latent distribution engineered in accordance with a certain data prior. In particular, a latent space inversion network is trained in tandem with a generative adversarial network such that the modal properties of the latent space distribution are induced in the data generating distribution. 
We demonstrate that our model, despite being fully unsupervised, is effective in learning meaningful representations through its mode matching property. We validate our method on multiple unsupervised tasks such as conditional generation, dataset attribute discovery and inference using three real world image datasets namely MNIST, CIFAR-10 and CELEB-A and show that the results are comparable to the state-of-the-art methods. ", "keywords": "Noise engineered GAN;Latent space engineering;Mode matching;Unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Deepak Mishra;Prathosh AP;Aravind J;Prashant Pandey;Santanu Chaudhury", "authorids": "deemishra21@gmail.com;prathoshap@gmail.com;maxaravind@gmail.com;getprashant57@gmail.com;santanuc@ee.iitd.ac.in", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmishra2019unsupervised,\ntitle={Unsupervised Conditional Generation using noise engineered mode matching {GAN}},\nauthor={Deepak Mishra and Prathosh AP and Aravind J and Prashant Pandey and Santanu Chaudhury},\nyear={2019},\nurl={https://openreview.net/forum?id=HyeU1hRcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyeU1hRcFX", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;4", "wc_review": "295;218;234", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1180;1020;757", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 249.0, 33.1762967593833 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 985.6666666666666, 174.3871809763805 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4842967257332157463&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Trellis Networks for Sequence Modeling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/825", "id": "HyeVtoRqtQ", "author_site": "Shaojie Bai, Zico Kolter, Vladlen Koltun", "tldr": "Trellis networks are a new sequence modeling architecture that bridges recurrent and convolutional models and sets a new state of the art on word- and character-level language modeling.", "abstract": "We present trellis networks, a new architecture for sequence modeling. On the one hand, a trellis network is a temporal convolutional network with special structure, characterized by weight tying across depth and direct injection of the input into deep layers. On the other hand, we show that truncated recurrent networks are equivalent to trellis networks with special sparsity structure in their weight matrices. Thus trellis networks with general weight matrices generalize truncated recurrent networks. We leverage these connections to design high-performing trellis networks that absorb structural and algorithmic elements from both recurrent and convolutional models. 
Experiments demonstrate that trellis networks outperform the current state of the art methods on a variety of challenging benchmarks, including word-level language modeling and character-level language modeling tasks, and stress tests designed to evaluate long-term memory retention. The code is available at https://github.com/locuslab/trellisnet .", "keywords": "sequence modeling;language modeling;recurrent networks;convolutional networks;trellis networks", "primary_area": "", "supplementary_material": "", "author": "Shaojie Bai;J. Zico Kolter;Vladlen Koltun", "authorids": "shaojieb@cs.cmu.edu;zkolter@cs.cmu.edu;vkoltun@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbai2018trellis,\ntitle={Trellis Networks for Sequence Modeling},\nauthor={Shaojie Bai and J. Zico Kolter and Vladlen Koltun},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyeVtoRqtQ},\n}", "github": "[![github](/images/github_icon.svg) locuslab/trellisnet](https://github.com/locuslab/trellisnet)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;3", "wc_review": "279;326;96", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "656;415;139", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 233.66666666666666, 99.21805391269385 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 403.3333333333333, 211.22552455189268 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 181, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13782940196634240151&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HyeVtoRqtQ", "pdf": "https://openreview.net/pdf?id=HyeVtoRqtQ", "email": ";;", "author_num": 3 }, { "id": "Hyed4i05KX", "title": "Interpreting Layered Neural Networks via Hierarchical Modular Representation", "track": "main", "status": "Reject", "tldr": "A method for obtaining a hierarchical cluster structure of a trained layered neural network", "abstract": "Interpreting the prediction mechanism of complex models is currently one of the most important tasks in the machine learning field, especially with layered neural networks, which have achieved high predictive performance with various practical data sets. To reveal the global structure of a trained neural network in an interpretable way, a series of clustering methods have been proposed, which decompose the units into clusters according to the similarity of their inference roles. The main problems in these studies were that (1) we have no prior knowledge about the optimal resolution for the decomposition, or the appropriate number of clusters, and (2) there was no method with which to acquire knowledge about whether the outputs of each cluster have a positive or negative correlation with the input and output dimension values. \nIn this paper, to solve these problems, we propose a method for obtaining a hierarchical modular representation of a layered neural network. 
The application of a hierarchical clustering method to a trained network reveals a tree-structured relationship among hidden layer units, based on their feature vectors defined by their correlation with the input and output dimension values. ", "keywords": "interpretabile machine learning;neural network;hierarchical clustering", "primary_area": "", "supplementary_material": "", "author": "Chihiro Watanabe", "authorids": "watanabe.chihiro@lab.ntt.co.jp", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nwatanabe2019interpreting,\ntitle={Interpreting Layered Neural Networks via Hierarchical Modular Representation},\nauthor={Chihiro Watanabe},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyed4i05KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Hyed4i05KX", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;3", "wc_review": "432;485;96", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 337.6666666666667, 172.24852845686536 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14350196223967780908&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "HyefgnCqFm", "title": "Learning Partially Observed PDE Dynamics with Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Spatio-Temporal processes bear a central importance in many applied scientific fields. Generally, differential equations are used to describe these processes. In this work, we address the problem of learning spatio-temporal dynamics with neural networks when only partial information on the system's state is available. Taking inspiration from the dynamical system approach, we outline a general framework in which complex dynamics generated by families of differential equations can be learned in a principled way. Two models are derived from this framework. We demonstrate how they can be applied in practice by considering the problem of forecasting fluid flows. 
We show how the underlying equations fit into our formalism and evaluate our method by comparing with standard baselines.", "keywords": "deep learning;spatio-temporal dynamics;physical processes;differential equations;dynamical systems", "primary_area": "", "supplementary_material": "", "author": "Ibrahim Ayed;Emmanuel De B\u00e9zenac;Arthur Pajot;Patrick Gallinari", "authorids": "ayedibrahim@gmail.com;emmanuel.de-bezenac@lip6.fr;arthur.pajot@lip6.fr;patrick.gallinari@lip6.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nayed2019learning,\ntitle={Learning Partially Observed {PDE} Dynamics with Neural Networks},\nauthor={Ibrahim Ayed and Emmanuel De B\u00e9zenac and Arthur Pajot and Patrick Gallinari},\nyear={2019},\nurl={https://openreview.net/forum?id=HyefgnCqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyefgnCqFm", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;5;3", "wc_review": "261;129;501", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "940;735;1330", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 297.0, 153.98701243936125 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1001.6666666666666, 246.79050945195513 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13643773160045218084&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HyesB2RqFQ", "title": "Bridging HMMs and RNNs through Architectural Transformations", "track": "main", "status": "Withdraw", "tldr": "Are HMMs a special case of RNNs? We investigate a series of architectural transformations between HMMs and RNNs, both through theoretical derivations and empirical hybridization and provide new insights.", "abstract": "A distinct commonality between HMMs and RNNs is that they both learn hidden representations for sequential data. In addition, it has been noted that the backward computation of the Baum-Welch algorithm for HMMs is a special case of the back propagation algorithm used for neural networks (Eisner (2016)). Do these observations suggest that, despite their many apparent differences, HMMs are a special case of RNNs? In this paper, we investigate a series of architectural transformations between HMMs and RNNs, both through theoretical derivations and empirical hybridization, to answer this question. In particular, we investigate three key design factors\u2014independence assumptions between the hidden states and the observation, the placement of softmax, and the use of non-linearity\u2014in order to pin down their empirical effects. We present a comprehensive empirical study to provide insights on the interplay between expressivity and interpretability with respect to language modeling and parts-of-speech induction. 
", "keywords": "rnns;hmms;latent variable models;language modelling;interpretability;sequence modelling", "primary_area": "", "supplementary_material": "", "author": "Jan Buys;Yonatan Bisk;Yejin Choi", "authorids": "jbuys@cs.washington.edu;ybisk@yonatanbisk.com;yejin@cs.washington.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyesB2RqFQ", "pdf_size": 0, "rating": "3;5;5", "confidence": "3;4;4", "wc_review": "136;320;875", "wc_reply_reviewers": "0;47;486", "wc_reply_authors": "14;123;157", "reply_reviewers": "0;1;1", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 443.6666666666667, 314.11286011382737 ], "wc_reply_reviewers_avg": [ 177.66666666666666, 218.86728601800885 ], "wc_reply_authors_avg": [ 98.0, 60.99726769837044 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13334162303866012898&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "HyesW2C9YQ", "title": "I Know the Feeling: Learning to Converse with Empathy", "track": "main", "status": "Reject", "tldr": "We improve existing dialogue systems for responding to people sharing personal stories, incorporating emotion prediction representations and also release a new benchmark and dataset of empathetic dialogues.", "abstract": "Beyond understanding what is being discussed, human communication requires an awareness of what someone is feeling. One challenge for dialogue agents is recognizing feelings in the conversation partner and replying accordingly, a key communicative skill that is trivial for humans. Research in this area is made difficult by the paucity of suitable publicly available datasets both for emotion and dialogues. This work proposes a new task for empathetic dialogue generation and EmpatheticDialogues, a dataset of 25k conversations grounded in emotional situations to facilitate training and evaluating dialogue systems. Our experiments indicate that dialogue models that use our dataset are perceived to be more empathetic by human evaluators, while improving on other metrics as well (e.g. perceived relevance of responses, BLEU scores), compared to models merely trained on large-scale Internet conversation data. 
We also present empirical comparisons of several ways to improve the performance of a given model by leveraging existing models or datasets without requiring lengthy re-training of the full model.", "keywords": "dialogue generation;nlp applications;grounded text generation;contextual representation learning", "primary_area": "", "supplementary_material": "", "author": "Hannah Rashkin;Eric Michael Smith;Margaret Li;Y-Lan Boureau", "authorids": "hrashkin@cs.washington.edu;ems@fb.com;hadasah@gmail.com;ylan@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nrashkin2019i,\ntitle={I Know the Feeling: Learning to Converse with Empathy},\nauthor={Hannah Rashkin and Eric Michael Smith and Margaret Li and Y-Lan Boureau},\nyear={2019},\nurl={https://openreview.net/forum?id=HyesW2C9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyesW2C9YQ", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;3;4", "wc_review": "720;1413;371", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1110;1915;934", "reply_reviewers": "0;0;0", "reply_authors": "2;3;3", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 834.6666666666666, 433.0529862371219 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1319.6666666666667, 427.0521695947177 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.18898223650461363, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10586752738169454296&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HyevnsCqtQ", "title": "Integral Pruning on Activations and Weights for Efficient Neural Networks", "track": "main", "status": "Reject", "tldr": "This work advances DNN compression beyond the weights to the activations by integrating the activation pruning with the weight pruning. ", "abstract": "With the rapidly scaling up of deep neural networks (DNNs), extensive research studies on network model compression such as weight pruning have been performed for efficient deployment. This work aims to advance the compression beyond the weights to the activations of DNNs. We propose the Integral Pruning (IP) technique which integrates the activation pruning with the weight pruning. Through the learning on the different importance of neuron responses and connections, the generated network, namely IPnet, balances the sparsity between activations and weights and therefore further improves execution efficiency. The feasibility and effectiveness of IPnet are thoroughly evaluated through various network models with different activation functions and on different datasets. With <0.5% disturbance on the testing accuracy, IPnet saves 71.1% ~ 96.35% of computation cost, compared to the original dense models with up to 5.8x and 10x reductions in activation and weight numbers, respectively. 
", "keywords": "activation pruning;weight pruning;computation cost reduction;efficient DNNs", "primary_area": "", "supplementary_material": "", "author": "Qing Yang;Wei Wen;Zuoguan Wang;Yiran Chen;Hai Li", "authorids": "qing.yang21@duke.edu;wei.wen@duke.edu;zuoguan.wang@blacksesame.com;yiran.chen@duke.edu;hai.li@duke.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyang2019integral,\ntitle={Integral Pruning on Activations and Weights for Efficient Neural Networks},\nauthor={Qing Yang and Wei Wen and Zuoguan Wang and Yiran Chen and Hai Li},\nyear={2019},\nurl={https://openreview.net/forum?id=HyevnsCqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyevnsCqtQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;4", "wc_review": "184;920;168", "wc_reply_reviewers": "0;278;0", "wc_reply_authors": "527;611;352", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 424.0, 350.78578458464744 ], "wc_reply_reviewers_avg": [ 92.66666666666667, 131.0504567799068 ], "wc_reply_authors_avg": [ 496.6666666666667, 107.88986153583765 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5616844820052281280&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Hyewf3AqYX", "title": "A Frank-Wolfe Framework for Efficient and Effective Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Depending on how much information an adversary can access to, adversarial attacks can be classified as white-box attack and black-box attack. In both cases, optimization-based attack algorithms can achieve relatively low distortions and high attack success rates. However, they usually suffer from poor time and query complexities, thereby limiting their practical usefulness. In this work, we focus on the problem of developing efficient and effective optimization-based adversarial attack algorithms. In particular, we propose a novel adversarial attack framework for both white-box and black-box settings based on the non-convex Frank-Wolfe algorithm. We show in theory that the proposed attack algorithms are efficient with an $O(1/\\sqrt{T})$ convergence rate. The empirical results of attacking Inception V3 model and ResNet V2 model on the ImageNet dataset also verify the efficiency and effectiveness of the proposed algorithms. 
More specifically, our proposed algorithms attain the highest attack success rate in both white-box and black-box attacks among all baselines, and are more time and query efficient than the state-of-the-art.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jinghui Chen;Jinfeng Yi;Quanquan Gu", "authorids": "jc4zg@virginia.edu;yijinfeng@jd.com;qgu@cs.ucla.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchen2019a,\ntitle={A Frank-Wolfe Framework for Efficient and Effective Adversarial Attacks},\nauthor={Jinghui Chen and Jinfeng Yi and Quanquan Gu},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyewf3AqYX},\n}", "github": "[![github](/images/github_icon.svg) uclaml/Frank-Wolfe-AdvML](https://github.com/uclaml/Frank-Wolfe-AdvML) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=Hyewf3AqYX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Hyewf3AqYX", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "wc_review": "388;567;503", "wc_reply_reviewers": "148;63;22", "wc_reply_authors": "1046;606;534", "reply_reviewers": "1;1;1", "reply_authors": "4;3;2", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 486.0, 74.05853540724841 ], "wc_reply_reviewers_avg": [ 77.66666666666667, 52.4743323497837 ], "wc_reply_authors_avg": [ 728.6666666666666, 226.30559476562266 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16584329289973334129&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "title": "Scalable Unbalanced Optimal Transport using Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/841", "id": "HyexAiA5Fm", "author_site": "Karren Yang, Caroline Uhler", "tldr": "We propose new methodology for unbalanced optimal transport using generative adversarial networks.", "abstract": "Generative adversarial networks (GANs) are an expressive class of neural generative models with tremendous success in modeling high-dimensional continuous measures. In this paper, we present a scalable method for unbalanced optimal transport (OT) based on the generative-adversarial framework. We formulate unbalanced OT as a problem of simultaneously learning a transport map and a scaling factor that push a source measure to a target measure in a cost-optimal manner. We provide theoretical justification for this formulation, showing that it is closely related to an existing static formulation by Liero et al. (2018). We then propose an algorithm for solving this problem based on stochastic alternating gradient updates, similar in practice to GANs, and perform numerical experiments demonstrating how this methodology can be applied to population modeling.", "keywords": "unbalanced optimal transport;generative adversarial networks;population modeling", "primary_area": "", "supplementary_material": "", "author": "Karren D. 
Yang;Caroline Uhler", "authorids": "karren@mit.edu;cuhler@mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nyang2018scalable,\ntitle={Scalable Unbalanced Optimal Transport using Generative Adversarial Networks},\nauthor={Karren D. Yang and Caroline Uhler},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyexAiA5Fm},\n}", "github": "[![github](/images/github_icon.svg) uhlerlab/unbalanced_ot](https://github.com/uhlerlab/unbalanced_ot)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "wc_review": "598;741;848", "wc_reply_reviewers": "110;69;16", "wc_reply_authors": "537;451;1191", "reply_reviewers": "1;1;1", "reply_authors": "2;1;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 729.0, 102.41419172491021 ], "wc_reply_reviewers_avg": [ 65.0, 38.47943173523573 ], "wc_reply_authors_avg": [ 726.3333333333334, 330.4394380551786 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14112773597586866494&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HyexAiA5Fm", "pdf": "https://openreview.net/pdf?id=HyexAiA5Fm", "email": ";", "author_num": 2 }, { "id": "Hyffti0ctQ", "title": "PRUNING WITH HINTS: AN EFFICIENT FRAMEWORK FOR MODEL ACCELERATION", "track": "main", "status": "Reject", "tldr": "This is a work aiming for boosting all the existing pruning and mimic method.", "abstract": "In this paper, we propose an efficient framework to accelerate convolutional neural networks. We utilize two types of acceleration methods: pruning and hints. Pruning can reduce model size by removing channels of layers. Hints can improve the performance of student model by transferring knowledge from teacher model. We demonstrate that pruning and hints are complementary to each other. On one hand, hints can benefit pruning by maintaining similar feature representations. On the other hand, the model pruned from teacher networks is a good initialization for student model, which increases the transferability between two networks. Our approach performs pruning stage and hints stage iteratively to further improve the\nperformance. Furthermore, we propose an algorithm to reconstruct the parameters of hints layer and make the pruned model more suitable for hints. Experiments were conducted on various tasks including classification and pose estimation. 
Results on CIFAR-10, ImageNet and COCO demonstrate the generalization and superiority of our framework.", "keywords": "model acceleration;mimic;knowledge distillation;channel pruning", "primary_area": "", "supplementary_material": "", "author": "Wei Gao;Yi Wei;Quanquan Li;Hongwei Qin;Wanli Ouyang;Junjie Yan", "authorids": "weigao1996@outlook.com;wei-y15@mails.tsinghua.edu.cn;liquanquan@sensetime.com;qinghongwei@sensetime.com;wanli.ouyang@sydney.edu.cn;yanjunjie@outlook.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ngao2019pruning,\ntitle={{PRUNING} {WITH} {HINTS}: {AN} {EFFICIENT} {FRAMEWORK} {FOR} {MODEL} {ACCELERATION}},\nauthor={Wei Gao and Yi Wei and Quanquan Li and Hongwei Qin and Wanli Ouyang and Junjie Yan},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyffti0ctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=Hyffti0ctQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;4", "wc_review": "221;227;128", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 192.0, 45.32107677449864 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3247635956728857761&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Hyfg5o0qtm", "title": "Temporal Gaussian Mixture Layer for Videos", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce a new convolutional layer named the Temporal Gaussian Mixture (TGM) layer and present how it can be used to efficiently capture longer-term temporal information in continuous activity videos. The TGM layer is a temporal convolutional layer governed by a much smaller set of parameters (e.g., location/variance of Gaussians) that are fully differentiable. We present our fully convolutional video models with multiple TGM layers for activity detection. The experiments on multiple datasets including Charades and MultiTHUMOS confirm the effectiveness of TGM layers, outperforming the state-of-the-arts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "AJ Piergiovanni;Michael S. Ryoo", "authorids": "ajpiergi@indiana.edu;mryoo@indiana.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npiergiovanni2019temporal,\ntitle={Temporal Gaussian Mixture Layer for Videos},\nauthor={AJ Piergiovanni and Michael S. 
Ryoo},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyfg5o0qtm},\n}", "github": "[![github](/images/github_icon.svg) piergiaj/tgm-icml19](https://github.com/piergiaj/tgm-icml19)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Hyfg5o0qtm", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;3;5", "wc_review": "855;230;205", "wc_reply_reviewers": "314;0;0", "wc_reply_authors": "869;353;269", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 430.0, 300.6936425444786 ], "wc_reply_reviewers_avg": [ 104.66666666666667, 148.02101952838393 ], "wc_reply_authors_avg": [ 497.0, 265.26967410542807 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7515216755463628280&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "title": "Solving the Rubik's Cube with Approximate Policy Iteration", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1094", "id": "Hyfn2jCcKm", "author_site": "Stephen McAleer, Forest Agostinelli, Alexander K Shmakov, Pierre Baldi", "tldr": "We solve the Rubik's Cube with pure reinforcement learning", "abstract": "Recently, Approximate Policy Iteration (API) algorithms have achieved super-human proficiency in two-player zero-sum games such as Go, Chess, and Shogi without human data. These API algorithms iterate between two policies: a slow policy (tree search), and a fast policy (a neural network). In these two-player games, a reward is always received at the end of the game. However, the Rubik\u2019s Cube has only a single solved state, and episodes are not guaranteed to terminate. This poses a major problem for these API algorithms since they rely on the reward received at the end of the game. We introduce Autodidactic Iteration: an API algorithm that overcomes the problem of sparse rewards by training on a distribution of states that allows the reward to propagate from the goal state to states farther away. Autodidactic Iteration is able to learn how to solve the Rubik\u2019s Cube and the 15-puzzle without relying on human data. 
Our algorithm is able to solve 100% of randomly scrambled cubes while achieving a median solve length of 30 moves \u2014 less than or equal to solvers that employ human domain knowledge.", "keywords": "reinforcement learning;Rubik's Cube;approximate policy iteration;deep learning;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Stephen McAleer;Forest Agostinelli;Alexander Shmakov;Pierre Baldi", "authorids": "smcaleer@uci.edu;fagostin@uci.edu;ashmakov@uci.edu;pfbaldi@ics.uci.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmcaleer2018solving,\ntitle={Solving the Rubik's Cube with Approximate Policy Iteration},\nauthor={Stephen McAleer and Forest Agostinelli and Alexander Shmakov and Pierre Baldi},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyfn2jCcKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;4", "wc_review": "422;249;256", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "237;506;295", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 309.0, 79.95415352979899 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 346.0, 115.58835004734112 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17384475801236234336&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Hyfn2jCcKm", "pdf": "https://openreview.net/pdf?id=Hyfn2jCcKm", "email": ";;;", "author_num": 4 }, { "id": "HyfyN30qt7", "title": "NICE: noise injection and clamping estimation for neural network quantization", "track": "main", "status": "Reject", "tldr": "Combine noise injection, gradual quantization and activation clamping learning to achieve state-of-the-art 3,4 and 5 bit quantization", "abstract": "Convolutional Neural Networks (CNN) are very popular in many fields including computer vision, speech recognition, natural language processing, to name a few. Though deep learning leads to groundbreaking performance in these domains, the networks used are very demanding computationally and are far from real-time even on a GPU, which is not power efficient and therefore does not suit low power systems such as mobile devices. To overcome this challenge, some solutions have been proposed for quantizing the weights and activations of these networks, which accelerate the runtime significantly. Yet, this acceleration comes at the cost of a larger error. The NICE method proposed in this work trains quantized neural networks by noise injection and a learned clamping, which improve the accuracy. This leads to state-of-the-art results on various regression and classification tasks, e.g., ImageNet classification with architectures such as ResNet-18/34/50 with as low as 3-bit weights and 3-bit activations. 
We implement the proposed solution on an FPGA to demonstrate its applicability for low power real-time applications.", "keywords": "Efficient inference;Hardware-efficient model architectures;Quantization", "primary_area": "", "supplementary_material": "", "author": "Chaim Baskin;Natan Liss;Yoav Chai;Evgenii Zheltonozhskii;Eli Schwartz;Raja Girayes;Avi Mendelson;Alexander M.Bronstein", "authorids": "chaimbaskin@cs.technion.ac.il;lissnatan@campus.technion.ac.il;yoavchai1@mail.tau.ac.il;evgeniizh@campus.technion.ac.il;eli.shw@gmail.com;raja@tauex.tau.ac.il;avi.mendelson@tce.technion.ac.il;bron@cs.technion.ac.il", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nbaskin2019nice,\ntitle={{NICE}: noise injection and clamping estimation for neural network quantization},\nauthor={Chaim Baskin and Natan Liss and Yoav Chai and Evgenii Zheltonozhskii and Eli Schwartz and Raja Girayes and Avi Mendelson and Alexander M.Bronstein},\nyear={2019},\nurl={https://openreview.net/forum?id=HyfyN30qt7},\n}", "github": "[![github](/images/github_icon.svg) Lancer555/NICE](https://github.com/Lancer555/NICE)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyfyN30qt7", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;4;3", "wc_review": "641;330;493", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 488.0, 127.01443487520096 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15749868443757562507&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13 }, { "title": "Variance Reduction for Reinforcement Learning in Input-Driven Environments", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1025", "id": "Hyg1G2AqtQ", "author_site": "Hongzi Mao, Shaileshh Bojja Venkatakrishnan, Malte Schwarzkopf, Mohammad Alizadeh", "tldr": "For environments dictated partially by external input processes, we derive an input-dependent baseline that provably reduces the variance for policy gradient methods and improves the policy performance in a wide range of RL tasks.", "abstract": "We consider reinforcement learning in input-driven environments, where an exogenous, stochastic input process affects the dynamics of the system. Input processes arise in many applications, including queuing systems, robotics control with disturbances, and object tracking. Since the state dynamics and rewards depend on the input process, the state alone provides limited information for the expected future returns. Therefore, policy gradient methods with standard state-dependent baselines suffer high variance during training. We derive a bias-free, input-dependent baseline to reduce this variance, and analytically show its benefits over state-dependent baselines. We then propose a meta-learning approach to overcome the complexity of learning a baseline that depends on a long sequence of inputs. 
Our experimental results show that across environments from queuing systems, computer networks, and MuJoCo robotic locomotion, input-dependent baselines consistently improve training stability and result in better eventual policies.", "keywords": "reinforcement learning;policy gradient;input-driven environments;variance reduction;baseline", "primary_area": "", "supplementary_material": "", "author": "Hongzi Mao;Shaileshh Bojja Venkatakrishnan;Malte Schwarzkopf;Mohammad Alizadeh", "authorids": "hongzi@csail.mit.edu;bjjvnkt@csail.mit.edu;malte@csail.mit.edu;alizadeh@csail.mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmao2018variance,\ntitle={Variance Reduction for Reinforcement Learning in Input-Driven Environments},\nauthor={Hongzi Mao and Shaileshh Bojja Venkatakrishnan and Malte Schwarzkopf and Mohammad Alizadeh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyg1G2AqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer2", "pdf_size": 0, "rating": "6;7;9", "confidence": "4;4;4", "wc_review": "675;526;274", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "798;1246;75", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 491.6666666666667, 165.49790196723205 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 706.3333333333334, 482.4329544667896 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11915919798056236304&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=Hyg1G2AqtQ", "pdf": "https://openreview.net/pdf?id=Hyg1G2AqtQ", "email": ";;;", "author_num": 4 }, { "id": "Hyg1Ls0cKQ", "title": "Learning Latent Semantic Representation from Pre-defined Generative Model", "track": "main", "status": "Reject", "tldr": "We propose a generative model that not only produces data with desired features from the pre-defined latent space but also fully understands the features of the data to create characteristics that are not in the dataset.", "abstract": "Learning representations of data is an important issue in machine learning. Though GAN has led to significant improvements in the data representations, it still has several problems such as unstable training, hidden manifold of data, and huge computational overhead. GAN tends to produce the data simply without any information about the manifold of the data, which hinders from controlling desired features to generate. Moreover, most of GAN\u2019s have a large size of manifold, resulting in poor scalability. In this paper, we propose a novel GAN to control the latent semantic representation, called LSC-GAN, which allows us to produce desired data to generate and learns a representation of the data efficiently. Unlike the conventional GAN models with hidden distribution of latent space, we define the distributions explicitly in advance that are trained to generate the data based on the corresponding features by inputting the latent variables that follow the distribution. 
As the larger scale of latent space caused by deploying various distributions in one latent space makes training unstable while maintaining the dimension of latent space, we need to separate the process of defining the distributions explicitly and operation of generation. We prove that a VAE is proper for the former and modify a loss function of VAE to map the data into the pre-defined latent space so as to locate the reconstructed data as close to the input data according to its characteristics. Moreover, we add the KL divergence to the loss function of LSC-GAN to include this process. The decoder of VAE, which generates the data with the corresponding features from the pre-defined latent space, is used as the generator of the LSC-GAN. Several experiments on the CelebA dataset are conducted to verify the usefulness of the proposed method to generate desired data stably and efficiently, achieving a high compression ratio that can hold about 24 pixels of information in each dimension of latent space. Besides, our model learns the reverse of features such as not laughing (rather frowning) only with data of ordinary and smiling facial expression.", "keywords": "Latent space;Generative adversarial network;variational autoencoder;conditioned generation", "primary_area": "", "supplementary_material": "", "author": "Jin-Young Kim;Sung-Bae Cho", "authorids": "seago0828@yonsei.ac.kr;sbcho@yonsei.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkim2019learning,\ntitle={Learning Latent Semantic Representation from Pre-defined Generative Model},\nauthor={Jin-Young Kim and Sung-Bae Cho},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyg1Ls0cKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hyg1Ls0cKQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;3;2", "wc_review": "350;250;911", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 503.6666666666667, 290.90701072488594 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:wBmSZIGDLLYJ:scholar.google.com/&scioq=Learning+Latent+Semantic+Representation+from+Pre-defined+Generative+Model&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "Hyg74h05tX", "title": "Flow++: Improving Flow-Based Generative Models with Variational Dequantization and Architecture Design", "track": "main", "status": "Reject", "tldr": "Improved training of current flow-based generative models (Glow and RealNVP) on density estimation benchmarks", "abstract": "Flow-based generative models are powerful exact likelihood models with efficient sampling and inference. \nDespite their computational efficiency, flow-based models generally have much worse density modeling performance compared to state-of-the-art autoregressive models. 
In this paper, we investigate and improve upon three limiting design choices employed by flow-based models in prior work: the use of uniform noise for dequantization, the use of inexpressive affine flows, and the use of purely convolutional conditioning networks in coupling layers. Based on our findings, we propose Flow++, a new flow-based model that is now the state-of-the-art non-autoregressive model for unconditional density estimation on standard image benchmarks. Our work has begun to close the significant performance gap that has so far existed between autoregressive models and flow-based models.", "keywords": "Deep Generative Models;Normalizing Flows;RealNVP;Density Estimation", "primary_area": "", "supplementary_material": "", "author": "Jonathan Ho;Xi Chen;Aravind Srinivas;Yan Duan;Pieter Abbeel", "authorids": "jonathanho@berkeley.edu;peter@covariant.ai;aravind_srinivas@berkeley.edu;dementrock@gmail.com;pabbeel@cs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nho2019flow,\ntitle={Flow++: Improving Flow-Based Generative Models with Variational Dequantization and Architecture Design },\nauthor={Jonathan Ho and Xi Chen and Aravind Srinivas and Yan Duan and Pieter Abbeel},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyg74h05tX},\n}", "github": "[![github](/images/github_icon.svg) aravind0706/flowpp](https://github.com/aravind0706/flowpp) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=Hyg74h05tX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Hyg74h05tX", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;3;4", "wc_review": "428;275;446", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "208;24;277", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 383.0, 76.72027111526653 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 169.66666666666666, 106.78431012507825 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 549, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7151289546461544772&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "title": "Transferring Knowledge across Learning Processes", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/771", "id": "HygBZnRctX", "author_site": "Sebastian Flennerhag, Pablo Moreno, Neil D Lawrence, Andreas Damianou", "tldr": "We propose Leap, a framework that transfers knowledge across learning processes by minimizing the expected distance the training process travels on a task's loss surface.", "abstract": "In complex transfer learning scenarios new tasks might not be tightly linked to previous tasks. Approaches that transfer information contained only in the final parameters of a source model will therefore struggle. Instead, transfer learning at a higher level of abstraction is needed. We propose Leap, a framework that achieves this by transferring knowledge across learning processes. 
We associate each task with a manifold on which the training process travels from initialization to final parameters and construct a meta-learning objective that minimizes the expected length of this path. Our framework leverages only information obtained during training and can be computed on the fly at negligible cost. We demonstrate that our framework outperforms competing methods, both in meta-learning and transfer learning, on a set of computer vision tasks. Finally, we demonstrate that Leap can transfer knowledge across learning processes in demanding reinforcement learning environments (Atari) that involve millions of gradient steps.", "keywords": "meta-learning;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Sebastian Flennerhag;Pablo G. Moreno;Neil D. Lawrence;Andreas Damianou", "authorids": "sflennerhag@turing.ac.uk;morepabl@amazon.com;lawrennd@amazon.com;damianou@amazon.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nflennerhag2018transferring,\ntitle={Transferring Knowledge across Learning Processes},\nauthor={Sebastian Flennerhag and Pablo Garcia Moreno and Neil Lawrence and Andreas Damianou},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HygBZnRctX},\n}", "github": "[![github](/images/github_icon.svg) amzn/xfer](https://github.com/amzn/xfer) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=HygBZnRctX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;8;8", "confidence": "4;3;4", "wc_review": "165;474;2084", "wc_reply_reviewers": "0;189;991", "wc_reply_authors": "343;947;1300", "reply_reviewers": "0;1;2", "reply_authors": "2;3;3", "rating_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 907.6666666666666, 841.3046746307521 ], "wc_reply_reviewers_avg": [ 393.3333333333333, 429.60007241878145 ], "wc_reply_authors_avg": [ 863.3333333333334, 395.1475111342036 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12789436144351549005&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HygBZnRctX", "pdf": "https://openreview.net/pdf?id=HygBZnRctX", "email": ";;;", "author_num": 4 }, { "title": "Model-Predictive Policy Learning with Uncertainty Regularization for Driving in Dense Traffic", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1121", "id": "HygQBn0cYm", "author_site": "Mikael Henaff, Alfredo Canziani, Yann LeCun", "tldr": "A model-based RL approach which uses a differentiable uncertainty penalty to learn driving policies from purely observational data.", "abstract": " Learning a policy using only observational data is challenging because the distribution of states it induces at execution time may differ from the distribution observed during training. 
In this work, we propose to train a policy while explicitly penalizing the mismatch between these two distributions over a fixed time horizon. We do this by using a learned model of the environment dynamics which is unrolled for multiple time steps, and training a policy network to minimize a differentiable cost over this rolled-out trajectory. This cost contains two terms: a policy cost which represents the objective the policy seeks to optimize, and an uncertainty cost which represents its divergence from the states it is trained on. We propose to measure this second cost by using the uncertainty of the dynamics model about its own predictions, using recent ideas from uncertainty estimation for deep networks. We evaluate our approach using a large-scale observational dataset of driving behavior recorded from traffic cameras, and show that we are able to learn effective driving policies from purely observational data, with no environment interaction. ", "keywords": "model-based reinforcement learning;stochastic video prediction;autonomous driving", "primary_area": "", "supplementary_material": "", "author": "Mikael Henaff;Alfredo Canziani;Yann LeCun", "authorids": "mbh305@nyu.edu;canziani@nyu.edu;yann@cs.nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhenaff2018modelpredictive,\ntitle={Model-Predictive Policy Learning with Uncertainty Regularization for Driving in Dense Traffic},\nauthor={Mikael Henaff and Alfredo Canziani and Yann LeCun},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HygQBn0cYm},\n}", "github": "[![github](/images/github_icon.svg) Atcold/pytorch-PPUU](https://github.com/Atcold/pytorch-PPUU)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;5", "wc_review": "560;418;428", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "969;492;412", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 468.6666666666667, 64.71132478597201 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 624.3333333333334, 245.89473809380758 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 159, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5048415252406845644&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HygQBn0cYm", "pdf": "https://openreview.net/pdf?id=HygQBn0cYm", "email": ";;", "author_num": 3 }, { "id": "HygQro05KX", "title": "$A^*$ sampling with probability matching", "track": "main", "status": "Reject", "tldr": "", "abstract": "Probabilistic methods often need to draw samples from a nontrivial distribution. $A^*$ sampling is a nice algorithm by building upon a top-down construction of a Gumbel process, where a large state space is divided into subsets and at each round $A^*$ sampling selects a subset to process. However, the selection rule depends on a bound function, which can be intractable. Moreover, we show that such a selection criterion can be inefficient. 
This paper aims to improve $A^*$ sampling by addressing these issues. To design a suitable selection rule, we apply \\emph{Probability Matching}, a widely used method for decision making, to $A^*$ sampling. We provide insights into the relationship between $A^*$ sampling and probability matching by analyzing a nontrivial special case in which the state space is partitioned into two subsets. We show that in this case probability matching is optimal within a constant gap. Furthermore, as directly applying probability matching to $A^*$ sampling is time consuming, we design an approximate version based on Monte-Carlo estimators. We also present an efficient implementation by leveraging special properties of Gumbel distributions and well-designed balanced trees. Empirical results show that our method saves a significant amount of computational resources on suboptimal regions compared with $A^*$ sampling.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yichi Zhou;Jun Zhu", "authorids": "vofhqn@gmail.com;dcszj@mail.tsinghua.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhou2019a,\ntitle={$A^*$ sampling with probability matching},\nauthor={Yichi Zhou and Jun Zhu},\nyear={2019},\nurl={https://openreview.net/forum?id=HygQro05KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HygQro05KX", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;5;2", "wc_review": "316;553;82", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "wc_review_avg": [ 317.0, 192.28624495787523 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:v0bjHCopwyAJ:scholar.google.com/&scioq=%24A%5E*%24+sampling+with+probability+matching&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "HygS7n0cFQ", "title": "Fast Exploration with Simplified Models and Approximately Optimistic Planning in Model Based Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We studied exploration with imperfect planning and used object representation to learn simple models and introduced a new sample efficient RL algorithm that achieves state of the art results on Pitfall!", "abstract": "Humans learn to play video games significantly faster than the state-of-the-art reinforcement learning (RL) algorithms. People seem to build simple models that are easy to learn to support planning and strategic exploration. Inspired by this, we investigate two issues in leveraging model-based RL for sample efficiency. First we investigate how to perform strategic exploration when exact planning is not feasible and empirically show that optimistic Monte Carlo Tree Search outperforms posterior sampling methods. Second we show how to learn simple deterministic models to support fast learning using object representation. 
We illustrate the benefit of these ideas by introducing a novel algorithm, Strategic Object Oriented Reinforcement Learning (SOORL), that outperforms state-of-the-art algorithms in the game of Pitfall! in less than 50 episodes.", "keywords": "Reinforcement Learning;Strategic Exploration;Model Based Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Ramtin Keramati;Jay Whang;Patrick Cho;Emma Brunskill", "authorids": "keramati@stanford.edu;jaywhang@cs.stanford.edu;patcho@cs.stanford.edu;ebrun@cs.stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkeramati2019fast,\ntitle={Fast Exploration with Simplified Models and Approximately Optimistic Planning in Model Based Reinforcement Learning},\nauthor={Ramtin Keramati and Jay Whang and Patrick Cho and Emma Brunskill},\nyear={2019},\nurl={https://openreview.net/forum?id=HygS7n0cFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=HygS7n0cFQ", "pdf_size": 0, "rating": "4;5", "confidence": "4;4", "wc_review": "518;778", "wc_reply_reviewers": "0;0", "wc_reply_authors": "0;0", "reply_reviewers": "0;0", "reply_authors": "0;0", "rating_avg": [ 4.5, 0.5 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 648.0, 130.0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 3, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7732174050908592802&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "HygT9oRqFX", "title": "MixFeat: Mix Feature in Latent Space Learns Discriminative Space", "track": "main", "status": "Reject", "tldr": "We provide a novel method named MixFeat, which directly makes the latent space discriminative.", "abstract": "Deep learning methods perform well in various tasks. However, the over-fitting problem, which causes the performance to decrease for unknown data, remains. We hence propose a method named MixFeat that directly creates latent spaces in a network that can distinguish classes. MixFeat mixes two feature maps in each latent space in the network and uses unmixed labels for learning. We discuss the difference between a method that mixes only features (MixFeat) and a method that mixes both features and labels (mixup and its family). Mixing features repeatedly is effective in expanding feature diversity, but mixing labels repeatedly makes learning difficult. MixFeat makes it possible to obtain the advantages of repeated mixing by mixing only features. We report improved results obtained using existing network models with MixFeat on CIFAR-10/100 datasets. In addition, we show that MixFeat effectively reduces the over-fitting problem even when the training dataset is small or contains errors. 
MixFeat is easy to implement and can be added to various network models without additional computational cost in the inference phase.", "keywords": "regularization;generalization;image classification;latent space;feature learning", "primary_area": "", "supplementary_material": "", "author": "Yoichi Yaguchi;Fumiyuki Shiratani;Hidekazu Iwaki", "authorids": "yoichi_yaguchi@ot.olympus.co.jp;f_shiratani@ot.olympus.co.jp;h_iwaki@ot.olympus.co.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyaguchi2019mixfeat,\ntitle={MixFeat: Mix Feature in Latent Space Learns Discriminative Space},\nauthor={Yoichi Yaguchi and Fumiyuki Shiratani and Hidekazu Iwaki},\nyear={2019},\nurl={https://openreview.net/forum?id=HygT9oRqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HygT9oRqFX", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;4;4", "wc_review": "515;268;646", "wc_reply_reviewers": "0;0;116", "wc_reply_authors": "799;714;790", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 476.3333333333333, 156.72126708125975 ], "wc_reply_reviewers_avg": [ 38.666666666666664, 54.68292441175968 ], "wc_reply_authors_avg": [ 767.6666666666666, 38.12552367582058 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3186315848457562968&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "HygTE309t7", "title": "Outlier Detection from Image Data", "track": "main", "status": "Reject", "tldr": "A novel approach that detects outliers from image data, while preserving the classification accuracy of image classification", "abstract": "Modern applications from Autonomous Vehicles to Video Surveillance generate massive amounts of image data. In this work we propose a novel image outlier detection approach (IOD for short) that leverages the cutting-edge image classifier to discover outliers without using any labeled outlier. We observe that although intuitively the confidence that a convolutional neural network (CNN) has that an image belongs to a particular class could serve as outlierness measure to each image, directly applying this confidence to detect outlier does not work well. This is because CNN often has high confidence on an outlier image that does not belong to any target class due to its generalization ability that ensures the high accuracy in classification. To solve this issue, we propose a Deep Neural Forest-based approach that harmonizes the contradictory requirements of accurately classifying images and correctly detecting the outlier images. 
Our experiments using several benchmark image datasets including MNIST, CIFAR-10, CIFAR-100, and SVHN demonstrate the effectiveness of our IOD approach for outlier detection, capturing more than 90% of outliers generated by injecting one image dataset into another, while still preserving the classification accuracy of the multi-class classification problem.", "keywords": "Image outlier;CNN;Deep Neural Forest", "primary_area": "", "supplementary_material": "", "author": "Lei Cao;Yizhou Yan;Samuel Madden;Elke Rundensteiner", "authorids": "lcao@csail.mit.edu;yyan2@wpi.edu;madden@csail.mit.edu;rundenst@cs.wpi.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ncao2019outlier,\ntitle={Outlier Detection from Image Data},\nauthor={Lei Cao and Yizhou Yan and Samuel Madden and Elke Rundensteiner},\nyear={2019},\nurl={https://openreview.net/forum?id=HygTE309t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HygTE309t7", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "wc_review": "208;590;228", "wc_reply_reviewers": "0;517;0", "wc_reply_authors": "661;3019;1323", "reply_reviewers": "0;1;0", "reply_authors": "1;6;3", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 342.0, 175.5524612948126 ], "wc_reply_reviewers_avg": [ 172.33333333333334, 243.71613724896338 ], "wc_reply_authors_avg": [ 1667.6666666666667, 993.021427540995 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 2.0548046676563256 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "HygUOoC5KX", "title": "Are Generative Classifiers More Robust to Adversarial Attacks?", "track": "main", "status": "Reject", "tldr": "We proposed a generative classifier based on deep generative models, and show improved robustness and detection results against adversarial attacks. ", "abstract": "There is a rising interest in studying the robustness of deep neural network classifiers against adversaries, with both advanced attack and defence techniques being actively developed. However, most recent work focuses on discriminative classifiers, which only model the conditional distribution of the labels given the inputs. In this paper, we propose and investigate the deep Bayes classifier, which improves classical naive Bayes with conditional deep generative models. We further develop detection methods for adversarial examples, which reject inputs with low likelihood under the generative model. 
Experimental results suggest that deep Bayes classifiers are more robust than deep discriminative classifiers, and that the proposed detection methods are effective against many recently proposed attacks.", "keywords": "generative models;adversarial attack;defence;detection;Bayes' rule", "primary_area": "", "supplementary_material": "", "author": "Yingzhen Li;John Bradshaw;Yash Sharma", "authorids": "yl494@cam.ac.uk;jab255@cam.ac.uk;ysharma1126@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli2019are,\ntitle={Are Generative Classifiers More Robust to Adversarial Attacks?},\nauthor={Yingzhen Li and John Bradshaw and Yash Sharma},\nyear={2019},\nurl={https://openreview.net/forum?id=HygUOoC5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HygUOoC5KX", "pdf_size": 0, "rating": "4;4;6;8", "confidence": "5;4;3;3", "wc_review": "837;363;316;467", "wc_reply_reviewers": "712;0;0;0", "wc_reply_authors": "2627;668;426;241", "reply_reviewers": "3;0;0;0", "reply_authors": "5;1;1;1", "rating_avg": [ 5.5, 1.6583123951777 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "wc_review_avg": [ 495.75, 204.45705539305803 ], "wc_reply_reviewers_avg": [ 178.0, 308.3050437472602 ], "wc_reply_authors_avg": [ 990.5, 956.88936142064 ], "reply_reviewers_avg": [ 0.75, 1.299038105676658 ], "reply_authors_avg": [ 2.0, 1.7320508075688772 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8181818181818182, "gs_citation": 111, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10770378244624939531&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "HygYqs0qKX", "title": "Conscious Inference for Object Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "Current Convolutional Neural Network (CNN)-based object detection models adopt strictly feedforward inference to predict the final detection results. However, the widely used one-way inference is agnostic to the global image context and the interplay between input image and task semantics. In this work, we present a general technique to improve off-the-shelf CNN-based object detection models in the inference stage without re-training, architecture modification or ground-truth requirements. We propose an iterative, bottom-up and top-down inference mechanism, which is named conscious inference, as it is inspired by prevalent models for human consciousness with top-down guidance and temporal persistence. While the downstream pass accumulates category-specific evidence over time, it subsequently affects the proposal calculation and the final detection. Feature activations are updated in line with no additional memory cost. 
Our approach advances the state of the art using popular detection models (Faster-RCNN, YOLOv2, YOLOv3) on 2D object detection and 6D object pose estimation.", "keywords": "consciousness;conscious inference;object detection;object pose estimation", "primary_area": "", "supplementary_material": "", "author": "Jiahuan Zhou;Nikolaos Karianakis;Ying Wu;Gang Hua", "authorids": "zhoujh09@gmail.com;nikolaos.karianakis@microsoft.com;yingwu@eecs.northwestern.edu;ganghua@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhou2019conscious,\ntitle={Conscious Inference for Object Detection},\nauthor={Jiahuan Zhou and Nikolaos Karianakis and Ying Wu and Gang Hua},\nyear={2019},\nurl={https://openreview.net/forum?id=HygYqs0qKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HygYqs0qKX", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;4", "wc_review": "335;117;418", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "28;28;28", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 290.0, 126.93567925003067 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 28.0, 0.0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WDKs2qKhIZYJ:scholar.google.com/&scioq=Conscious+Inference+for+Object+Detection&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "GAN Dissection: Visualizing and Understanding Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1089", "id": "Hyg_X2C5FX", "author_site": "David Bau, Jun-Yan Zhu, Hendrik Strobelt, Bolei Zhou, Joshua Tenenbaum, William Freeman, Antonio Torralba", "tldr": "GAN representations are examined in detail, and sets of representation units are found that control the generation of semantic concepts in the output.", "abstract": "Generative Adversarial Networks (GANs) have recently achieved impressive results for many real-world applications, and many GAN variants have emerged with improvements in sample quality and training stability. However, visualization and understanding of GANs is largely missing. How does a GAN represent our visual world internally? What causes the artifacts in GAN results? How do architectural choices affect GAN learning? Answering such questions could enable us to develop new insights and better models.\n\nIn this work, we present an analytic framework to visualize and understand GANs at the unit-, object-, and scene-level. We first identify a group of interpretable units that are closely related to object concepts with a segmentation-based network dissection method. Then, we quantify the causal effect of interpretable units by measuring the ability of interventions to control objects in the output. Finally, we examine the contextual relationship between these units and their surrounding by inserting the discovered object concepts into new images. 
We show several practical applications enabled by our framework, from comparing internal representations across different layers, models, and datasets, to improving GANs by locating and removing artifact-causing units, to interactively manipulating objects in the scene. We provide open source interpretation tools to help peer researchers and practitioners better understand their GAN models.", "keywords": "GANs;representation;interpretability;causality", "primary_area": "", "supplementary_material": "", "author": "David Bau;Jun-Yan Zhu;Hendrik Strobelt;Bolei Zhou;Joshua B. Tenenbaum;William T. Freeman;Antonio Torralba", "authorids": "davidbau@csail.mit.edu;junyanz@csail.mit.edu;hendrik.strobelt@ibm.com;bzhou@csail.mit.edu;jbt@csail.mit.edu;billf@csail.mit.edu;torralba@csail.mit.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nbau2018visualizing,\ntitle={Visualizing and Understanding Generative Adversarial Networks},\nauthor={David Bau and Jun-Yan Zhu and Hendrik Strobelt and Bolei Zhou and Joshua B. Tenenbaum and William T. Freeman and Antonio Torralba},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyg_X2C5FX},\n}", "github": "[![github](/images/github_icon.svg) CSAILVision/gandissect](https://github.com/CSAILVision/gandissect) + [![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=Hyg_X2C5FX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;3;4", "wc_review": "200;457;163", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "588;559;594", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 273.3333333333333, 130.74742402391294 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 580.3333333333334, 15.2825245151302 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 694, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=197925763027882731&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "openreview": "https://openreview.net/forum?id=Hyg_X2C5FX", "pdf": "https://openreview.net/pdf?id=Hyg_X2C5FX", "email": ";;;;;;", "author_num": 7 }, { "id": "HygcvsAcFX", "title": "Optimal margin Distribution Network", "track": "main", "status": "Reject", "tldr": "This paper presents a deep neural network embedding a loss function in regard to the optimal margin distribution, which alleviates the overfitting problem theoretically and empirically.", "abstract": "Recent research about margin theory has proved that maximizing the minimum margin like support vector machines does not necessarily lead to better performance, and instead, it is crucial to optimize the margin distribution. In the meantime, margin theory has been used to explain the empirical success of deep network in recent studies. In this paper, we present ODN (the Optimal margin Distribution Network), a network which embeds a loss function in regard to the optimal margin distribution. 
We give a theoretical analysis for our method using the PAC-Bayesian framework, which confirms the significance of the margin distribution for classification within the framework of deep networks. In addition, empirical results show that the ODN model always outperforms the baseline cross-entropy loss model consistently across different regularization situations. And our ODN\nmodel also outperforms the cross-entropy loss (Xent), hinge loss and soft hinge loss model in generalization task through limited training data.", "keywords": "Optimal margin distribution;Deep neural network;Generalization bound", "primary_area": "", "supplementary_material": "", "author": "Shen-Huan Lv;Lu Wang;Zhi-Hua Zhou", "authorids": "lvsh@lamda.nju.edu.cn;wangl@lamda.nju.edu.cn;zhouzh@lamda.nju.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlv2019optimal,\ntitle={Optimal margin Distribution Network},\nauthor={Shen-Huan Lv and Lu Wang and Zhi-Hua Zhou},\nyear={2019},\nurl={https://openreview.net/forum?id=HygcvsAcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HygcvsAcFX", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;3", "wc_review": "313;389;164", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "824;191;147", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 288.6666666666667, 93.45349407890298 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 387.3333333333333, 309.292023103661 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6561764432331589061&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Hyghb2Rct7", "title": "SIMILE: Introducing Sequential Information towards More Effective Imitation Learning", "track": "main", "status": "Reject", "tldr": "This paper introduces sequential information to improve inverse reinforcement learning algorithms", "abstract": "Reinforcement learning (RL) is a metaheuristic aiming at teaching an agent to interact with an environment and maximizing the reward in a complex task. RL algorithms often encounter the difficulty in defining a reward function in a sparse solution space. Imitation learning (IL) deals with this issue by providing a few expert demonstrations, and then either mimicking the expert's behavior (behavioral cloning, BC) or recovering the reward function by assuming the optimality of the expert (inverse reinforcement learning, IRL). Conventional IL approaches formulate the agent policy by mapping one single state to a distribution over actions, which did not consider sequential information. This strategy can be less accurate especially in IL, a weakly supervised learning environment, especially when the number of expert demonstrations is limited.\n\nThis paper presents an effective approach named Sequential IMItation LEarning (SIMILE). The core idea is to introduce sequential information, so that an agent can refer to both the current state and past state-action pairs to make a decision. 
We formulate our approach into a recurrent model, and instantiate it using LSTM so as to fuse both long-term and short-term information. SIMILE is a generalized IL framework which is easily applied to BC and IRL, two major types of IL algorithms. Experiments are performed on several robot control tasks in OpenAI Gym. SIMILE not only achieves performance gains over the baseline approaches, but also enjoys the benefit of faster convergence and better stability of testing performance. These advantages verify a higher learning efficiency of SIMILE, and imply its potential applications in real-world scenarios, i.e., when the agent-environment interaction is more difficult and/or expensive.", "keywords": "Reinforcement Learning;Imitation Learning;Sequential Information", "primary_area": "", "supplementary_material": "", "author": "Yutong Bai;Lingxi Xie", "authorids": "ytongbai@gmail.com;198808xc@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbai2019simile,\ntitle={{SIMILE}: Introducing Sequential Information towards More Effective Imitation Learning},\nauthor={Yutong Bai and Lingxi Xie},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyghb2Rct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hyghb2Rct7", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;5;3", "wc_review": "491;370;208", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "123;167;72", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 356.3333333333333, 115.93772274611726 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 120.66666666666667, 38.818666758260626 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FTCDWMV9M0AJ:scholar.google.com/&scioq=SIMILE:+Introducing+Sequential+Information+towards+More+Effective+Imitation+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Improving MMD-GAN Training with Repulsive Loss Function", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/676", "id": "HygjqjR9Km", "author_site": "Wei Wang, Yuan Sun, Saman Halgamuge", "tldr": "Rearranging the terms in maximum mean discrepancy yields a much better loss function for the discriminator of generative adversarial nets", "abstract": "Generative adversarial nets (GANs) are widely used to learn the data sampling process and their performance may heavily depend on the loss functions, given a limited computational budget. This study revisits MMD-GAN that uses the maximum mean discrepancy (MMD) as the loss function for GAN and makes two contributions. First, we argue that the existing MMD loss function may discourage the learning of fine details in data as it attempts to contract the discriminator outputs of real data. To address this issue, we propose a repulsive loss function to actively learn the difference among the real data by simply rearranging the terms in MMD. Second, inspired by the hinge loss, we propose a bounded Gaussian kernel to stabilize the training of MMD-GAN with the repulsive loss function. 
The proposed methods are applied to the unsupervised image generation tasks on CIFAR-10, STL-10, CelebA, and LSUN bedroom datasets. Results show that the repulsive loss function significantly improves over the MMD loss at no additional computational cost and outperforms other representative loss functions. The proposed methods achieve an FID score of 16.21 on the CIFAR-10 dataset using a single DCGAN network and spectral normalization.", "keywords": "generative adversarial nets;loss function;maximum mean discrepancy;image generation;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Wei Wang;Yuan Sun;Saman Halgamuge", "authorids": "weiw8@student.unimelb.edu.au;yuan.sun@rmit.edu.au;saman@unimelb.edu.au", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nwang2018improving,\ntitle={Improving {MMD}-{GAN} Training with Repulsive Loss Function},\nauthor={Wei Wang and Yuan Sun and Saman Halgamuge},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HygjqjR9Km},\n}", "github": "[![github](/images/github_icon.svg) richardwth/MMD-GAN](https://github.com/richardwth/MMD-GAN)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;5;5", "wc_review": "593;530;162", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "749;800;244", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "wc_review_avg": [ 428.3333333333333, 190.0742544960317 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 597.6666666666666, 250.94532383679828 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999999, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5981776109708607840&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 18, "openreview": "https://openreview.net/forum?id=HygjqjR9Km", "pdf": "https://openreview.net/pdf?id=HygjqjR9Km", "email": ";;", "author_num": 3 }, { "id": "Hygm8jC9FQ", "title": "FAVAE: SEQUENCE DISENTANGLEMENT USING IN- FORMATION BOTTLENECK PRINCIPLE", "track": "main", "status": "Reject", "tldr": "We propose new model that can disentangle multiple dynamic factors in sequential data", "abstract": "A state-of-the-art generative model, a \u201dfactorized action variational autoencoder (FAVAE),\u201d is presented for learning disentangled and interpretable representations from sequential data via the information bottleneck without supervision. The purpose of disentangled representation learning is to obtain interpretable and transferable representations from data. We focused on the disentangled representation of sequential data because there is a wide range of potential applications if disentanglement representation is extended to sequential data such as video, speech, and stock price data. Sequential data is characterized by dynamic factors and static factors: dynamic factors are time-dependent, and static factors are independent of time. Previous works succeed in disentangling static factors and dynamic factors by explicitly modeling the priors of latent variables to distinguish between static and dynamic factors. 
However, this model can not disentangle representations between dynamic factors, such as disentangling \u201dpicking\u201d and \u201dthrowing\u201d in robotic tasks. In this paper, we propose new model that can disentangle multiple dynamic factors. Since our method does not require modeling priors, it is capable of disentangling \u201dbetween\u201d dynamic factors. In experiments, we show that FAVAE can extract the disentangled dynamic factors.", "keywords": "disentangled representation learning", "primary_area": "", "supplementary_material": "", "author": "Masanori Yamada;Kim Heecheol;Kosuke Miyoshi;Hiroshi Yamakawa", "authorids": "yamada0224@gmail.com;h-kim@isi.imi.i.u-tokyo.ac.jp;miyoshi@narr.jp;hiroshi_yamakawa@dwango.co.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyamada2019favae,\ntitle={{FAVAE}: {SEQUENCE} {DISENTANGLEMENT} {USING} {IN}- {FORMATION} {BOTTLENECK} {PRINCIPLE}},\nauthor={Masanori Yamada and Kim Heecheol and Kosuke Miyoshi and Hiroshi Yamakawa},\nyear={2019},\nurl={https://openreview.net/forum?id=Hygm8jC9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hygm8jC9FQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;5", "wc_review": "285;1006;868", "wc_reply_reviewers": "0;0;281", "wc_reply_authors": "124;224;843", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 719.6666666666666, 312.4764880043866 ], "wc_reply_reviewers_avg": [ 93.66666666666667, 132.4646703422799 ], "wc_reply_authors_avg": [ 397.0, 318.00104821630174 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=289041464039541409&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Deterministic PAC-Bayesian generalization bounds for deep networks via generalizing noise-resilience", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/954", "id": "Hygn2o0qKX", "author_site": "Vaishnavh Nagarajan, Zico Kolter", "tldr": "We provide a PAC-Bayes based generalization guarantee for uncompressed, deterministic deep networks by generalizing noise-resilience of the network on the training data to the test data.", "abstract": "The ability of overparameterized deep networks to generalize well has been linked to the fact that stochastic gradient descent (SGD) finds solutions that lie in flat, wide minima in the training loss -- minima where the output of the network is resilient to small random noise added to its parameters. \nSo far this observation has been used to provide generalization guarantees only for neural networks whose parameters are either \\textit{stochastic} or \\textit{compressed}. In this work, we present a general PAC-Bayesian framework that leverages this observation to provide a bound on the original network learned -- a network that is deterministic and uncompressed. 
What enables us to do this is a key novelty in our approach: our framework allows us to show that if on training data, the interactions between the weight matrices satisfy certain conditions that imply a wide training loss minimum, these conditions themselves {\\em generalize} to the interactions between the matrices on test data, thereby implying a wide test loss minimum. We then apply our general framework in a setup where we assume that the pre-activation values of the network are not too small (although we assume this only on the training data). In this setup, we provide a generalization guarantee for the original (deterministic, uncompressed) network, that does not scale with product of the spectral norms of the weight matrices -- a guarantee that would not have been possible with prior approaches.", "keywords": "generalization;PAC-Bayes;SGD;learning theory;implicit regularization", "primary_area": "", "supplementary_material": "", "author": "Vaishnavh Nagarajan;Zico Kolter", "authorids": "vaishnavh@cs.cmu.edu;zkolter@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nnagarajan2018deterministic,\ntitle={Deterministic {PAC}-Bayesian generalization bounds for deep networks via generalizing noise-resilience},\nauthor={Vaishnavh Nagarajan and Zico Kolter},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hygn2o0qKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "5;7;7;8", "confidence": "4;3;2;5", "wc_review": "342;319;234;309", "wc_reply_reviewers": "709;0;0;416", "wc_reply_authors": "4567;525;51;1069", "reply_reviewers": "2;0;0;1", "reply_authors": "9;3;1;5", "rating_avg": [ 6.75, 1.0897247358851685 ], "confidence_avg": [ 3.5, 1.118033988749895 ], "wc_review_avg": [ 301.0, 40.4907396820557 ], "wc_reply_reviewers_avg": [ 281.25, 299.7210161133183 ], "wc_reply_authors_avg": [ 1553.0, 1777.0227910750048 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 4.5, 2.958039891549808 ], "replies_avg": [ 36, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.10259783520851541, "gs_citation": 121, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2407471741644905222&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Hygn2o0qKX", "pdf": "https://openreview.net/pdf?id=Hygn2o0qKX", "email": ";", "author_num": 2 }, { "id": "Hygp1nR9FQ", "title": "Unifying Bilateral Filtering and Adversarial Training for Robust Neural Networks", "track": "main", "status": "Reject", "tldr": "We adapt bilateral filtering as a layer in a neural network which improves robustness to adversarial examples using nonlocal filtering.", "abstract": "Recent analysis of deep neural networks has revealed their vulnerability to carefully structured adversarial examples. Many effective algorithms exist to craft these adversarial examples, but performant defenses seem to be far away. In this work, we explore the use of edge-aware bilateral filtering as a projection back to the space of natural images. We show that bilateral filtering is an effective defense in multiple attack settings, where the strength of the adversary gradually increases. 
In the case of adversary who has no knowledge of the defense, bilateral filtering can remove more than 90% of adversarial examples from a variety of different attacks. To evaluate against an adversary with complete knowledge of our defense, we adapt the bilateral filter as a trainable layer in a neural network and show that adding this layer makes ImageNet images significantly more robust to attacks. When trained under a framework of adversarial training, we show that the resulting model is hard to fool with even the best attack methods. ", "keywords": "Adversarial examples;Image denoising", "primary_area": "", "supplementary_material": "", "author": "Neale Ratzlaff;Li Fuxin", "authorids": "ratzlafn@oregonstate.edu;lif@oregonstate.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nratzlaff2019unifying,\ntitle={Unifying Bilateral Filtering and Adversarial Training for Robust Neural Networks},\nauthor={Neale Ratzlaff and Li Fuxin},\nyear={2019},\nurl={https://openreview.net/forum?id=Hygp1nR9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Hygp1nR9FQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;3;5", "wc_review": "896;194;532", "wc_reply_reviewers": "378;0;0", "wc_reply_authors": "1026;200;419", "reply_reviewers": "2;0;0", "reply_authors": "3;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 540.6666666666666, 286.65581374804793 ], "wc_reply_reviewers_avg": [ 126.0, 178.19090885900997 ], "wc_reply_authors_avg": [ 548.3333333333334, 349.3940786879798 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6752184922481277519&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "HygqJnCqtm", "title": "Rating Continuous Actions in Spatial Multi-Agent Problems", "track": "main", "status": "Reject", "tldr": "", "abstract": "We study credit assignment problems in spatial multi-agent environments where agents pursue a joint objective. On the example of soccer, we rate the movements of individual players with respect to their potential for staging a successful attack. We propose a purely data-driven approach to simultaneously learn a model of agent movements as well as their ratings via an agent-centric deep reinforcement learning framework. Our model allows for efficient learning and sampling of ratings in the continuous action space. We empirically observe on historic soccer data that the model accurately rates agent movements w.r.t. 
their relative contribution to the collective goal.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Uwe Dick;Maryam Tavakol;Ulf Brefeld", "authorids": "uwe.dick@leuphana.de;tavakol@leuphana.de;brefeld@leuphana.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndick2019rating,\ntitle={Rating Continuous Actions in Spatial Multi-Agent Problems},\nauthor={Uwe Dick and Maryam Tavakol and Ulf Brefeld},\nyear={2019},\nurl={https://openreview.net/forum?id=HygqJnCqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HygqJnCqtm", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;4", "wc_review": "878;214;159", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 417.0, 326.748629173355 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3091047013327852799&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Recall Traces: Backtracking Models for Efficient Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1033", "id": "HygsfnR9Ym", "author_site": "Anirudh Goyal, Philemon Brakel, William Fedus, Soumye Singhal, Timothy Lillicrap, Sergey Levine, Hugo Larochelle, Yoshua Bengio", "tldr": "A backward model of previous (state, action) given the next state, i.e. P(s_t, a_t | s_{t+1}), can be used to simulate additional trajectories terminating at states of interest! Improves RL learning efficiency.", "abstract": "In many environments only a tiny subset of all states yield high reward. In these cases, few of the interactions with the environment provide a relevant learning signal. Hence, we may want to preferentially train on those high-reward states and the probable trajectories leading to them. \nTo this end, we advocate for the use of a \\textit{backtracking model} that predicts the preceding states that terminate at a given high-reward state. We can train a model which, starting from a high value state (or one that is estimated to have high value), predicts and samples which (state, action)-tuples may have led to that high value state. These traces of (state, action) pairs, which we refer to as Recall Traces, sampled from this backtracking model starting from a high value state, are informative as they terminate in good states, and hence we can use these traces to improve a policy. We provide a variational interpretation for this idea and a practical algorithm in which the backtracking model samples from an approximate posterior distribution over trajectories which lead to large rewards. Our method improves the sample efficiency of both on- and off-policy RL algorithms across several environments and tasks. 
", "keywords": "Model free RL;Variational Inference", "primary_area": "", "supplementary_material": "", "author": "Anirudh Goyal;Philemon Brakel;William Fedus;Soumye Singhal;Timothy Lillicrap;Sergey Levine;Hugo Larochelle;Yoshua Bengio", "authorids": "anirudhgoyal9119@gmail.com;philemon@google.com;liam.fedus@gmail.com;singhalsoumye@gmail.com;countzero@google.com;svlevine@eecs.berkeley.edu;hugolarochelle@google.com;yoshua.bengio@mila.quebec", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\ngoyal2018recall,\ntitle={Recall Traces: Backtracking Models for Efficient Reinforcement Learning},\nauthor={Anirudh Goyal and Philemon Brakel and William Fedus and Soumye Singhal and Timothy Lillicrap and Sergey Levine and Hugo Larochelle and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HygsfnR9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;2", "wc_review": "98;125;488", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "111;699;1523", "reply_reviewers": "0;0;0", "reply_authors": "1;2;5", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 237.0, 177.82575741438583 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 777.6666666666666, 579.1242430505641 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.6666666666666665, 1.699673171197595 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4448196484511573554&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HygsfnR9Ym", "pdf": "https://openreview.net/pdf?id=HygsfnR9Ym", "email": ";;;;;;;", "author_num": 8 }, { "id": "HygtHnR5tQ", "title": "Generative Adversarial Networks for Extreme Learned Image Compression", "track": "main", "status": "Reject", "tldr": "GAN-based extreme image compression method using less than half the bits of the SOTA engineered codec while preserving visual quality", "abstract": "We propose a framework for extreme learned image compression based on Generative Adversarial Networks (GANs), obtaining visually pleasing images at significantly lower bitrates than previous methods. This is made possible through our GAN formulation of learned compression combined with a generator/decoder which operates on the full-resolution image and is trained in combination with a multi-scale discriminator. Additionally, if a semantic label map of the original image is available, our method can fully synthesize unimportant regions in the decoded image such as streets and trees from the label map, therefore only requiring the storage of the preserved region and the semantic label map. 
A user study confirms that for low bitrates, our approach is preferred to state-of-the-art methods, even when they use more than double the bits.", "keywords": "Learned compression;generative adversarial networks;extreme compression", "primary_area": "", "supplementary_material": "", "author": "Eirikur Agustsson;Michael Tschannen;Fabian Mentzer;Radu Timofte;Luc van Gool", "authorids": "aeirikur@vision.ee.ethz.ch;michaelt@nari.ee.ethz.ch;mentzerf@vision.ee.ethz.ch;radu.timofte@vision.ee.ethz.ch;vangool@vision.ee.ethz.ch", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nagustsson2019generative,\ntitle={Generative Adversarial Networks for Extreme Learned Image Compression},\nauthor={Eirikur Agustsson and Michael Tschannen and Fabian Mentzer and Radu Timofte and Luc van Gool},\nyear={2019},\nurl={https://openreview.net/forum?id=HygtHnR5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HygtHnR5tQ", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;3", "wc_review": "263;157;232", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "493;220;311", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 217.33333333333334, 44.4996878890428 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 341.3333333333333, 113.49694073214289 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 698, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12093745616543835312&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13 }, { "id": "Hygv0sC5F7", "title": "When Will Gradient Methods Converge to Max-margin Classifier under ReLU Models?", "track": "main", "status": "Reject", "tldr": "We study the implicit bias of gradient methods in solving a binary classification problem with nonlinear ReLU models.", "abstract": "We study the implicit bias of gradient descent methods in solving a binary classification problem over a linearly separable dataset. The classifier is described by a nonlinear ReLU model and the objective function adopts the exponential loss function. We first characterize the landscape of the loss function and show that there can exist spurious asymptotic local minima besides asymptotic global minima. We then show that gradient descent (GD) can converge to either a global or a local max-margin direction, or may diverge from the desired max-margin direction in a general context. For stochastic gradient descent (SGD), we show that it converges in expectation to either the global or the local max-margin direction if SGD converges. 
We further explore the implicit bias of these algorithms in learning a multi-neuron network under certain stationary conditions, and show that the learned classifier maximizes the margins of each sample pattern partition under the ReLU activation.", "keywords": "gradient method;max-margin;ReLU model", "primary_area": "", "supplementary_material": "", "author": "Tengyu Xu;Yi Zhou;Kaiyi Ji;Yingbin Liang", "authorids": "xu.3260@osu.edu;zhou.1172@osu.edu;ji.367@osu.edu;liang.889@osu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nxu2019when,\ntitle={When Will Gradient Methods Converge to Max-margin Classifier under Re{LU} Models?},\nauthor={Tengyu Xu and Yi Zhou and Kaiyi Ji and Yingbin Liang},\nyear={2019},\nurl={https://openreview.net/forum?id=Hygv0sC5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Hygv0sC5F7", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;5", "wc_review": "295;130;513", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 312.6666666666667, 156.85733078891218 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9020199152029029285&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "Hygvln09K7", "title": "Meta Learning with Fast/Slow Learners", "track": "main", "status": "Reject", "tldr": "We applied multiple meta-strategy to improve meta-learning performance on base CNNs. ", "abstract": "Meta-learning has recently achieved success in many optimization problems. In general, a meta learner g(.) could be learned for a base model f(.) on a variety of tasks, such that it can be more efficient on a new task. In this paper, we make some key modifications to enhance the performance of meta-learning models. (1) we leverage different meta-strategies for different modules to optimize them separately: we use conservative \u201cslow learners\u201d on low-level basic feature representation layers and \u201cfast learners\u201d on high-level task-specific layers; (2) Furthermore, we provide theoretical analysis on why the proposed approach works, based on a case study on a two-layer MLP. We evaluate our model on synthetic MLP regression, as well as low-shot learning tasks on Omniglot and ImageNet benchmarks. 
We demonstrate that our approach is able to achieve state-of-the-art performance.", "keywords": "computer vision;meta learning", "primary_area": "", "supplementary_material": "", "author": "zhuoyuan@fb.com", "authorids": "chengzhuoyuan07@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nzhuoyuan@fb.com2019meta,\ntitle={Meta Learning with Fast/Slow Learners},\nauthor={zhuoyuan@fb.com},\nyear={2019},\nurl={https://openreview.net/forum?id=Hygvln09K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Hygvln09K7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;3", "wc_review": "375;175;459", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 336.3333333333333, 119.12271916902426 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Stable Recurrent Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/658", "id": "Hygxb2CqKm", "author_site": "John Miller, Moritz Hardt", "tldr": "Stable recurrent models can be approximated by feed-forward networks and empirically perform as well as unstable models on benchmark tasks.", "abstract": "Stability is a fundamental property of dynamical systems, yet to this date it has had little bearing on the practice of recurrent neural networks. In this work, we conduct a thorough investigation of stable recurrent models. Theoretically, we prove stable recurrent neural networks are well approximated by feed-forward networks for the purpose of both inference and training by gradient descent. Empirically, we demonstrate stable recurrent models often perform as well as their unstable counterparts on benchmark sequence tasks. Taken together, these findings shed light on the effective power of recurrent networks and suggest much of sequence learning happens, or can be made to happen, in the stable regime. 
Moreover, our results help to explain why in many cases practitioners succeed in replacing recurrent models by feed-forward models.\n", "keywords": "stability;gradient descent;non-convex optimization;recurrent neural networks", "primary_area": "", "supplementary_material": "", "author": "John Miller;Moritz Hardt", "authorids": "miller_john@berkeley.edu;hardt@berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmiller2018stable,\ntitle={Stable Recurrent Models},\nauthor={John Miller and Moritz Hardt},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hygxb2CqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;2", "wc_review": "504;138;603", "wc_reply_reviewers": "54;143;70", "wc_reply_authors": "487;822;278", "reply_reviewers": "1;2;2", "reply_authors": "1;3;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 415.0, 199.99499993749845 ], "wc_reply_reviewers_avg": [ 89.0, 38.738439135652676 ], "wc_reply_authors_avg": [ 529.0, 224.06397895839186 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10865601919039311451&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Hygxb2CqKm", "pdf": "https://openreview.net/pdf?id=Hygxb2CqKm", "email": ";", "author_num": 2 }, { "id": "HylDpoActX", "title": "N-Ary Quantization for CNN Model Compression and Inference Acceleration", "track": "main", "status": "Reject", "tldr": "We propose a quantization scheme for weights and activations of deep neural networks. This reduces the memory footprint substantially and accelerates inference.", "abstract": "The tremendous memory and computational complexity of Convolutional Neural Networks (CNNs) prevents the inference deployment on resource-constrained systems. As a result, recent research focused on CNN optimization techniques, in particular quantization, which allows weights and activations of layers to be represented with just a few bits while achieving impressive prediction performance. However, aggressive quantization techniques still fail to achieve full-precision prediction performance on state-of-the-art CNN architectures on large-scale classification tasks. In this work we propose a method for weight and activation quantization that is scalable in terms of quantization levels (n-ary representations) and easy to compute while maintaining the performance close to full-precision CNNs. Our weight quantization scheme is based on trainable scaling factors and a nested-means clustering strategy which is robust to weight updates and therefore exhibits good convergence properties. The flexibility of nested-means clustering enables exploration of various n-ary weight representations with the potential of high parameter compression. For activations, we propose a linear quantization strategy that takes the statistical properties of batch normalization into account. 
We demonstrate the effectiveness of our approach using state-of-the-art models on ImageNet.", "keywords": "low-resource deep neural networks;quantized weights;weight-clustering;resource efficient neural networks", "primary_area": "", "supplementary_material": "", "author": "G\u00fcnther Schindler;Wolfgang Roth;Franz Pernkopf;Holger Fr\u00f6ning", "authorids": "guenther.schindler@ziti.uni-heidelberg.de;roth@tugraz.at;pernkopf@tugraz.at;holger.froening@ziti.uni-heidelberg.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nschindler2019nary,\ntitle={N-Ary Quantization for {CNN} Model Compression and Inference Acceleration},\nauthor={G\u00fcnther Schindler and Wolfgang Roth and Franz Pernkopf and Holger Fr\u00f6ning},\nyear={2019},\nurl={https://openreview.net/forum?id=HylDpoActX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=HylDpoActX", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;4;5", "wc_review": "212;467;248", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "266;339;132", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 309.0, 112.68540278137182 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 245.66666666666666, 85.72177215983243 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3983504826308906541&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HylIcj0qFQ", "title": "Capacity of Deep Neural Networks under Parameter Quantization", "track": "main", "status": "Withdraw", "tldr": "We suggest the sufficient number of bits for representing weights of DNNs and the optimum bits are conservative when solving real problems.", "abstract": "Most deep neural networks (DNNs) require complex models to achieve high performance. Parameter quantization is widely used for reducing the implementation complexities. Previous studies on quantization were mostly based on extensive simulation using training data. We choose a different approach and attempt to measure the per-parameter capacity of DNN models and interpret the results to obtain insights on optimum quantization of parameters. This research uses artificially generated data and generic forms of fully connected DNNs, convolutional neural networks, and recurrent neural networks. We conduct memorization and classification tests to study the effects of the number and precision of the parameters on the performance. The model and the per-parameter capacities are assessed by measuring the mutual information between the input and the classified output. We also extend the memorization capacity measurement results to image classification and language modeling tasks. 
To gain insight into parameter quantization when performing real tasks, the training and test performances are compared.", "keywords": "quantization;network capacity;hardware implementation;network compression", "primary_area": "", "supplementary_material": "", "author": "Yoonho Boo;Sungho Shin;and Wonyong Sung", "authorids": "dnsgh337@snu.ac.kr;ssh9919@snu.ac.kr;wysung@snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HylIcj0qFQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "3;4;3", "wc_review": "416;697;989", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 700.6666666666666, 233.94063824445342 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Yc2eLZfEiv0J:scholar.google.com/&scioq=Capacity+of+Deep+Neural+Networks+under+Parameter+Quantization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HylJtiRqYQ", "title": "VECTORIZATION METHODS IN RECOMMENDER SYSTEM", "track": "main", "status": "Reject", "tldr": "", "abstract": "The most widely used recommendation method is collaborative filtering, and its key step is computing the similarity. Similarity based on the co-occurrence of similar events is easy to implement and can be applied in almost every situation. When the word2vec model reached state-of-the-art performance at a lower computational cost in NLP, a corresponding model for recommender systems, item2vec, was proposed and reached state-of-the-art results in recommendation. Since the positions of user and item are interchangeable when the gap between their counts is not too large, we propose a user2vec model and show its performance. 
Similarity based on co-occurrence information suffers from the cold-start problem, so we also propose a content-based similarity model built on doc2vec, another technique from NLP.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Qiang Sun;Bin Wang;Zizhou Gu;Yanwei Fu", "authorids": "~Qiang_Sun2;vborisw@gmail.com;2470569@qq.com;yanweifu@fudan.edu.cn", "gender": "M;;;", "homepage": "https://sites.google.com/view/qsun;;;", "dblp": "73/2066-7;;;", "google_scholar": "f0V2fAYAAAAJ;;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Qiang_Sun2;vborisw@gmail.com;2470569@qq.com;yanweifu@fudan.edu.cn", "aff": "University of Toronto;;;", "aff_domain": "utoronto.ca;;;", "position": "Full Professor;;;", "bibtex": "@misc{\nsun2019vectorization,\ntitle={{VECTORIZATION} {METHODS} {IN} {RECOMMENDER} {SYSTEM}},\nauthor={Qiang Sun and Bin Wang and Zizhou Gu and Yanwei Fu},\nyear={2019},\nurl={https://openreview.net/forum?id=HylJtiRqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HylJtiRqYQ", "pdf_size": 0, "rating": "2;2;3", "confidence": "5;5;4", "wc_review": "127;86;110", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 107.66666666666667, 16.81930108205715 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17575025455214368467&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_country_unique_index": "0", "aff_country_unique": "Canada" }, { "id": "HylKJhCcKm", "title": "Generalized Capsule Networks with Trainable Routing Procedure", "track": "main", "status": "Reject", "tldr": "A scalable capsule network", "abstract": "CapsNet (Capsule Network) was first proposed by Sabour et al. (2017), and later another version of CapsNet was proposed by Hinton et al. (2018). CapsNet has been proved effective in modeling spatial features with much fewer parameters. However, the routing procedures (dynamic routing and EM routing) in both papers are not well incorporated into the whole training process, and the optimal number for the routing procedure has to be found manually. We propose Generalized CapsNet (G-CapsNet) to overcome these disadvantages by incorporating the routing procedure into the optimization. We implement two versions of G-CapsNet (fully-connected and convolutional) on CAFFE (Jia et al. (2014)) and evaluate them by testing the accuracy on MNIST & CIFAR10, the robustness to white-box & black-box attacks, and the generalization ability on GAN-generated synthetic images. We also explore the scalability of G-CapsNet by constructing a relatively deep G-CapsNet. The experiments show that G-CapsNet has good generalization ability and scalability. 
", "keywords": "Capsule networks;generalization;scalability;adversarial robustness", "primary_area": "", "supplementary_material": "", "author": "Zhenhua Chen;Chuhua Wang;Tiancong Zhao;David Crandall", "authorids": "chen478@iu.edu;cw234@iu.edu;tz11@iu.edu;djcran@iu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2019generalized,\ntitle={Generalized Capsule Networks with Trainable Routing Procedure},\nauthor={Zhenhua Chen and Chuhua Wang and Tiancong Zhao and David Crandall},\nyear={2019},\nurl={https://openreview.net/forum?id=HylKJhCcKm},\n}", "github": "[![github](/images/github_icon.svg) chenzhenhua986/CAFFE-CapsNet](https://github.com/chenzhenhua986/CAFFE-CapsNet)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HylKJhCcKm", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;5;3", "wc_review": "424;258;196", "wc_reply_reviewers": "104;0;0", "wc_reply_authors": "945;424;488", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 292.6666666666667, 96.25429283356088 ], "wc_reply_reviewers_avg": [ 34.666666666666664, 49.026070162267295 ], "wc_reply_authors_avg": [ 619.0, 231.99281598072528 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 39, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1128210281313258105&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "HylRk2A5FQ", "title": "Graph Learning Network: A Structure Learning Algorithm", "track": "main", "status": "Reject", "tldr": "Methods for simultaneous prediction of nodes' feature embeddings and adjacency matrix, and how to learn this process.", "abstract": "Graph prediction methods that work closely with the structure of the data, e.g., graph generation, commonly ignore the content of its nodes. On the other hand, the solutions that consider the node\u2019s information, e.g., classification, ignore the structure of the whole. And some methods exist in between, e.g., link prediction, but predict the structure piece-wise instead of considering the graph as a whole. We hypothesize that by jointly predicting the structure of the graph and its nodes\u2019 features, we can improve both tasks. We propose the Graph Learning Network (GLN), a simple yet effective process to learn node embeddings and structure prediction functions. Our model uses graph convolutions to propose expected node features, and predict the best structure based on them. We repeat these steps sequentially to enhance the prediction and the embeddings. In contrast to existing generation methods that rely only on the structure of the data, we use the feature on the nodes to predict better relations, similar to what link prediction methods do. However, we propose an holistic approach to process the whole graph for our predictions. 
Our experiments show that our method predicts consistent structures across a set of problems, while creating meaningful node embeddings.", "keywords": "graph prediction;graph structure learning;graph neural network", "primary_area": "", "supplementary_material": "", "author": "Darwin Danilo Saire Pilco;Ad\u00edn Ram\u00edrez Rivera", "authorids": "darwin.pilco@ic.unicamp.br;adin@ic.unicamp.br", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npilco2019graph,\ntitle={Graph Learning Network: A Structure Learning Algorithm},\nauthor={Darwin Danilo Saire Pilco and Ad\u00edn Ram\u00edrez Rivera},\nyear={2019},\nurl={https://openreview.net/forum?id=HylRk2A5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HylRk2A5FQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;5", "wc_review": "817;267;271", "wc_reply_reviewers": "41;0;0", "wc_reply_authors": "113;45;140", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 451.6666666666667, 258.33483870529136 ], "wc_reply_reviewers_avg": [ 13.666666666666666, 19.3275853524323 ], "wc_reply_authors_avg": [ 99.33333333333333, 39.96943276499625 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7329105110329938467&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "HylSk205YQ", "title": "Multi-agent Deep Reinforcement Learning with Extremely Noisy Observations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Multi-agent reinforcement learning systems aim to provide interacting agents with the ability to collaboratively learn and adapt to the behaviour of other agents. In many real-world applications, the agents can only acquire a partial view of the world. Here we consider a setting whereby most agents' observations are also extremely noisy, hence only weakly correlated to the true state of the environment. Under these circumstances, learning an optimal policy becomes particularly challenging, even in the unrealistic case that an agent's policy can be made conditional upon all other agents\u2019 observations. To overcome these difficulties, we propose a multi-agent deep deterministic policy gradient algorithm enhanced by a communication medium (MADDPG-M), which implements a two-level, concurrent learning mechanism. An agent's policy depends on its own private observations as well as those explicitly shared by others through a communication medium. At any given point in time, an agent must decide whether its private observations are sufficiently informative to be shared with others. However, our environments provide no explicit feedback informing an agent whether a communication action is beneficial, rather the communication policies must also be learned through experience concurrently to the main policies. 
Our experimental results demonstrate that the algorithm performs well in six highly non-stationary environments of progressively higher complexity, and offers substantial performance gains compared to the baselines.", "keywords": "Reinforcement learning;multi-agent;hierarchical;noisy observation;partial observability;deep learning", "primary_area": "", "supplementary_material": "", "author": "Ozsel Kilinc;Giovanni Montana", "authorids": "ozsel.kilinc@warwick.ac.uk;g.montana@warwick.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkilinc2019multiagent,\ntitle={Multi-agent Deep Reinforcement Learning with Extremely Noisy Observations},\nauthor={Ozsel Kilinc and Giovanni Montana},\nyear={2019},\nurl={https://openreview.net/forum?id=HylSk205YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HylSk205YQ", "pdf_size": 0, "rating": "3;6;7", "confidence": "4;3;2", "wc_review": "245;650;229", "wc_reply_reviewers": "112;0;0", "wc_reply_authors": "1221;1182;95", "reply_reviewers": "2;0;0", "reply_authors": "3;2;1", "rating_avg": [ 5.333333333333333, 1.699673171197595 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 374.6666666666667, 194.79961213741902 ], "wc_reply_reviewers_avg": [ 37.333333333333336, 52.797306328595546 ], "wc_reply_authors_avg": [ 832.6666666666666, 521.8520437399432 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9607689228305228, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13703950905489738522&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "The Limitations of Adversarial Training and the Blind-Spot Attack", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/730", "id": "HylTBhA5tQ", "author_site": "Huan Zhang, Hongge Chen, Zhao Song, Duane S Boning, Inderjit Dhillon, Cho-Jui Hsieh", "tldr": "We show that even the strongest adversarial training methods cannot defend against adversarial examples crafted on slightly scaled and shifted test images.", "abstract": "The adversarial training procedure proposed by Madry et al. (2018) is one of the most effective methods to defend against adversarial examples in deep neural networks (DNNs). In our paper, we shed some light on the practicality and the hardness of adversarial training by showing that the effectiveness (robustness on test set) of adversarial training has a strong correlation with the distance between a test point and the manifold of training data embedded by the network. Test examples that are relatively far away from this manifold are more likely to be vulnerable to adversarial attacks. Consequently, an adversarial training based defense is susceptible to a new class of attacks, the \u201cblind-spot attack\u201d, where the input images reside in \u201cblind-spots\u201d (low density regions) of the empirical distribution of training data but are still on the ground-truth data manifold. For MNIST, we found that these blind-spots can be easily found by simply scaling and shifting image pixel values. 
Most importantly, for large datasets with high dimensional and complex data manifold (CIFAR, ImageNet, etc), the existence of blind-spots in adversarial training makes defending on any valid test examples difficult due to the curse of dimensionality and the scarcity of training data. Additionally, we find that blind-spots also exist on provable defenses including (Kolter & Wong, 2018) and (Sinha et al., 2018) because these trainable robustness certificates can only be practically optimized on a limited set of training data.", "keywords": "Adversarial Examples;Adversarial Training;Blind-Spot Attack", "primary_area": "", "supplementary_material": "", "author": "Huan Zhang*;Hongge Chen*;Zhao Song;Duane Boning;Inderjit S. Dhillon;Cho-Jui Hsieh", "authorids": "huan@huan-zhang.com;chenhg@mit.edu;zhaos@utexas.edu;boning@mtl.mit.edu;inderjit@cs.utexas.edu;chohsieh@cs.ucla.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nzhang2018the,\ntitle={The Limitations of Adversarial Training and the Blind-Spot Attack},\nauthor={Huan Zhang and Hongge Chen and Zhao Song and Duane Boning and inderjit dhillon and Cho-Jui Hsieh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HylTBhA5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;2;3", "wc_review": "365;185;379", "wc_reply_reviewers": "479;0;0", "wc_reply_authors": "1205;233;386", "reply_reviewers": "2;0;0", "reply_authors": "4;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 309.6666666666667, 88.33773573935181 ], "wc_reply_reviewers_avg": [ 159.66666666666666, 225.8027654589042 ], "wc_reply_authors_avg": [ 608.0, 426.738795986491 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 194, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6221614174421347650&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=HylTBhA5tQ", "pdf": "https://openreview.net/pdf?id=HylTBhA5tQ", "email": ";;;;;", "author_num": 6 }, { "title": "Efficiently testing local optimality and escaping saddles for ReLU networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/957", "id": "HylTXn0qYX", "author_site": "Chulhee Yun, Suvrit Sra, Ali Jadbabaie", "tldr": "A theoretical algorithm for testing local optimality and extracting descent directions at nondifferentiable points of empirical risks of one-hidden-layer ReLU networks.", "abstract": "We provide a theoretical algorithm for checking local optimality and escaping saddles at nondifferentiable points of empirical risks of two-layer ReLU networks. Our algorithm receives any parameter value and returns: local minimum, second-order stationary point, or a strict descent direction. The presence of M data points on the nondifferentiability of the ReLU divides the parameter space into at most 2^M regions, which makes analysis difficult. 
By exploiting polyhedral geometry, we reduce the total computation down to one convex quadratic program (QP) for each hidden node, O(M) (in)equality tests, and one (or a few) nonconvex QP. For the last QP, we show that our specific problem can be solved efficiently, in spite of nonconvexity. In the benign case, we solve one equality constrained QP, and we prove that projected gradient descent solves it exponentially fast. In the bad case, we have to solve a few more inequality constrained QPs, but we prove that the time complexity is exponential only in the number of inequality constraints. Our experiments show that either the benign case, or the bad case with very few inequality constraints, occurs, implying that our algorithm is efficient in most cases.", "keywords": "local optimality;second-order stationary point;escaping saddle points;nondifferentiability;ReLU;empirical risk", "primary_area": "", "supplementary_material": "", "author": "Chulhee Yun;Suvrit Sra;Ali Jadbabaie", "authorids": "chulheey@mit.edu;suvrit@mit.edu;jadbabai@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyun2018efficiently,\ntitle={Efficiently testing local optimality and escaping saddles for Re{LU} networks},\nauthor={Chulhee Yun and Suvrit Sra and Ali Jadbabaie},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HylTXn0qYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "3;6;6;8", "confidence": "4;3;2;3", "wc_review": "360;373;390;385", "wc_reply_reviewers": "103;0;0;63", "wc_reply_authors": "330;267;443;264", "reply_reviewers": "2;0;0;1", "reply_authors": "2;1;1;2", "rating_avg": [ 5.75, 1.7853571071357126 ], "confidence_avg": [ 3.0, 0.7071067811865476 ], "wc_review_avg": [ 377.0, 11.597413504743201 ], "wc_reply_reviewers_avg": [ 41.5, 43.84347157787577 ], "wc_reply_authors_avg": [ 326.0, 72.50862017718997 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5940885257860046, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17166372605101418764&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=HylTXn0qYX", "pdf": "https://openreview.net/pdf?id=HylTXn0qYX", "email": ";;", "author_num": 3 }, { "title": "ProxylessNAS: Direct Neural Architecture Search on Target Task and Hardware", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1029", "id": "HylVB3AqYm", "author_site": "Han Cai, Ligeng Zhu, Song Han", "tldr": "Proxy-less neural architecture search for directly learning architectures on large-scale target task (ImageNet) while reducing the cost to the same level of normal training.", "abstract": "Neural architecture search (NAS) has a great impact by automatically designing effective neural network architectures. However, the prohibitive computational demand of conventional NAS algorithms (e.g. 10^4 GPU hours) makes it difficult to directly search the architectures on large-scale tasks (e.g. ImageNet). 

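The benign case in the "Efficiently testing local optimality and escaping saddles for ReLU networks" abstract above reduces to a single equality-constrained QP solved by projected gradient descent. Below is a generic sketch of that subroutine for a convex QP, min 0.5 x^T Q x + c^T x subject to Ax = b; the particular Q, c, A, b that arise from a ReLU network are not reproduced here.

```python
import numpy as np

def projected_gd_qp(Q, c, A, b, steps=500):
    """Minimize 0.5 x^T Q x + c^T x subject to A x = b by projected gradient descent."""
    n = Q.shape[0]
    lr = 1.0 / np.linalg.norm(Q, 2)                 # step size from the spectral norm of Q
    AAt_inv = np.linalg.pinv(A @ A.T)
    def project(x):
        # Euclidean projection onto the affine set {x : Ax = b}.
        return x - A.T @ (AAt_inv @ (A @ x - b))
    x = project(np.zeros(n))
    for _ in range(steps):
        x = project(x - lr * (Q @ x + c))
    return x

# Toy instance: Q positive definite, one linear constraint.
rng = np.random.default_rng(0)
M = rng.normal(size=(4, 4)); Q = M @ M.T + np.eye(4)
c = rng.normal(size=4); A = np.ones((1, 4)); b = np.array([1.0])
x_star = projected_gd_qp(Q, c, A, b)
print(A @ x_star)   # ~[1.0]: the constraint is satisfied at the solution
```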
Differentiable NAS can reduce the cost of GPU hours via a continuous representation of network architecture but suffers from the high GPU memory consumption issue (grow linearly w.r.t. candidate set size). As a result, they need to utilize proxy tasks, such as training on a smaller dataset, or learning with only a few blocks, or training just for a few epochs. These architectures optimized on proxy tasks are not guaranteed to be optimal on the target task. In this paper, we present ProxylessNAS that can directly learn the architectures for large-scale target tasks and target hardware platforms. We address the high memory consumption issue of differentiable NAS and reduce the computational cost (GPU hours and GPU memory) to the same level of regular training while still allowing a large candidate set. Experiments on CIFAR-10 and ImageNet demonstrate the effectiveness of directness and specialization. On CIFAR-10, our model achieves 2.08% test error with only 5.7M parameters, better than the previous state-of-the-art architecture AmoebaNet-B, while using 6\u00d7 fewer parameters. On ImageNet, our model achieves 3.1% better top-1 accuracy than MobileNetV2, while being 1.2\u00d7 faster with measured GPU latency. We also apply ProxylessNAS to specialize neural architectures for hardware with direct hardware metrics (e.g. latency) and provide insights for efficient CNN architecture design.", "keywords": "Neural Architecture Search;Efficient Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Han Cai;Ligeng Zhu;Song Han", "authorids": "hancai@mit.edu;ligeng@mit.edu;songhan@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ncai2018proxylessnas,\ntitle={Proxyless{NAS}: Direct Neural Architecture Search on Target Task and Hardware},\nauthor={Han Cai and Ligeng Zhu and Song Han},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HylVB3AqYm},\n}", "github": "[![github](/images/github_icon.svg) MIT-HAN-LAB/ProxylessNAS](https://github.com/MIT-HAN-LAB/ProxylessNAS) + [![Papers with Code](/images/pwc_icon.svg) 22 community implementations](https://paperswithcode.com/paper/?openreview=HylVB3AqYm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "2;4;2", "wc_review": "114;494;215", "wc_reply_reviewers": "0;575;168", "wc_reply_authors": "333;1567;416", "reply_reviewers": "0;1;1", "reply_authors": "1;4;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 274.3333333333333, 160.70746370000643 ], "wc_reply_reviewers_avg": [ 247.66666666666666, 241.4074471832954 ], "wc_reply_authors_avg": [ 772.0, 563.1701933400477 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2375, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18033301425061747520&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=HylVB3AqYm", "pdf": "https://openreview.net/pdf?id=HylVB3AqYm", "email": ";;", "author_num": 3 }, { "id": "HylXHhA9Km", "title": "Statistical 
Characterization of Deep Neural Networks and their Sensitivity", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Despite their ubiquity, it remains an active area of research to fully understand deep neural networks (DNNs) and the reasons of their empirical success. We contribute to this effort by introducing a principled approach to statistically characterize DNNs and their sensitivity. By distinguishing between randomness from input data and from model parameters, we study how central and non-central moments of network activation and sensitivity evolve during propagation. Thereby, we provide novel statistical insights on the hypothesis space of input-output mappings encoded by different architectures. Our approach applies both to fully-connected and convolutional networks and incorporates most ingredients of modern DNNs: rectified linear unit (ReLU) activation, batch normalization, skip connections.", "keywords": "Statistics;Sensitivity;Exploding Gradient;Convolutional Neural Networks;Residual Neural Networks;Batch Normalization", "primary_area": "", "supplementary_material": "", "author": "Antoine Labatie", "authorids": "antoine.labatie@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=HylXHhA9Km", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 2, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LH0eAS-LTnkJ:scholar.google.com/&scioq=Statistical+Characterization+of+Deep+Neural+Networks+and+their+Sensitivity&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Hierarchical Reinforcement Learning via Advantage-Weighted Information Maximization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1109", "id": "Hyl_vjC5KQ", "author_site": "Takayuki Osa, Voot Tangkaratt, Masashi Sugiyama", "tldr": "This paper presents a hierarchical reinforcement learning framework based on deterministic option policies and mutual information maximization. ", "abstract": "Real-world tasks are often highly structured. Hierarchical reinforcement learning (HRL) has attracted research interest as an approach for leveraging the hierarchical structure of a given task in reinforcement learning (RL). However, identifying the hierarchical policy structure that enhances the performance of RL is not a trivial task. In this paper, we propose an HRL method that learns a latent variable of a hierarchical policy using mutual information maximization. Our approach can be interpreted as a way to learn a discrete and latent representation of the state-action space. To learn option policies that correspond to modes of the advantage function, we introduce advantage-weighted importance sampling. \nIn our HRL method, the gating policy learns to select option policies based on an option-value function, and these option policies are optimized based on the deterministic policy gradient method. 
This framework is derived by leveraging the analogy between a monolithic policy in standard RL and a hierarchical policy in HRL by using a deterministic option policy. Experimental results indicate that our HRL approach can learn a diversity of options and that it can enhance the performance of RL in continuous control tasks.", "keywords": "Hierarchical reinforcement learning;Representation learning;Continuous control", "primary_area": "", "supplementary_material": "", "author": "Takayuki Osa;Voot Tangkaratt;Masashi Sugiyama", "authorids": "osa@mfg.t.u-tokyo.ac.jp;voot.tangkaratt@riken.jp;sugi@k.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nosa2018hierarchical,\ntitle={Hierarchical Reinforcement Learning via Advantage-Weighted Information Maximization},\nauthor={Takayuki Osa and Voot Tangkaratt and Masashi Sugiyama},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyl_vjC5KQ},\n}", "github": "[![github](/images/github_icon.svg) TakaOsa/adInfoHRL](https://github.com/TakaOsa/adInfoHRL)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "wc_review": "336;497;614", "wc_reply_reviewers": "0;63;0", "wc_reply_authors": "321;236;180", "reply_reviewers": "0;1;0", "reply_authors": "1;2;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 482.3333333333333, 113.96588183409202 ], "wc_reply_reviewers_avg": [ 21.0, 29.698484809834994 ], "wc_reply_authors_avg": [ 245.66666666666666, 57.96742380184082 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8371143208721459013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Hyl_vjC5KQ", "pdf": "https://openreview.net/pdf?id=Hyl_vjC5KQ", "email": ";;", "author_num": 3 }, { "id": "HyldojC9t7", "title": "D2KE: From Distance to Kernel and Embedding via Random Features For Structured Inputs", "track": "main", "status": "Withdraw", "tldr": "From Distance to Kernel and Embedding via Random Features For Structured Inputs", "abstract": "We present a new methodology that constructs a family of \\emph{positive definite kernels} from any given dissimilarity measure on structured inputs whose elements are either real-valued time series or discrete structures such as strings, histograms, and graphs. \nOur approach, which we call D2KE (from Distance to Kernel and Embedding), draws from the literature of Random Features.\nHowever, instead of deriving random feature maps from a user-defined kernel to approximate kernel machines, we build a kernel from a random feature map, that we specify given the distance measure. \nWe further propose use of a finite number of random objects to produce a random feature embedding of each instance.\nWe provide a theoretical analysis showing that D2KE enjoys better generalizability than universal Nearest-Neighbor estimates. 
\nOn one hand, D2KE subsumes the widely-used \\emph{representative-set method} as a special case, and relates to the well-known \\emph{distance substitution kernel} in a limiting case. \nOn the other hand, D2KE generalizes existing \\emph{Random Features methods} applicable only to vector input representations to complex structured inputs of variable sizes. \nWe conduct classification experiments over such disparate domains as time series, strings, and histograms (for texts and images), for which our proposed framework compares favorably to existing distance-based learning methods in terms of both testing accuracy and computational time.", "keywords": "Distance Kernel;Embeddings;Random Features;Structured Inputs", "primary_area": "", "supplementary_material": "", "author": "Lingfei Wu;Ian E.H. Yen;Fangli Xu;Pradeep Ravikumar;Michael J. Witbrock", "authorids": "lwu@email.wm.edu;eyan@cs.cmu.edu;fxu02@email.wm.edu;pradeepr@cs.cmu.edu;witbrock@us.ibm.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=HyldojC9t7", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "wc_review": "484;66;1282", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 610.6666666666666, 504.4451297107436 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:h4Fyo1TCZnwJ:scholar.google.com/&scioq=D2KE:+From+Distance+to+Kernel+and+Embedding+via+Random+Features+For+Structured+Inputs&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HyleYiC9FX", "title": "Text Embeddings for Retrieval from a Large Knowledge Base", "track": "main", "status": "Reject", "tldr": "The new attempt for creating semantically meaningful text embeddings via improved language modeling and utilizing an extra knowledge base", "abstract": "Text embedding representing natural language documents in a semantic vector space can be used for document retrieval using nearest neighbor lookup. In order to study the feasibility of neural models specialized for retrieval in a semantically meaningful way, we suggest the use of the Stanford Question Answering Dataset (SQuAD) in an open-domain question answering context, where the first task is to find paragraphs useful for answering a given question. First, we compare the quality of various text-embedding methods on the performance of retrieval and give an extensive empirical comparison on the performance of various non-augmented base embedding with, and without IDF weighting. Our main results are that by training deep residual neural models specifically for retrieval purposes can yield significant gains when it is used to augment existing embeddings. We also establish that deeper models are superior to this task. 
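The D2KE abstract above builds a random feature embedding from a dissimilarity measure and a finite set of random objects. A heavily simplified sketch of that construction, using Euclidean distance on vectors and an exponential transform exp(-gamma * d) as an assumed (not necessarily the paper's) choice of feature function and of random objects:

```python
import numpy as np

def d2ke_style_features(X, random_objects, gamma=1.0, dist=None):
    """Embed each row of X as [f(d(x, w_1)), ..., f(d(x, w_R))] / sqrt(R) with f(t) = exp(-gamma * t)."""
    if dist is None:
        dist = lambda a, b: np.linalg.norm(a - b)   # any dissimilarity on structured inputs could go here
    R = len(random_objects)
    feats = np.array([[np.exp(-gamma * dist(x, w)) for w in random_objects] for x in X])
    return feats / np.sqrt(R)

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 5))
omegas = X[rng.choice(len(X), size=16, replace=False)]   # random objects drawn from the data
Phi = d2ke_style_features(X, omegas)
K_approx = Phi @ Phi.T   # positive semi-definite kernel matrix by construction
```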
Augmenting the best baseline embeddings with our learned neural approach improves the system's top-1 recall by 14% on the question side and by 8% on the paragraph side.", "keywords": "Text Embeddings;Document Ranking;Improving Retrieval;Question-Answering;Learning to Rank", "primary_area": "", "supplementary_material": "", "author": "Tolgahan Cakaloglu;Christian Szegedy;Xiaowei Xu", "authorids": "txcakaloglu@ualr.edu;szegedy@google.com;xwxu@ualr.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncakaloglu2019text,\ntitle={Text Embeddings for Retrieval from a Large Knowledge Base},\nauthor={Tolgahan Cakaloglu and Christian Szegedy and Xiaowei Xu},\nyear={2019},\nurl={https://openreview.net/forum?id=HyleYiC9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyleYiC9FX", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;4;4", "wc_review": "412;149;304", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 288.3333333333333, 107.93928334433618 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3558692297817822966&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 5 }, { "id": "HyllasActm", "title": "End-to-End Learning of Video Compression Using Spatio-Temporal Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning (DL) is having a revolutionary impact in image processing, with DL-based approaches now holding the state of the art in many tasks, including image compression. However, video compression has so far resisted the DL revolution, with the very few proposed approaches being based on complex and impractical architectures with multiple networks. This paper proposes what we believe is the first approach to end-to-end learning of a single network for video compression. We tackle the problem in a novel way, avoiding explicit motion estimation/prediction, by formalizing it as the rate-distortion optimization of a single spatio-temporal autoencoder; i.e., we jointly learn a latent-space projection transform and a synthesis transform for low bitrate video compression. The quantizer uses a rounding scheme, which is relaxed during training, and an entropy estimation technique to enforce an information bottleneck, inspired by recent advances in image compression. We compare the obtained video compression networks with standard widely-used codecs, showing better performance than the MPEG-4 standard, being competitive with H.264/AVC for low bitrates. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jorge Pessoa;Helena Aidos;Pedro Tom\u00e1s;M\u00e1rio A. T. 

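The retrieval setup in the "Text Embeddings for Retrieval from a Large Knowledge Base" abstract scores paragraphs against a question by nearest-neighbor lookup over (optionally IDF-weighted) text embeddings. A toy, self-contained sketch of that lookup; the corpus, word vectors and IDF values below are placeholders rather than the paper's trained models:

```python
import numpy as np

docs = ["the cat sat on the mat", "dogs chase cats in the park", "stock markets fell sharply today"]
vocab = sorted({w for d in docs for w in d.split()})
rng = np.random.default_rng(2)
word_vec = {w: rng.normal(size=16) for w in vocab}                              # toy word embeddings
idf = {w: np.log(len(docs) / sum(w in d.split() for d in docs)) for w in vocab} # toy IDF weights

def embed(text):
    """IDF-weighted average of word vectors, L2-normalized for cosine similarity."""
    words = [w for w in text.split() if w in word_vec]
    v = sum(idf[w] * word_vec[w] for w in words)
    return v / (np.linalg.norm(v) + 1e-8)

doc_matrix = np.stack([embed(d) for d in docs])
query = "cats and dogs"
scores = doc_matrix @ embed(query)      # cosine similarity = dot product of unit vectors
print(docs[int(np.argmax(scores))])     # top-1 retrieved paragraph
```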
Figueiredo", "authorids": "jorge.pessoa@tecnico.ulisboa.pt;haidos@lx.it.pt;pedro.tomas@inesc-id.pt;mario.figueiredo@lx.it.pt", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\npessoa2019endtoend,\ntitle={End-to-End Learning of Video Compression Using Spatio-Temporal Autoencoders},\nauthor={Jorge Pessoa and Helena Aidos and Pedro Tom\u00e1s and M\u00e1rio A. T. Figueiredo},\nyear={2019},\nurl={https://openreview.net/forum?id=HyllasActm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=HyllasActm", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;3", "wc_review": "595;732;205", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "81;19;55", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 2.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 510.6666666666667, 223.25819631588496 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 51.666666666666664, 25.42090128658349 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3692656545439083193&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "Hylnis0qKX", "title": "Task-GAN for Improved GAN based Image Restoration", "track": "main", "status": "Reject", "tldr": "Couple the GAN based image restoration framework with another task-specific network to generate realistic image while preserving task-specific features.", "abstract": "Deep Learning (DL) algorithms based on Generative Adversarial Network (GAN) have demonstrated great potentials in computer vision tasks such as image restoration. Despite the rapid development of image restoration algorithms using DL and GANs, image restoration for specific scenarios, such as medical image enhancement and super-resolved identity recognition, are still facing challenges. How to ensure visually realistic restoration while avoiding hallucination or mode- collapse? How to make sure the visually plausible results do not contain hallucinated features jeopardizing downstream tasks such as pathology identification and subject identification?\nHere we propose to resolve these challenges by coupling the GAN based image restoration framework with another task-specific network. With medical imaging restoration as an example, the proposed model conducts additional pathology recognition/classification task to ensure the preservation of detailed structures that are important to this task. Validated on multiple medical datasets, we demonstrate the proposed method leads to improved deep learning based image restoration while preserving the detailed structure and diagnostic features. 
Additionally, the trained task network show potentials to achieve super-human level performance in identifying pathology and diagnosis.\nFurther validation on super-resolved identity recognition tasks also show that the proposed method can be generalized for diverse image restoration tasks.", "keywords": "Task-GAN: Improving Generative Adversarial Network for Image Restoration", "primary_area": "", "supplementary_material": "", "author": "Jiahong Ouyang;Guanhua Wang;Enhao Gong;Kevin Chen;John Pauly and Greg Zaharchuk", "authorids": "jiahongo@stanford.edu;guanhua@stanford.edu;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nouyang2019taskgan,\ntitle={Task-{GAN} for Improved {GAN} based Image Restoration},\nauthor={Jiahong Ouyang and Guanhua Wang and Enhao Gong and Kevin Chen and John Pauly and Greg Zaharchuk},\nyear={2019},\nurl={https://openreview.net/forum?id=Hylnis0qKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Hylnis0qKX", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;5;4", "wc_review": "331;121;141", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 197.66666666666666, 94.63379711052261 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15367145432907021489&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Hyls7h05FQ", "title": "A Differentiable Self-disambiguated Sense Embedding Model via Scaled Gumbel Softmax", "track": "main", "status": "Reject", "tldr": "Disambiguate and embed word senses with a differentiable hard-attention model using Scaled Gumbel Softmax", "abstract": "We present a differentiable multi-prototype word representation model that disentangles senses of polysemous words and produces meaningful sense-specific embeddings without external resources. It jointly learns how to disambiguate senses given local context and how to represent senses using hard attention. Unlike previous multi-prototype models, our model approximates discrete sense selection in a differentiable manner via a modified Gumbel softmax. We also propose a novel human evaluation task that quantitatively measures (1) how meaningful the learned sense groups are to humans and (2) how well the model is able to disambiguate senses given a context sentence. 
Our model outperforms competing approaches on both human evaluations and multiple word similarity tasks.", "keywords": "unsupervised representation learning;sense embedding;word sense disambiguation;human evaluation", "primary_area": "", "supplementary_material": "", "author": "Fenfei Guo;Mohit Iyyer;Leah Findlater;Jordan Boyd-Graber", "authorids": "fenfeigo@cs.umd.edu;miyyer@cs.umass.edu;leahkf@uw.edu;jbg@umiacs.umd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nguo2019a,\ntitle={A Differentiable Self-disambiguated Sense Embedding Model via Scaled Gumbel Softmax},\nauthor={Fenfei Guo and Mohit Iyyer and Leah Findlater and Jordan Boyd-Graber},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyls7h05FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hyls7h05FQ", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;4;3", "wc_review": "381;353;499", "wc_reply_reviewers": "91;0;0", "wc_reply_authors": "734;247;323", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 411.0, 63.26663154196426 ], "wc_reply_reviewers_avg": [ 30.333333333333332, 42.897811391983886 ], "wc_reply_authors_avg": [ 434.6666666666667, 213.9226236022943 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15424032847615491679&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HylsgnCcFQ", "title": "Dynamic Graph Representation Learning via Self-Attention Networks", "track": "main", "status": "Reject", "tldr": "A novel neural architecture named DySAT to learn node representations on dynamic graphs by employing self-attention along two dimensions: structural neighborhood and temporal dynamics, achieves state-of-the-art results in dynamic link prediction.", "abstract": "Learning latent representations of nodes in graphs is an important and ubiquitous task with widespread applications such as link prediction, node classification, and graph visualization. Previous methods on graph representation learning mainly focus on static graphs, however, many real-world graphs are dynamic and evolve over time. In this paper, we present Dynamic Self-Attention Network (DySAT), a novel neural architecture that operates on dynamic graphs and learns node representations that capture both structural properties and temporal evolutionary patterns. Specifically, DySAT computes node representations by jointly employing self-attention layers along two dimensions: structural neighborhood and temporal dynamics. We conduct link prediction experiments on two classes of graphs: communication networks and bipartite rating networks. 
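The sense-selection step in the Scaled Gumbel Softmax abstract above relaxes a discrete choice of sense via Gumbel-softmax sampling. A bare-bones sketch of that sampling step; the temperature and the scale applied to the Gumbel noise are treated as free hyperparameters here and are not claimed to match the paper's exact scheme:

```python
import numpy as np

def scaled_gumbel_softmax(logits, temperature=0.5, noise_scale=1.0, rng=None):
    """Relaxed sample from a categorical distribution over senses.
    Returns soft weights that approach a one-hot vector as temperature -> 0."""
    rng = rng or np.random.default_rng()
    u = rng.uniform(1e-10, 1.0, size=logits.shape)
    gumbel = -np.log(-np.log(u))                     # standard Gumbel noise
    y = (logits + noise_scale * gumbel) / temperature
    y = y - y.max()                                  # numerical stability
    return np.exp(y) / np.exp(y).sum()

sense_logits = np.array([2.0, 0.5, -1.0])            # e.g. 3 candidate senses for a word in context
weights = scaled_gumbel_softmax(sense_logits, temperature=0.3, noise_scale=0.5)
print(weights, weights.argmax())                     # near one-hot selection of a sense
```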
Our experimental results show that DySAT has a significant performance gain over several different state-of-the-art graph embedding baselines.", "keywords": "Graph Representation Learning;Dynamic Graphs;Attention;Self-Attention;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Aravind Sankar;Yanhong Wu;Liang Gou;Wei Zhang;Hao Yang", "authorids": "asankar3@illinois.edu;yanwu@visa.com;ligou@visa.com;wzhan@visa.com;haoyang@visa.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nsankar2019dynamic,\ntitle={Dynamic Graph Representation Learning via Self-Attention Networks},\nauthor={Aravind Sankar and Yanhong Wu and Liang Gou and Wei Zhang and Hao Yang},\nyear={2019},\nurl={https://openreview.net/forum?id=HylsgnCcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HylsgnCcFQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;4", "wc_review": "126;341;264", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "984;1029;816", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 243.66666666666666, 88.94317786592116 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 943.0, 91.6624241442479 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 151, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6256193993390909611&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "Hylyui09tm", "title": "EMI: Exploration with Mutual Information Maximizing State and Action Embeddings", "track": "main", "status": "Reject", "tldr": "", "abstract": "Policy optimization struggles when the reward feedback signal is very sparse and essentially becomes a random search algorithm until the agent stumbles upon a rewarding or the goal state. Recent works utilize intrinsic motivation to guide the exploration via generative models, predictive forward models, or more ad-hoc measures of surprise. We propose EMI, which is an exploration method that constructs embedding representation of states and actions that does not rely on generative decoding of the full observation but extracts predictive signals that can be used to guide exploration based on forward prediction in the representation space. 
Our experiments show the state of the art performance on challenging locomotion task with continuous control and on image-based exploration tasks with discrete actions on Atari.", "keywords": "reinforcement learning;exploration;representation learning", "primary_area": "", "supplementary_material": "", "author": "Hyoungseok Kim;Jaekyeom Kim;Yeonwoo Jeong;Sergey Levine;Hyun Oh Song", "authorids": "harry2636@mllab.snu.ac.kr;jaekyeom@mllab.snu.ac.kr;yeonwoo@mllab.snu.ac.kr;svlevine@eecs.berkeley.edu;hyunoh@snu.ac.kr", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkim2019emi,\ntitle={{EMI}: Exploration with Mutual Information Maximizing State and Action Embeddings},\nauthor={Hyoungseok Kim and Jaekyeom Kim and Yeonwoo Jeong and Sergey Levine and Hyun Oh Song},\nyear={2019},\nurl={https://openreview.net/forum?id=Hylyui09tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hylyui09tm", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;3;4", "wc_review": "561;360;246", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "711;1072;317", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 389.0, 130.22288585344742 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 700.0, 308.3255854882411 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12024396790383510496&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "GENERATING HIGH FIDELITY IMAGES WITH SUBSCALE PIXEL NETWORKS AND MULTIDIMENSIONAL UPSCALING", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1064", "id": "HylzTiC5Km", "author_site": "Jacob Menick, Nal Kalchbrenner", "tldr": "We show that autoregressive models can generate high fidelity images. ", "abstract": "The unconditional generation of high fidelity images is a longstanding benchmark\nfor testing the performance of image decoders. Autoregressive image models\nhave been able to generate small images unconditionally, but the extension of\nthese methods to large images where fidelity can be more readily assessed has\nremained an open problem. Among the major challenges are the capacity to encode\nthe vast previous context and the sheer difficulty of learning a distribution that\npreserves both global semantic coherence and exactness of detail. To address the\nformer challenge, we propose the Subscale Pixel Network (SPN), a conditional\ndecoder architecture that generates an image as a sequence of image slices of equal\nsize. The SPN compactly captures image-wide spatial dependencies and requires a\nfraction of the memory and the computation. To address the latter challenge, we\npropose to use multidimensional upscaling to grow an image in both size and depth\nvia intermediate stages corresponding to distinct SPNs. We evaluate SPNs on the\nunconditional generation of CelebAHQ of size 256 and of ImageNet from size 32\nto 128. 
We achieve state-of-the-art likelihood results in multiple settings, set up\nnew benchmark results in previously unexplored settings and are able to generate\nvery high fidelity large scale samples on the basis of both datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jacob Menick;Nal Kalchbrenner", "authorids": "jmenick@google.com;nalk@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmenick2018generating,\ntitle={{GENERATING} {HIGH} {FIDELITY} {IMAGES} {WITH} {SUBSCALE} {PIXEL} {NETWORKS} {AND} {MULTIDIMENSIONAL} {UPSCALING}},\nauthor={Jacob Menick and Nal Kalchbrenner},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HylzTiC5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;9;10", "confidence": "3;3;5", "wc_review": "88;333;783", "wc_reply_reviewers": "0;0;91", "wc_reply_authors": "53;432;1346", "reply_reviewers": "0;0;2", "reply_authors": "1;1;3", "rating_avg": [ 8.666666666666666, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 401.3333333333333, 287.8174575818654 ], "wc_reply_reviewers_avg": [ 30.333333333333332, 42.897811391983886 ], "wc_reply_authors_avg": [ 610.3333333333334, 542.7180565347802 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 163, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6361435766800029488&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HylzTiC5Km", "pdf": "https://openreview.net/pdf?id=HylzTiC5Km", "email": ";", "author_num": 2 }, { "title": "Generalizable Adversarial Training via Spectral Normalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/958", "id": "Hyx4knR9Ym", "author_site": "Farzan Farnia, Jesse Zhang, David Tse", "tldr": "", "abstract": "Deep neural networks (DNNs) have set benchmarks on a wide array of supervised learning tasks. Trained DNNs, however, often lack robustness to minor adversarial perturbations to the input, which undermines their true practicality. Recent works have increased the robustness of DNNs by fitting networks using adversarially-perturbed training samples, but the improved performance can still be far below the performance seen in non-adversarial settings. A significant portion of this gap can be attributed to the decrease in generalization performance due to adversarial training. In this work, we extend the notion of margin loss to adversarial settings and bound the generalization error for DNNs trained under several well-known gradient-based attack schemes, motivating an effective regularization scheme based on spectral normalization of the DNN's weight matrices. We also provide a computationally-efficient method for normalizing the spectral norm of convolutional layers with arbitrary stride and padding schemes in deep convolutional networks. 
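The Subscale Pixel Network abstract above decodes an image as a sequence of equal-size slices. One standard way to obtain such slices is interleaved ("subscale") subsampling, sketched below; this illustrates the slicing idea only and is not claimed to match the paper's exact ordering or conditioning scheme:

```python
import numpy as np

def subscale_slices(image, s=2):
    """Split an HxWxC image into s*s interleaved slices of size (H//s, W//s, C)."""
    return [image[i::s, j::s] for i in range(s) for j in range(s)]

def reassemble(slices, s=2):
    """Inverse of subscale_slices: interleave the slices back into the full image."""
    h, w = slices[0].shape[:2]
    out = np.empty((h * s, w * s) + slices[0].shape[2:], dtype=slices[0].dtype)
    for k, (i, j) in enumerate((i, j) for i in range(s) for j in range(s)):
        out[i::s, j::s] = slices[k]
    return out

img = np.random.rand(256, 256, 3)
parts = subscale_slices(img, s=2)                 # 4 slices of shape (128, 128, 3)
assert np.allclose(reassemble(parts, s=2), img)   # lossless decomposition
```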
We evaluate the power of spectral normalization extensively on combinations of datasets, network architectures, and adversarial training schemes.", "keywords": "Adversarial attacks;adversarial training;spectral normalization;generalization guarantee", "primary_area": "", "supplementary_material": "", "author": "Farzan Farnia;Jesse Zhang;David Tse", "authorids": "farnia@stanford.edu;jessez@stanford.edu;dntse@stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nfarnia2018generalizable,\ntitle={Generalizable Adversarial Training via Spectral Normalization},\nauthor={Farzan Farnia and Jesse Zhang and David Tse},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyx4knR9Ym},\n}", "github": "[![github](/images/github_icon.svg) jessemzhang/dl_spectral_normalization](https://github.com/jessemzhang/dl_spectral_normalization)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;5", "wc_review": "184;257;280", "wc_reply_reviewers": "0;10;62", "wc_reply_authors": "245;646;290", "reply_reviewers": "0;1;1", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 240.33333333333334, 40.92540639857946 ], "wc_reply_reviewers_avg": [ 24.0, 27.17842281418601 ], "wc_reply_authors_avg": [ 393.6666666666667, 179.3698847509123 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 183, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16959420457208400665&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Hyx4knR9Ym", "pdf": "https://openreview.net/pdf?id=Hyx4knR9Ym", "email": ";;", "author_num": 3 }, { "title": "Adversarial Domain Adaptation for Stable Brain-Machine Interfaces", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/686", "id": "Hyx6Bi0qYm", "author_site": "Ali Farshchian, Juan \u00c1lvaro Gallego, Joseph Paul Cohen, Yoshua Bengio, Lee E Miller, Sara A Solla", "tldr": "We implement an adversarial domain adaptation network to stabilize a fixed Brain-Machine Interface against gradual changes in the recorded neural signals.", "abstract": "Brain-Machine Interfaces (BMIs) have recently emerged as a clinically viable option\nto restore voluntary movements after paralysis. These devices are based on the\nability to extract information about movement intent from neural signals recorded\nusing multi-electrode arrays chronically implanted in the motor cortices of the\nbrain. However, the inherent loss and turnover of recorded neurons requires repeated\nrecalibrations of the interface, which can potentially alter the day-to-day\nuser experience. The resulting need for continued user adaptation interferes with\nthe natural, subconscious use of the BMI. Here, we introduce a new computational\napproach that decodes movement intent from a low-dimensional latent representation\nof the neural data. We implement various domain adaptation methods\nto stabilize the interface over significantly long times. 
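The regularization scheme in the "Generalizable Adversarial Training via Spectral Normalization" abstract normalizes weight matrices by their spectral norm. The standard power-iteration estimate for a dense layer can be sketched as follows; the paper's efficient handling of strided and padded convolutional layers is not reproduced here:

```python
import numpy as np

def spectral_norm(W, n_iter=20, rng=None):
    """Estimate the largest singular value of W by power iteration."""
    rng = rng or np.random.default_rng(0)
    u = rng.normal(size=W.shape[0])
    for _ in range(n_iter):
        v = W.T @ u; v /= np.linalg.norm(v) + 1e-12
        u = W @ v;   u /= np.linalg.norm(u) + 1e-12
    return float(u @ W @ v)

W = np.random.default_rng(3).normal(size=(64, 128))
sigma = spectral_norm(W)
W_sn = W / sigma    # spectrally normalized weights: largest singular value ~ 1
print(sigma, np.linalg.svd(W_sn, compute_uv=False)[0])   # second value is ~1.0
```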
This includes Canonical\nCorrelation Analysis used to align the latent variables across days; this method\nrequires prior point-to-point correspondence of the time series across domains.\nAlternatively, we match the empirical probability distributions of the latent variables\nacross days through the minimization of their Kullback-Leibler divergence.\nThese two methods provide a significant and comparable improvement in the performance\nof the interface. However, implementation of an Adversarial Domain\nAdaptation Network trained to match the empirical probability distribution of the\nresiduals of the reconstructed neural signals outperforms the two methods based\non latent variables, while requiring remarkably few data points to solve the domain\nadaptation problem.", "keywords": "Brain-Machine Interfaces;Domain Adaptation;Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Ali Farshchian;Juan A. Gallego;Joseph P. Cohen;Yoshua Bengio;Lee E. Miller;Sara A. Solla", "authorids": "a-farshchiansadegh@northwestern.edu;juan.gallego@northwestern.edu;joseph@josephpcohen.com;yoshua.bengio@umontreal.ca;lm@northwestern.edu;solla@northwestern.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nfarshchian2018adversarial,\ntitle={{ADVERSARIAL} {DOMAIN} {ADAPTATION} {FOR} {STABLE} {BRAIN}-{MACHINE} {INTERFACES}},\nauthor={Ali Farshchian and Juan A. Gallego and Joseph P. Cohen and Yoshua Bengio and Lee E. Miller and Sara A. Solla},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyx6Bi0qYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;7;9", "confidence": "3;5;4", "wc_review": "894;239;254", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "2835;690;221", "reply_reviewers": "0;0;0", "reply_authors": "5;1;1", "rating_avg": [ 7.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 462.3333333333333, 305.2958492275248 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1248.6666666666667, 1137.9309684198286 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1197289951589381792&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=Hyx6Bi0qYm", "pdf": "https://openreview.net/pdf?id=Hyx6Bi0qYm", "email": ";;;;;", "author_num": 6 }, { "title": "Deep Online Learning Via Meta-Learning: Continual Adaptation for Model-Based RL", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1078", "id": "HyxAfnA5tm", "author_site": "Anusha Nagabandi, Chelsea Finn, Sergey Levine", "tldr": "", "abstract": "Humans and animals can learn complex predictive models that allow them to accurately and reliably reason about real-world phenomena, and they can adapt such models extremely quickly in the face of unexpected changes. Deep neural network models allow us to represent very complex functions, but lack this capacity for rapid online adaptation. 
The goal in this paper is to develop a method for continual online learning from an incoming stream of data, using deep neural network models. We formulate an online learning procedure that uses stochastic gradient descent to update model parameters, and an expectation maximization algorithm with a Chinese restaurant process prior to develop and maintain a mixture of models to handle non-stationary task distributions. This allows for all models to be adapted as necessary, with new models instantiated for task changes and old models recalled when previously seen tasks are encountered again. Furthermore, we observe that meta-learning can be used to meta-train a model such that this direct online adaptation with SGD is effective, which is otherwise not the case for large function approximators. We apply our method to model-based reinforcement learning, where adapting the predictive model is critical for control; we demonstrate that our online learning via meta-learning algorithm outperforms alternative prior methods, and enables effective continuous adaptation in non-stationary task distributions such as varying terrains, motor failures, and unexpected disturbances.", "keywords": "meta-learning;model-based;reinforcement learning;online learning;adaptation", "primary_area": "", "supplementary_material": "", "author": "Anusha Nagabandi;Chelsea Finn;Sergey Levine", "authorids": "nagaban2@berkeley.edu;cbfinn@eecs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nnagabandi2018deep,\ntitle={Deep Online Learning Via Meta-Learning: Continual Adaptation for Model-Based {RL}},\nauthor={Anusha Nagabandi and Chelsea Finn and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxAfnA5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;3;3", "wc_review": "177;174;104", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "151;70;199", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 151.66666666666666, 33.7276675083759 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 140.0, 53.23532661682466 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 236, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11074585999071693969&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HyxAfnA5tm", "pdf": "https://openreview.net/pdf?id=HyxAfnA5tm", "email": ";;", "author_num": 3 }, { "id": "HyxBpoR5tm", "title": "Adversarially Robust Training through Structured Gradient Regularization", "track": "main", "status": "Reject", "tldr": "We propose a novel data-dependent structured gradient regularizer to increase the robustness of neural networks against adversarial perturbations.", "abstract": "We propose a novel data-dependent structured gradient regularizer to increase the robustness of neural networks vis-a-vis adversarial perturbations. 
Our regularizer can be derived as a controlled approximation from first principles, leveraging the fundamental link between training with noise and regularization. It adds very little computational overhead during learning and is simple to implement generically in standard deep learning frameworks. Our experiments provide strong evidence that structured gradient regularization can act as an effective first line of defense against attacks based on long-range correlated signal corruptions.", "keywords": "Adversarial Training;Gradient Regularization;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Kevin Roth;Aurelien Lucchi;Sebastian Nowozin;Thomas Hofmann", "authorids": "kevin.roth@inf.ethz.ch;aurelien.lucchi@inf.ethz.ch;sebastian.nowozin@microsoft.com;thomas.hofmann@inf.ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nroth2019adversarially,\ntitle={Adversarially Robust Training through Structured Gradient Regularization},\nauthor={Kevin Roth and Aurelien Lucchi and Sebastian Nowozin and Thomas Hofmann},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxBpoR5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyxBpoR5tm", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "wc_review": "710;1078;861", "wc_reply_reviewers": "0;987;508", "wc_reply_authors": "931;957;1031", "reply_reviewers": "0;2;1", "reply_authors": "2;2;2", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 883.0, 151.03862640618348 ], "wc_reply_reviewers_avg": [ 498.3333333333333, 402.9990350140087 ], "wc_reply_authors_avg": [ 973.0, 42.36350630751268 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3877499078861787958&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Deep Anomaly Detection with Outlier Exposure", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/772", "id": "HyxCxhRcY7", "author_site": "Dan Hendrycks, Mantas Mazeika, Thomas Dietterich", "tldr": "OE teaches anomaly detectors to learn heuristics for detecting unseen anomalies; experiments are in classification, density estimation, and calibration in NLP and vision settings; we do not tune on test distribution samples, unlike previous work", "abstract": "It is important to detect anomalous inputs when deploying machine learning systems. The use of larger and more complex inputs in deep learning magnifies the difficulty of distinguishing between anomalous and in-distribution examples. At the same time, diverse image and text data are available in enormous quantities. We propose leveraging these data to improve deep anomaly detection by training anomaly detectors against an auxiliary dataset of outliers, an approach we call Outlier Exposure (OE). This enables anomaly detectors to generalize and detect unseen anomalies. In extensive experiments on natural language processing and small- and large-scale vision tasks, we find that Outlier Exposure significantly improves detection performance. 
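To make the gradient-regularization idea in the "Adversarially Robust Training through Structured Gradient Regularization" abstract concrete, here is a tiny logistic-regression example with an input-gradient penalty, where the input gradient is available in closed form. The matrix Sigma below is a purely illustrative stand-in for the data-dependent structure the paper uses; with Sigma = I this reduces to a plain squared-gradient penalty.

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def loss_with_grad_penalty(w, X, y, Sigma, lam=0.1):
    """Logistic loss plus a penalty on the input gradient g = d loss / d x.
    For the logistic loss, g_i = -y_i * sigmoid(-y_i * w.x_i) * w (closed form)."""
    margins = y * (X @ w)
    data_loss = np.mean(np.log1p(np.exp(-margins)))
    coeff = -y * sigmoid(-margins)                   # shape (n,)
    G = coeff[:, None] * w[None, :]                  # per-example input gradients
    penalty = np.mean(np.einsum('ni,ij,nj->n', G, Sigma, G))
    return data_loss + lam * penalty

rng = np.random.default_rng(4)
X = rng.normal(size=(200, 10)); y = rng.choice([-1.0, 1.0], size=200)
w = rng.normal(size=10)
Sigma = np.eye(10)                                   # identity recovers a plain ||g||^2 penalty
print(loss_with_grad_penalty(w, X, y, Sigma))
```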
We also observe that cutting-edge generative models trained on CIFAR-10 may assign higher likelihoods to SVHN images than to CIFAR-10 images; we use OE to mitigate this issue. We also analyze the flexibility and robustness of Outlier Exposure, and identify characteristics of the auxiliary dataset that improve performance.", "keywords": "confidence;uncertainty;anomaly;robustness", "primary_area": "", "supplementary_material": "", "author": "Dan Hendrycks;Mantas Mazeika;Thomas Dietterich", "authorids": "hendrycks@berkeley.edu;mantas@ttic.edu;tgd@oregonstate.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhendrycks2018deep,\ntitle={Deep Anomaly Detection with Outlier Exposure},\nauthor={Dan Hendrycks and Mantas Mazeika and Thomas Dietterich},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxCxhRcY7},\n}", "github": "[![github](/images/github_icon.svg) hendrycks/outlier-exposure](https://github.com/hendrycks/outlier-exposure) + [![Papers with Code](/images/pwc_icon.svg) 8 community implementations](https://paperswithcode.com/paper/?openreview=HyxCxhRcY7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;5;4", "wc_review": "488;488;216", "wc_reply_reviewers": "164;0;0", "wc_reply_authors": "538;534;166", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 397.3333333333333, 128.2220296551606 ], "wc_reply_reviewers_avg": [ 54.666666666666664, 77.3103414097292 ], "wc_reply_authors_avg": [ 412.6666666666667, 174.42731692280566 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1914, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13915279318347653817&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HyxCxhRcY7", "pdf": "https://openreview.net/pdf?id=HyxCxhRcY7", "email": ";;", "author_num": 3 }, { "title": "Contingency-Aware Exploration in Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/695", "id": "HyxGB2AcY7", "author_site": "Jongwook Choi, Yijie Guo, Marcin Moczulski, Junhyuk Oh, Neal Wu, Mohammad Norouzi, Honglak Lee", "tldr": "We investigate contingency-awareness and controllable aspects in exploration and achieve state-of-the-art performance on Montezuma's Revenge without expert demonstrations.", "abstract": "This paper investigates whether learning contingency-awareness and controllable aspects of an environment can lead to better exploration in reinforcement learning. To investigate this question, we consider an instantiation of this hypothesis evaluated on the Arcade Learning Element (ALE). In this study, we develop an attentive dynamics model (ADM) that discovers controllable elements of the observations, which are often associated with the location of the character in Atari games. The ADM is trained in a self-supervised fashion to predict the actions taken by the agent. 
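For classification, the Outlier Exposure objective described above is commonly written as the usual cross-entropy on in-distribution data plus a term that pushes the model's predictive distribution on the auxiliary outlier batch toward uniform. A numpy sketch of that combined loss, with random logits standing in for a network's outputs:

```python
import numpy as np

def log_softmax(z):
    z = z - z.max(axis=1, keepdims=True)
    return z - np.log(np.exp(z).sum(axis=1, keepdims=True))

def outlier_exposure_loss(logits_in, labels_in, logits_out, lam=0.5):
    """Cross-entropy on in-distribution data + lam * cross-entropy between the
    uniform distribution and the softmax on auxiliary outlier data."""
    ls_in = log_softmax(logits_in)
    ce_in = -np.mean(ls_in[np.arange(len(labels_in)), labels_in])
    ls_out = log_softmax(logits_out)
    ce_uniform = -np.mean(ls_out.mean(axis=1))   # -(1/K) sum_k log p_k, averaged over the batch
    return ce_in + lam * ce_uniform

rng = np.random.default_rng(5)
K = 10
logits_in = rng.normal(size=(32, K)); labels_in = rng.integers(0, K, size=32)
logits_out = rng.normal(size=(32, K))            # batch drawn from the auxiliary outlier set
print(outlier_exposure_loss(logits_in, labels_in, logits_out))
```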
The learned contingency information is used as a part of the state representation for exploration purposes. We demonstrate that combining actor-critic algorithm with count-based exploration using our representation achieves impressive results on a set of notoriously challenging Atari games due to sparse rewards. For example, we report a state-of-the-art score of >11,000 points on Montezuma's Revenge without using expert demonstrations, explicit high-level information (e.g., RAM states), or supervisory data. Our experiments confirm that contingency-awareness is indeed an extremely powerful concept for tackling exploration problems in reinforcement learning and opens up interesting research questions for further investigations.", "keywords": "Reinforcement Learning;Exploration;Contingency-Awareness", "primary_area": "", "supplementary_material": "", "author": "Jongwook Choi;Yijie Guo;Marcin Moczulski;Junhyuk Oh;Neal Wu;Mohammad Norouzi;Honglak Lee", "authorids": "jwook@umich.edu;guoyijie@umich.edu;marcin.lukasz.moczulski@gmail.com;junhyuk@umich.edu;neal@nealwu.com;mnorouzi@google.com;honglak@eecs.umich.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nchoi2018contingencyaware,\ntitle={Contingency-Aware Exploration in Reinforcement Learning},\nauthor={Jongwook Choi and Yijie Guo and Marcin Moczulski and Junhyuk Oh and Neal Wu and Mohammad Norouzi and Honglak Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxGB2AcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;2", "wc_review": "851;1292;143", "wc_reply_reviewers": "0;314;0", "wc_reply_authors": "981;1434;58", "reply_reviewers": "0;2;0", "reply_authors": "2;3;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 762.0, 473.2800439486119 ], "wc_reply_reviewers_avg": [ 104.66666666666667, 148.02101952838393 ], "wc_reply_authors_avg": [ 824.3333333333334, 572.5686761331683 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 96, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6536781875136948962&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HyxGB2AcY7", "pdf": "https://openreview.net/pdf?id=HyxGB2AcY7", "email": ";;;;;;", "author_num": 7 }, { "id": "HyxH2o05FQ", "title": "Domain Adaptive Transfer Learning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Transfer learning is a widely used method to build high performing computer vision models. In this paper, we study the efficacy of transfer learning by examining how the choice of data impacts performance. We find that more pre-training data does not always help, and transfer performance depends on a judicious choice of pre-training data. These findings are important given the continued increase in dataset sizes. We further propose domain adaptive transfer learning, a simple and effective pre-training method using importance weights computed based on the target dataset. 
Our methods achieve state-of-the-art results on multiple fine-grained classification datasets and are well-suited for use in practice.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiquan Ngiam;Daiyi Peng;Vijay Vasudevan;Simon Kornblith;Quoc Le;Ruoming Pang", "authorids": "jngiam@google.com;daiyip@google.com;vrv@google.com;skornblith@google.com;qvl@google.com;rpang@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=HyxH2o05FQ", "pdf_size": 0, "rating": "3;4;7", "confidence": "5;4;4", "wc_review": "249;71;626", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "209;113;89", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 1.699673171197595 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 315.3333333333333, 231.38183929503967 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 137.0, 51.84592558726288 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.6933752452815364, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Context-adaptive Entropy Model for End-to-end Optimized Image Compression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1115", "id": "HyxKIiAqYQ", "author_site": "Jooyoung Lee, Seunghyun Cho, Seung-Kwon Beack", "tldr": "Context-adaptive entropy model for use in end-to-end optimized image compression, which significantly improves compression performance", "abstract": "We propose a context-adaptive entropy model for use in end-to-end optimized image compression. Our model exploits two types of contexts, bit-consuming contexts and bit-free contexts, distinguished based upon whether additional bit\nallocation is required. Based on these contexts, we allow the model to more accurately estimate the distribution of each latent representation with a more generalized form of the approximation models, which accordingly leads to an\nenhanced compression performance. Based on the experimental results, the proposed method outperforms the traditional image codecs, such as BPG and JPEG2000, as well as other previous artificial-neural-network (ANN) based approaches, in terms of the peak signal-to-noise ratio (PSNR) and multi-scale structural similarity (MS-SSIM) index. 
The test code is publicly available at https://github.com/JooyoungLeeETRI/CA_Entropy_Model.", "keywords": "image compression;deep learning;entropy model", "primary_area": "", "supplementary_material": "", "author": "Jooyoung Lee;Seunghyun Cho;Seung-Kwon Beack", "authorids": "leejy1003@etri.re.kr;shcho@etri.re.kr;skbeack@etri.re.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlee2018contextadaptive,\ntitle={Context-adaptive Entropy Model for End-to-end Optimized Image Compression},\nauthor={Jooyoung Lee and Seunghyun Cho and Seung-Kwon Beack},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxKIiAqYQ},\n}", "github": "[![github](/images/github_icon.svg) JooyoungLeeETRI/CA_Entropy_Model](https://github.com/JooyoungLeeETRI/CA_Entropy_Model) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=HyxKIiAqYQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;5;4", "wc_review": "275;325;255", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "783;158;172", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 285.0, 29.43920288775949 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 371.0, 291.3840535558984 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 495, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17458297235582784877&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=HyxKIiAqYQ", "pdf": "https://openreview.net/pdf?id=HyxKIiAqYQ", "email": ";;", "author_num": 3 }, { "id": "HyxOIoRqFQ", "title": "Discrete flow posteriors for variational inference in discrete dynamical systems", "track": "main", "status": "Reject", "tldr": "We give a fast normalising-flow like sampling procedure for discrete latent variable models.", "abstract": "Each training step for a variational autoencoder (VAE) requires us to sample from the approximate posterior, so we usually choose simple (e.g. factorised) approximate posteriors in which sampling is an efficient computation that fully exploits GPU parallelism. However, such simple approximate posteriors are often insufficient, as they eliminate statistical dependencies in the posterior. While it is possible to use normalizing flow approximate posteriors for continuous latents, there is nothing analogous for discrete latents. The most natural approach to model discrete dependencies is an autoregressive distribution, but sampling from such distributions is inherently sequential and thus slow. We develop a fast, parallel sampling procedure for autoregressive distributions based on fixed-point iterations which enables efficient and accurate variational inference in discrete state-space models. 
To optimize the variational bound, we considered two ways to evaluate probabilities: inserting the relaxed samples directly into the pmf for the discrete distribution, or converting to continuous logistic latent variables and interpreting the K-step fixed-point iterations as a normalizing flow. We found that converting to continuous latent variables gave considerable additional scope for mismatch between the true and approximate posteriors, which resulted in biased inferences; we thus used the former approach. We tested our approach on the neuroscience problem of inferring discrete spiking activity from noisy calcium-imaging data, and found that it gave accurate connectivity estimates in an order of magnitude less time.", "keywords": "normalising flow;variational inference;discrete latent variable", "primary_area": "", "supplementary_material": "", "author": "Laurence Aitchison;Vincent Adam;Srinivas C. Turaga", "authorids": "laurence.aitchison@gmail.com;vincent.adam@prowler.io;turagas@janelia.hhmi.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\naitchison2019discrete,\ntitle={Discrete flow posteriors for variational inference in discrete dynamical systems},\nauthor={Laurence Aitchison and Vincent Adam and Srinivas C. Turaga},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxOIoRqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyxOIoRqFQ", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;3;4", "wc_review": "651;600;531", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 594.0, 49.17316341257699 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17531121672957262059&as_sdt=5,31&sciodt=0,31&hl=en", "gs_version_total": 5 }, { "title": "Variational Discriminator Bottleneck: Improving Imitation Learning, Inverse RL, and GANs by Constraining Information Flow", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/752", "id": "HyxPx3R9tm", "author_site": "Xue Bin Peng, Angjoo Kanazawa, Samuel Toyer, Pieter Abbeel, Sergey Levine", "tldr": "Regularizing adversarial learning with an information bottleneck, applied to imitation learning, inverse reinforcement learning, and generative adversarial networks.", "abstract": "Adversarial learning methods have been proposed for a wide range of applications, but the training of adversarial models can be notoriously unstable. Effectively balancing the performance of the generator and discriminator is critical, since a discriminator that achieves very high accuracy will produce relatively uninformative gradients. In this work, we propose a simple and general technique to constrain information flow in the discriminator by means of an information bottleneck. 
By enforcing a constraint on the mutual information between the observations and the discriminator's internal representation, we can effectively modulate the discriminator's accuracy and maintain useful and informative gradients. We demonstrate that our proposed variational discriminator bottleneck (VDB) leads to significant improvements across three distinct application areas for adversarial learning algorithms. Our primary evaluation studies the applicability of the VDB to imitation learning of dynamic continuous control skills, such as running. We show that our method can learn such skills directly from raw video demonstrations, substantially outperforming prior adversarial imitation learning methods. The VDB can also be combined with adversarial inverse reinforcement learning to learn parsimonious reward functions that can be transferred and re-optimized in new settings. Finally, we demonstrate that VDB can train GANs more effectively for image generation, improving upon a number of prior stabilization methods.", "keywords": "reinforcement learning;generative adversarial networks;imitation learning;inverse reinforcement learning;information bottleneck", "primary_area": "", "supplementary_material": "", "author": "Xue Bin Peng;Angjoo Kanazawa;Sam Toyer;Pieter Abbeel;Sergey Levine", "authorids": "jasonpeng142@hotmail.com;kanazawa@eecs.berkeley.edu;sdt@berkeley.edu;pabbeel@cs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\npeng2018variational,\ntitle={Variational Discriminator Bottleneck: Improving Imitation Learning, Inverse {RL}, and {GAN}s by Constraining Information Flow},\nauthor={Xue Bin Peng and Angjoo Kanazawa and Sam Toyer and Pieter Abbeel and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxPx3R9tm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=HyxPx3R9tm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;8;10", "confidence": "3;3;4", "wc_review": "216;742;235", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "289;279;72", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 8.0, 1.632993161855452 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 397.6666666666667, 243.6039590993728 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 213.33333333333334, 100.02110888318636 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 282, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9333510293426778540&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=HyxPx3R9tm", "pdf": "https://openreview.net/pdf?id=HyxPx3R9tm", "email": ";;;;", "author_num": 5 }, { "id": "HyxSBh09t7", "title": "Graph Generation via Scattering", "track": "main", "status": "Reject", "tldr": "This work proposes a graph generation system based on scattering and demonstrates competitive performance as well as indicates better promise of the generative scattering framework to datasets with a graph structure.", "abstract": 
"Generative networks have made it possible to generate meaningful signals such as images and texts from simple noise. Recently, generative methods based on GAN and VAE were developed for graphs and graph signals. However, the mathematical properties of these methods are unclear, and training good generative models is difficult. This work proposes a graph generation model that uses a recent adaptation of Mallat's scattering transform to graphs. The proposed model is naturally composed of an encoder and a decoder. The encoder is a Gaussianized graph scattering transform, which is robust to signal and graph manipulation. The decoder is a simple fully connected network that is adapted to specific tasks, such as link prediction, signal generation on graphs and full graph and signal generation. The training of our proposed system is efficient since it is only applied to the decoder and the hardware requirement is moderate. Numerical results demonstrate state-of-the-art performance of the proposed system for both link prediction and graph and signal generation. These results are in contrast to experience with Euclidean data, where it is difficult to form a generative scattering network that performs as well as state-of-the-art methods. We believe that this is because of the discrete and simpler nature of graph applications, unlike the more complex and high-frequency nature of Euclidean data, in particular, of some natural images. ", "keywords": "graph generative neural network;link prediction;graph and signal generation;scattering network", "primary_area": "", "supplementary_material": "", "author": "Dongmian Zou;Gilad Lerman", "authorids": "dzou@umn.edu;lerman@umn.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzou2019graph,\ntitle={Graph Generation via Scattering},\nauthor={Dongmian Zou and Gilad Lerman},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxSBh09t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=HyxSBh09t7", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;3;4", "wc_review": "535;335;214", "wc_reply_reviewers": "62;0;0", "wc_reply_authors": "1333;608;349", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 361.3333333333333, 132.3639763010398 ], "wc_reply_reviewers_avg": [ 20.666666666666668, 29.227080289043965 ], "wc_reply_authors_avg": [ 763.3333333333334, 416.4615495123436 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uFD1Vq6HeFgJ:scholar.google.com/&scioq=Graph+Generation+via+Scattering&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "id": "HyxUIj09KX", "title": "S-System, Geometry, Learning, and Optimization: A Theory of Neural Networks", "track": "main", "status": "Reject", "tldr": "We present a formal measure-theoretical theory of neural networks (NN) that quantitatively shows NNs renormalize on semantic difference, and under practical conditions large size deep nonlinear NNs can optimize objective functions to zero losses.", "abstract": "We present a formal measure-theoretical 
theory of neural networks (NN) built on {\\it probability coupling theory}. Particularly, we present an algorithm framework, Hierarchical Measure Group and Approximate System (HMGAS), nicknamed S-System, of which NNs are special cases. In addition to many other results, the framework enables us to prove that 1) NNs implement {\\it renormalization group (RG)} using information geometry, which points out that the large scale property to renormalize is dual Bregman divergence and completes the analog between NNs and RG; 2) and under a set of {\\it realistic} boundedness and diversity conditions, for {\\it large size nonlinear deep} NNs with a class of losses, including the hinge loss, all local minima are global minima with zero loss errors, using random matrix theory.", "keywords": "neural network theory;probability measure theory;probability coupling theory;S-System;optimization;random matrix;renormalization group;information geometry;coarse graining;hierarchy;activation function;symmetry", "primary_area": "", "supplementary_material": "", "author": "Shuai Li;Kui Jia", "authorids": "lishuai918@gmail.com;kuijia@scut.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nli2019ssystem,\ntitle={S-System, Geometry, Learning, and Optimization: A Theory of Neural Networks},\nauthor={Shuai Li and Kui Jia},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxUIj09KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=HyxUIj09KX", "pdf_size": 0, "rating": "4;4", "confidence": "1;2", "wc_review": "82;198", "wc_reply_reviewers": "0;0", "wc_reply_authors": "0;0", "reply_reviewers": "0;0", "reply_authors": "0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 1.5, 0.5 ], "wc_review_avg": [ 140.0, 58.0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 3, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:L6exZrLIaIYJ:scholar.google.com/&scioq=S-System,+Geometry,+Learning,+and+Optimization:+A+Theory+of+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "HyxhusA9Fm", "title": "Talk The Walk: Navigating Grids in New York City through Grounded Dialogue", "track": "main", "status": "Reject", "tldr": "First large-scale dialogue dataset grounded in action and perception", "abstract": "We introduce `\"Talk The Walk\", the first large-scale dialogue dataset grounded in action and perception. The task involves two agents (a 'guide' and a 'tourist') that communicate via natural language in order to achieve a common goal: having the tourist navigate to a given target location. The task and dataset, which are described in detail, are challenging and their full solution is an open problem that we pose to the community. We (i) focus on the task of tourist localization and develop the novel Masked Attention for Spatial Convolutions (MASC) mechanism that allows for grounding tourist utterances into the guide's map, (ii) show it yields significant improvements for both emergent and natural language communication, and (iii) using this method, we establish non-trivial baselines on the full task. 
", "keywords": "Dialogue;Navigation;Grounded Language Learning", "primary_area": "", "supplementary_material": "", "author": "Harm de Vries;Kurt Shuster;Dhruv Batra;Devi Parikh;Jason Weston;Douwe Kiela", "authorids": "mail@harmdevries.com;kshuster@fb.com;dbatra@gatech.edu;parikh@gatech.edu;jase@fb.com;dkiela@fb.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nvries2019talk,\ntitle={Talk The Walk: Navigating Grids in New York City through Grounded Dialogue},\nauthor={Harm de Vries and Kurt Shuster and Dhruv Batra and Devi Parikh and Jason Weston and Douwe Kiela},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxhusA9Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyxhusA9Fm", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;4;4", "wc_review": "478;596;72", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "670;720;35", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 382.0, 224.4341031721041 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 475.0, 311.79587339582713 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.9449111825230683, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7904849112907883172&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "HyxlHsActm", "title": "Efficient Dictionary Learning with Gradient Descent", "track": "main", "status": "Reject", "tldr": "We provide an efficient convergence rate for gradient descent on the complete orthogonal dictionary learning objective based on a geometric analysis.", "abstract": "Randomly initialized first-order optimization algorithms are the method of choice for solving many high-dimensional nonconvex problems in machine learning, yet general theoretical guarantees cannot rule out convergence to critical points of poor objective value. For some highly structured nonconvex problems however, the success of gradient descent can be understood by studying the geometry of the objective. We study one such problem -- complete orthogonal dictionary learning, and provide converge guarantees for randomly initialized gradient descent to the neighborhood of a global optimum. The resulting rates scale as low order polynomials in the dimension even though the objective possesses an exponential number of saddle points. This efficient convergence can be viewed as a consequence of negative curvature normal to the stable manifolds associated with saddle points, and we provide evidence that this feature is shared by other nonconvex problems of importance as well. 
", "keywords": "dictionary learning;nonconvex optimization", "primary_area": "", "supplementary_material": "", "author": "Dar Gilboa;Sam Buchanan;John Wright", "authorids": "dg2893@columbia.edu;sdb2157@columbia.edu;jw2966@columbia.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngilboa2019efficient,\ntitle={Efficient Dictionary Learning with Gradient Descent},\nauthor={Dar Gilboa and Sam Buchanan and John Wright},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxlHsActm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyxlHsActm", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;2;4", "wc_review": "392;330;257", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 326.3333333333333, 55.174470747096635 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17820633958201377013&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Meta-learning with differentiable closed-form solvers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/669", "id": "HyxnZh0ct7", "author_site": "Luca Bertinetto, Joao F. Henriques, Philip Torr, Andrea Vedaldi", "tldr": "We propose a meta-learning approach for few-shot classification that achieves strong performance at high-speed by back-propagating through the solution of fast solvers, such as ridge regression or logistic regression.", "abstract": "Adapting deep networks to new concepts from a few examples is challenging, due to the high computational requirements of standard fine-tuning procedures.\nMost work on few-shot learning has thus focused on simple learning techniques for adaptation, such as nearest neighbours or gradient descent.\nNonetheless, the machine learning literature contains a wealth of methods that learn non-deep models very efficiently.\nIn this paper, we propose to use these fast convergent methods as the main adaptation mechanism for few-shot learning.\nThe main idea is to teach a deep network to use standard machine learning tools, such as ridge regression, as part of its own internal model, enabling it to quickly adapt to novel data.\nThis requires back-propagating errors through the solver steps.\nWhile normally the cost of the matrix operations involved in such a process would be significant, by using the Woodbury identity we can make the small number of examples work to our advantage.\nWe propose both closed-form and iterative solvers, based on ridge regression and logistic regression components.\nOur methods constitute a simple and novel approach to the problem of few-shot learning and achieve performance competitive with or superior to the state of the art on three benchmarks.", "keywords": "few-shot learning;one-shot learning;meta-learning;deep learning;ridge regression;classification", "primary_area": "", "supplementary_material": "", "author": "Luca Bertinetto;Joao F. 
Henriques;Philip Torr;Andrea Vedaldi", "authorids": "luca@robots.ox.ac.uk;joao@robots.ox.ac.uk;philip.torr@eng.ox.ac.uk;vedaldi@robots.ox.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbertinetto2018metalearning,\ntitle={Meta-learning with differentiable closed-form solvers},\nauthor={Luca Bertinetto and Joao F. Henriques and Philip Torr and Andrea Vedaldi},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxnZh0ct7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=HyxnZh0ct7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "2;5;7", "confidence": "5;3;4", "wc_review": "166;109;767", "wc_reply_reviewers": "93;0;0", "wc_reply_authors": "942;618;245", "reply_reviewers": "1;0;0", "reply_authors": "3;3;1", "rating_avg": [ 4.666666666666667, 2.0548046676563256 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 347.3333333333333, 297.6601343068224 ], "wc_reply_reviewers_avg": [ 31.0, 43.840620433565945 ], "wc_reply_authors_avg": [ 601.6666666666666, 284.78334845203915 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.5960395606792698, "gs_citation": 1245, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12967631409063437571&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=HyxnZh0ct7", "pdf": "https://openreview.net/pdf?id=HyxnZh0ct7", "email": ";;;", "author_num": 4 }, { "id": "HyxpNnRcFX", "title": "Modulating transfer between tasks in gradient-based meta-learning", "track": "main", "status": "Reject", "tldr": "We use the connection between gradient-based meta-learning and hierarchical Bayes to learn a mixture of meta-learners that is appropriate for a heterogeneous and evolving task distribution.", "abstract": "Learning-to-learn or meta-learning leverages data-driven inductive bias to increase the efficiency of learning on a novel task. This approach encounters difficulty when transfer is not mutually beneficial, for instance, when tasks are sufficiently dissimilar or change over time. Here, we use the connection between gradient-based meta-learning and hierarchical Bayes to propose a mixture of hierarchical Bayesian models over the parameters of an arbitrary function approximator such as a neural network. Generalizing the model-agnostic meta-learning (MAML) algorithm, we present a stochastic expectation maximization procedure to jointly estimate parameter initializations for gradient descent as well as a latent assignment of tasks to initializations. This approach better captures the diversity of training tasks as opposed to consolidating inductive biases into a single set of hyperparameters. Our experiments demonstrate better generalization on the standard miniImageNet benchmark for 1-shot classification. 
We further derive a novel and scalable non-parametric variant of our method that captures the evolution of a task distribution over time as demonstrated on a set of few-shot regression tasks.", "keywords": "meta-learning;clustering;learning-to-learn;mixture;hierarchical Bayes;hierarchical model;gradient-based meta-learning", "primary_area": "", "supplementary_material": "", "author": "Erin Grant;Ghassen Jerfel;Katherine Heller;Thomas L. Griffiths", "authorids": "eringrant@berkeley.edu;gj47@duke.edu;kheller@stat.duke.edu;tomg@princeton.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngrant2019modulating,\ntitle={Modulating transfer between tasks in gradient-based meta-learning},\nauthor={Erin Grant and Ghassen Jerfel and Katherine Heller and Thomas L. Griffiths},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxpNnRcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=HyxpNnRcFX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;2", "wc_review": "642;300;153", "wc_reply_reviewers": "226;1788;0", "wc_reply_authors": "3660;3041;455", "reply_reviewers": "1;3;0", "reply_authors": "6;5;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 365.0, 204.856047018388 ], "wc_reply_reviewers_avg": [ 671.3333333333334, 794.9747724858247 ], "wc_reply_authors_avg": [ 2385.3333333333335, 1388.1475265819392 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 4.0, 2.160246899469287 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13179033364304890924&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "Hyxsl2AqKm", "title": "ON THE EFFECTIVENESS OF TASK GRANULARITY FOR TRANSFER LEARNING", "track": "main", "status": "Reject", "tldr": "If the model architecture is fixed, how would the complexity and granularity of task, effect the quality of learned features for transferring to a new task.", "abstract": "We describe a DNN for video classification and captioning, trained end-to-end,\nwith shared features, to solve tasks at different levels of granularity, exploring the\nlink between granularity in a source task and the quality of learned features for\ntransfer learning. For solving the new task domain in transfer learning, we freeze\nthe trained encoder and fine-tune an MLP on the target domain. We train on the\nSomething-Something dataset with over 220, 000 videos, and multiple levels of\ntarget granularity, including 50 action groups, 174 fine-grained action categories\nand captions. Classification and captioning with Something-Something are challenging\nbecause of the subtle differences between actions, applied to thousands\nof different object classes, and the diversity of captions penned by crowd actors.\nOur model performs better than existing classification baselines for SomethingSomething,\nwith impressive fine-grained results. And it yields a strong baseline on\nthe new Something-Something captioning task. 
Experiments reveal that training\nwith more fine-grained tasks tends to produce better features for transfer learning.", "keywords": "Transfer Learning;Video Understanding;Fine-grained Video Classification;Video Captioning;Common Sense;Something-Something Dataset.", "primary_area": "", "supplementary_material": "", "author": "Farzaneh Mahdisoltani;Guillaume Berger;Waseem Gharbieh;David Fleet;Roland Memisevic", "authorids": "farzaneh@cs.toronto.edu;guillaume.berger@twentybn.com;waseem.gharbieh@twentybn.com;fleet@cs.toronto.edu;roland.memisevic@twentybn.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmahdisoltani2019on,\ntitle={{ON} {THE} {EFFECTIVENESS} {OF} {TASK} {GRANULARITY} {FOR} {TRANSFER} {LEARNING}},\nauthor={Farzaneh Mahdisoltani and Guillaume Berger and Waseem Gharbieh and David Fleet and Roland Memisevic},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyxsl2AqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Hyxsl2AqKm", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;4", "wc_review": "601;259;333", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 397.6666666666667, 146.91796652856618 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 55, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18226584697611392657&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "Hyxtso0qtX", "title": "Adversarial Exploration Strategy for Self-Supervised Imitation Learning", "track": "main", "status": "Reject", "tldr": "A simple yet effective imitation learning scheme that incentivizes exploration of an environment without any extrinsic reward or human demonstration.", "abstract": "We present an adversarial exploration strategy, a simple yet effective imitation learning scheme that incentivizes exploration of an environment without any extrinsic reward or human demonstration. Our framework consists of a deep reinforcement learning (DRL) agent and an inverse dynamics model contesting with each other. The former collects training samples for the latter, and its objective is to maximize the error of the latter. The latter is trained with samples collected by the former, and generates rewards for the former when it fails to predict the actual action taken by the former. In such a competitive setting, the DRL agent learns to generate samples that the inverse dynamics model fails to predict correctly, and the inverse dynamics model learns to adapt to the challenging samples. We further propose a reward structure that ensures the DRL agent collects only moderately hard samples and not overly hard ones that prevent the inverse model from imitating effectively. We evaluate the effectiveness of our method on several OpenAI gym robotic arm and hand manipulation tasks against a number of baseline models. 
Experimental results show that our method is comparable to that directly trained with expert demonstrations, and superior to the other baselines even without any human priors.", "keywords": "adversarial exploration;self-supervised;imitation learning", "primary_area": "", "supplementary_material": "", "author": "Zhang-Wei Hong;Tsu-Jui Fu;Tzu-Yun Shann;Yi-Hsiang Chang;Chun-Yi Lee", "authorids": "williamd4112@gapp.nthu.edu.tw;yesray0216@gmail.com;ariel@shann.net;shawn420@gapp.nthu.edu.tw;cylee@cs.nthu.edu.tw", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhong2019adversarial,\ntitle={Adversarial Exploration Strategy for Self-Supervised Imitation Learning},\nauthor={Zhang-Wei Hong and Tsu-Jui Fu and Tzu-Yun Shann and Yi-Hsiang Chang and Chun-Yi Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyxtso0qtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Hyxtso0qtX", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;3;3", "wc_review": "632;240;403", "wc_reply_reviewers": "571;0;0", "wc_reply_authors": "3654;937;1103", "reply_reviewers": "1;0;0", "reply_authors": "7;2;2", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 425.0, 160.78764463312058 ], "wc_reply_reviewers_avg": [ 190.33333333333334, 269.17198137167907 ], "wc_reply_authors_avg": [ 1898.0, 1243.5275094129067 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.6666666666666665, 2.3570226039551585 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2169376195814264157&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "Hyxu6oAqYX", "title": "An Energy-Based Framework for Arbitrary Label Noise Correction", "track": "main", "status": "Reject", "tldr": "We show how to learn a discriminative representation using an energy based semi-supervised model and we show how to use it to correct input dependent label noise of various types on several datasets.", "abstract": "We propose an energy-based framework for correcting mislabelled training examples in the context of binary classification. While existing work addresses random and class-dependent label noise, we focus on feature dependent label noise, which is ubiquitous in real-world data and difficult to model. Two elements distinguish our approach from others: 1) instead of relying on the original feature space, we employ an autoencoder to learn a discriminative representation and 2) we introduce an energy-based formalism for the label correction problem. We prove that a discriminative representation can be learned by training a generative model using a loss function comprised of the difference of energies corresponding to each class. The learned energy value for each training instance is compared to the original training labels and contradictions between energy assignment and training label are used to correct labels. We validate our method across eight datasets, spanning synthetic and realistic settings, and demonstrate the technique's state-of-the-art label correction performance. 
Furthermore, we derive analytical expressions to show the effect of label noise on the gradients of empirical risk.", "keywords": "label noise;feature dependent noise;label correction;unsupervised machine learning;semi-supervised machine learning", "primary_area": "", "supplementary_material": "", "author": "Jaspreet Sahota;Divya Shanmugam;Janahan Ramanan;Sepehr Eghbali;Marcus Brubaker", "authorids": "sahotaj1@gmail.com;divyas@mit.edu;janahan.ramanan@borealisai.com;sepehr3pehr@gmail.com;mbrubake@cs.toronto.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nsahota2019an,\ntitle={An Energy-Based Framework for Arbitrary Label Noise Correction},\nauthor={Jaspreet Sahota and Divya Shanmugam and Janahan Ramanan and Sepehr Eghbali and Marcus Brubaker},\nyear={2019},\nurl={https://openreview.net/forum?id=Hyxu6oAqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Hyxu6oAqYX", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;4;4", "wc_review": "336;235;288", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;639;709", "reply_reviewers": "0;0;0", "reply_authors": "0;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 286.3333333333333, 41.24991582482994 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 449.3333333333333, 319.00923009983404 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15737923978175341881&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "HyxyV209Y7", "title": "Data Poisoning Attack against Unsupervised Node Embedding Methods", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Unsupervised node embedding methods (e.g., DeepWalk, LINE, and node2vec) have attracted growing interests given their simplicity and effectiveness. However, although these methods have been proved effective in a variety of applications, none of the existing work has analyzed the robustness of them. This could be very risky if these methods are attacked by an adversarial party. In this paper, we take the task of link prediction as an example, which is one of the most fundamental problems for graph analysis, and introduce a data poisoning\nattack to node embedding methods. We give a complete characterization of attacker's utilities and present efficient solutions to adversarial attacks for two popular node embedding methods: DeepWalk and LINE. We evaluate our proposed attack model on multiple real-world graphs. Experimental results show that our proposed model can significantly affect the results of link prediction by slightly changing the graph structures (e.g., adding or removing a few edges). We also show that our proposed model is very general and can be transferable across different embedding methods. 
\nFinally, we conduct a case study on a coauthor network to better understand our attack method.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mingjie Sun;Jian Tang;Huichen Li;Bo Li;Chaowei Xiao;Yao Chen;Dawn Song", "authorids": "sunmj15@gmail.com;tangjianpku@gmail.com;huichen3@illinois.edu;lxbosky@gmail.com;xiaocw@umich.edu;antoniechen@tencent.com;dawnsong@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=HyxyV209Y7", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;4;5", "wc_review": "146;232;91", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 156.33333333333334, 58.02489887013065 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17434556766872406256&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Learning Self-Imitating Diverse Policies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/986", "id": "HyxzRsR9Y7", "author_site": "Tanmay Gangwani, Qiang Liu, Jian Peng", "tldr": "Policy optimization by using past good rollouts from the agent; learning shaped rewards via divergence minimization; SVPG with JS-kernel for population-based exploration.", "abstract": "The success of popular algorithms for deep reinforcement learning, such as policy-gradients and Q-learning, relies heavily on the availability of an informative reward signal at each timestep of the sequential decision-making process. When rewards are only sparsely available during an episode, or a rewarding feedback is provided only after episode termination, these algorithms perform sub-optimally due to the difficulty in credit assignment. Alternatively, trajectory-based policy optimization methods, such as cross-entropy method and evolution strategies, do not require per-timestep rewards, but have been found to suffer from high sample complexity by completely forgoing the temporal nature of the problem. Improving the efficiency of RL algorithms in real-world problems with sparse or episodic rewards is therefore a pressing need. In this work, we introduce a self-imitation learning algorithm that exploits and explores well in the sparse and episodic reward settings. We view each policy as a state-action visitation distribution and formulate policy optimization as a divergence minimization problem. We show that with Jensen-Shannon divergence, this divergence minimization problem can be reduced into a policy-gradient algorithm with shaped rewards learned from experience replays. Experimental results indicate that our algorithm works comparably to existing algorithms in environments with dense rewards, and significantly better in environments with sparse and episodic rewards. 
We then discuss limitations of self-imitation learning, and propose to solve them by using Stein variational policy gradient descent with the Jensen-Shannon kernel to learn multiple diverse policies. We demonstrate its effectiveness on a challenging variant of continuous-control MuJoCo locomotion tasks.", "keywords": "Reinforcement-learning;Imitation-learning;Ensemble-training", "primary_area": "", "supplementary_material": "", "author": "Tanmay Gangwani;Qiang Liu;Jian Peng", "authorids": "gangwan2@uiuc.edu;lqiang@cs.utexas.edu;jianpeng@illinois.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngangwani2018learning,\ntitle={Learning Self-Imitating Diverse Policies},\nauthor={Tanmay Gangwani and Qiang Liu and Jian Peng},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyxzRsR9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;8;8", "confidence": "2;4;3", "wc_review": "514;450;622", "wc_reply_reviewers": "55;0;0", "wc_reply_authors": "604;1127;149", "reply_reviewers": "1;0;0", "reply_authors": "3;2;1", "rating_avg": [ 7.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 528.6666666666666, 70.98043548909955 ], "wc_reply_reviewers_avg": [ 18.333333333333332, 25.927248643506744 ], "wc_reply_authors_avg": [ 626.6666666666666, 399.5883993422343 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9114374846526316019&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=HyxzRsR9Y7", "pdf": "https://openreview.net/pdf?id=HyxzRsR9Y7", "email": ";;", "author_num": 3 }, { "title": "ProxQuant: Quantized Neural Networks via Proximal Operators", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/828", "id": "HyzMyhCcK7", "author_site": "Yu Bai, Yu-Xiang Wang, Edo Liberty", "tldr": "A principled framework for model quantization using the proximal gradient method, with empirical evaluation and theoretical convergence analyses.", "abstract": "To make deep neural networks feasible in resource-constrained environments (such as mobile devices), it is beneficial to quantize models by using low-precision weights. One common technique for quantizing neural networks is the straight-through gradient method, which enables back-propagation through the quantization mapping. Despite its empirical success, little is understood about why the straight-through gradient method works.\nBuilding upon a novel observation that the straight-through gradient method is in fact identical to the well-known Nesterov\u2019s dual-averaging algorithm on a quantization constrained optimization problem, we propose a more principled alternative approach, called ProxQuant , that formulates quantized network training as a regularized learning problem instead and optimizes it via the prox-gradient method. 
ProxQuant does back-propagation on the underlying full-precision vector and applies an efficient prox-operator in between stochastic gradient steps to encourage quantizedness. For quantizing ResNets and LSTMs, ProxQuant outperforms state-of-the-art results on binary quantization and is on par with state-of-the-art on multi-bit quantization. We further perform theoretical analyses showing that ProxQuant converges to stationary points under mild smoothness assumptions, whereas variants such as lazy prox-gradient method can fail to converge in the same setting.", "keywords": "Model quantization;Optimization;Regularization", "primary_area": "", "supplementary_material": "", "author": "Yu Bai;Yu-Xiang Wang;Edo Liberty", "authorids": "yub@stanford.edu;yuxiangw@cs.ucsb.edu;libertye@amazon.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbai2018proxquant,\ntitle={ProxQuant: Quantized Neural Networks via Proximal Operators},\nauthor={Yu Bai and Yu-Xiang Wang and Edo Liberty},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyzMyhCcK7},\n}", "github": "[![github](/images/github_icon.svg) allenbai01/ProxQuant](https://github.com/allenbai01/ProxQuant)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;8", "confidence": "4;4;4", "wc_review": "230;744;106", "wc_reply_reviewers": "0;176;0", "wc_reply_authors": "781;911;8", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 360.0, 276.2076513543147 ], "wc_reply_reviewers_avg": [ 58.666666666666664, 82.96719565922157 ], "wc_reply_authors_avg": [ 566.6666666666666, 398.5861122629448 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13740367040689029941&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=HyzMyhCcK7", "pdf": "https://openreview.net/pdf?id=HyzMyhCcK7", "email": ";;", "author_num": 3 }, { "title": "Universal Transformers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1068", "id": "HyzdRiR9Y7", "author_site": "Mostafa Dehghani, Stephan Gouws, Oriol Vinyals, Jakob Uszkoreit, Lukasz Kaiser", "tldr": "We introduce the Universal Transformer, a self-attentive parallel-in-time recurrent sequence model that outperforms Transformers and LSTMs on a wide range of sequence-to-sequence tasks, including machine translation.", "abstract": "Recurrent neural networks (RNNs) sequentially process data by updating their state with each new data point, and have long been the de facto choice for sequence modeling tasks. However, their inherently sequential computation makes them slow to train. Feed-forward and convolutional architectures have recently been shown to achieve superior results on some sequence modeling tasks such as machine translation, with the added advantage that they concurrently process all inputs in the sequence, leading to easy parallelization and faster training times. 
Despite these successes, however, popular feed-forward sequence models like the Transformer fail to generalize in many simple tasks that recurrent models handle with ease, e.g. copying strings or even simple logical inference when the string or formula lengths exceed those observed at training time. We propose the Universal Transformer (UT), a parallel-in-time self-attentive recurrent sequence model which can be cast as a generalization of the Transformer model and which addresses these issues. UTs combine the parallelizability and global receptive field of feed-forward sequence models like the Transformer with the recurrent inductive bias of RNNs. We also add a dynamic per-position halting mechanism and find that it improves accuracy on several tasks. In contrast to the standard Transformer, under certain assumptions UTs can be shown to be Turing-complete. Our experiments show that UTs outperform standard Transformers on a wide range of algorithmic and language understanding tasks, including the challenging LAMBADA language modeling task where UTs achieve a new state of the art, and machine translation where UTs achieve a 0.9 BLEU improvement over Transformers on the WMT14 En-De dataset.", "keywords": "sequence-to-sequence;rnn;transformer;machine translation;language understanding;learning to execute", "primary_area": "", "supplementary_material": "", "author": "Mostafa Dehghani;Stephan Gouws;Oriol Vinyals;Jakob Uszkoreit;Lukasz Kaiser", "authorids": "dehghani@uva.nl;sgouws@google.com;vinyals@google.com;usz@google.com;lukaszkaiser@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ndehghani2018universal,\ntitle={Universal Transformers},\nauthor={Mostafa Dehghani and Stephan Gouws and Oriol Vinyals and Jakob Uszkoreit and Lukasz Kaiser},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyzdRiR9Y7},\n}", "github": "[![github](/images/github_icon.svg) tensorflow/tensor2tensor](https://github.com/tensorflow/tensor2tensor) + [![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=HyzdRiR9Y7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;8", "confidence": "2;4;4", "wc_review": "548;383;624", "wc_reply_reviewers": "0;0;97", "wc_reply_authors": "532;1115;992", "reply_reviewers": "0;0;1", "reply_authors": "2;2;2", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 518.3333333333334, 100.5993152174617 ], "wc_reply_reviewers_avg": [ 32.333333333333336, 45.72623851673007 ], "wc_reply_authors_avg": [ 879.6666666666666, 250.91344235723113 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 1087, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8443376534582904234&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=HyzdRiR9Y7", "pdf": "https://openreview.net/pdf?id=HyzdRiR9Y7", "email": ";;;;", "author_num": 5 }, { "title": "Learning to Adapt in Dynamic, Real-World Environments through Meta-Reinforcement Learning", "status": "Poster", 
"track": "main", "site": "https://iclr.cc/virtual/2019/poster/1073", "id": "HyztsoC5Y7", "author_site": "Anusha Nagabandi, Ignasi Clavera, Simin Liu, Ronald Fearing, Pieter Abbeel, Sergey Levine, Chelsea Finn", "tldr": "A model-based meta-RL algorithm that enables a real robot to adapt online in dynamic environments", "abstract": "Although reinforcement learning methods can achieve impressive results in simulation, the real world presents two major challenges: generating samples is exceedingly expensive, and unexpected perturbations or unseen situations cause proficient but specialized policies to fail at test time. Given that it is impractical to train separate policies to accommodate all situations the agent may see in the real world, this work proposes to learn how to quickly and effectively adapt online to new tasks. To enable sample-efficient learning, we consider learning online adaptation in the context of model-based reinforcement learning. Our approach uses meta-learning to train a dynamics model prior such that, when combined with recent data, this prior can be rapidly adapted to the local context. Our experiments demonstrate online adaptation for continuous control tasks on both simulated and real-world agents. We first show simulated agents adapting their behavior online to novel terrains, crippled body parts, and highly-dynamic environments. We also illustrate the importance of incorporating online adaptation into autonomous agents that operate in the real world by applying our method to a real dynamic legged millirobot: We demonstrate the agent's learned ability to quickly adapt online to a missing leg, adjust to novel terrains and slopes, account for miscalibration or errors in pose estimation, and compensate for pulling payloads.", "keywords": "meta-learning;reinforcement learning;meta reinforcement learning;online adaptation", "primary_area": "", "supplementary_material": "", "author": "Anusha Nagabandi;Ignasi Clavera;Simin Liu;Ronald S. Fearing;Pieter Abbeel;Sergey Levine;Chelsea Finn", "authorids": "nagaban2@berkeley.edu;iclavera@berkeley.edu;simin.liu@berkeley.edu;ronf@berkeley.edu;pabbeel@berkeley.edu;svlevine@eecs.berkeley.edu;cbfinn@eecs.berkeley.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nclavera2018learning,\ntitle={Learning to Adapt in Dynamic, Real-World Environments through Meta-Reinforcement Learning},\nauthor={Ignasi Clavera and Anusha Nagabandi and Simin Liu and Ronald S. 
Fearing and Pieter Abbeel and Sergey Levine and Chelsea Finn},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=HyztsoC5Y7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=HyztsoC5Y7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "pdf_size": 0, "rating": "2;7;7", "confidence": "5;5;3", "wc_review": "774;370;524", "wc_reply_reviewers": "981;0;0", "wc_reply_authors": "1480;256;523", "reply_reviewers": "2;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.333333333333333, 2.357022603955158 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 556.0, 166.4772256696593 ], "wc_reply_reviewers_avg": [ 327.0, 462.44783489600206 ], "wc_reply_authors_avg": [ 753.0, 525.4959562166011 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.5000000000000001, "gs_citation": 740, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15863647627484548433&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=HyztsoC5Y7", "pdf": "https://openreview.net/pdf?id=HyztsoC5Y7", "email": ";;;;;;", "author_num": 7 }, { "id": "S14g5s09tm", "title": "Unseen Action Recognition with Unpaired Adversarial Multimodal Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we present a method to learn a joint multimodal representation space that allows for the recognition of unseen activities in videos. We compare the effect of placing various constraints on the embedding space using paired text and video data. Additionally, we propose a method to improve the joint embedding space using an adversarial formulation with unpaired text and video data. In addition to testing on publicly available datasets, we introduce a new, large-scale text/video dataset. We experimentally confirm that learning such shared embedding space benefits three difficult tasks (i) zero-shot activity classification, (ii) unsupervised activity discovery, and (iii) unseen activity captioning.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "AJ Piergiovanni;Michael S. Ryoo", "authorids": "ajpiergi@indiana.edu;mryoo@indiana.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npiergiovanni2019unseen,\ntitle={Unseen Action Recognition with Unpaired Adversarial Multimodal Learning},\nauthor={AJ Piergiovanni and Michael S. 
Ryoo},\nyear={2019},\nurl={https://openreview.net/forum?id=S14g5s09tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S14g5s09tm", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;4", "wc_review": "795;364;382", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "494;623;179", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 513.6666666666666, 199.06838579297875 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 432.0, 186.48860555004427 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:m9gjXFnjlXUJ:scholar.google.com/&scioq=Unseen+Action+Recognition+with+Unpaired+Adversarial+Multimodal+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S14h9sCqYm", "title": "Weakly-supervised Knowledge Graph Alignment with Adversarial Learning", "track": "main", "status": "Reject", "tldr": "This paper studies weakly-supervised knowledge graph alignment with adversarial training frameworks.", "abstract": "Aligning knowledge graphs from different sources or languages, which aims to align both the entity and relation, is critical to a variety of applications such as knowledge graph construction and question answering. Existing methods of knowledge graph alignment usually rely on a large number of aligned knowledge triplets to train effective models. However, these aligned triplets may not be available or are expensive to obtain for many domains. Therefore, in this paper we study how to design fully-unsupervised methods or weakly-supervised methods, i.e., to align knowledge graphs without or with only a few aligned triplets. We propose an unsupervised framework based on adversarial training, which is able to map the entities and relations in a source knowledge graph to those in a target knowledge graph. This framework can be further seamlessly integrated with existing supervised methods, where only a limited number of aligned triplets are utilized as guidance. 
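The adversarial alignment framework described in the knowledge-graph-alignment abstract above can be sketched in the style of adversarial embedding alignment: a mapper tries to make source-KG embeddings look like target-KG embeddings while a discriminator tries to tell them apart. Everything below (the linear form of the mapper, dimensions, optimizers, and the random stand-in embeddings) is an assumption for illustration; the paper's actual model and its integration with aligned-triplet supervision are not shown.

```python
import torch
import torch.nn as nn

dim, n_src, n_tgt = 32, 500, 500
src_emb = torch.randn(n_src, dim)   # stand-ins for pretrained KG embeddings
tgt_emb = torch.randn(n_tgt, dim)

mapper = nn.Linear(dim, dim, bias=False)
disc = nn.Sequential(nn.Linear(dim, 64), nn.ReLU(), nn.Linear(64, 1))
opt_m = torch.optim.Adam(mapper.parameters(), lr=1e-3)
opt_d = torch.optim.Adam(disc.parameters(), lr=1e-3)
bce = nn.BCEWithLogitsLoss()

for step in range(200):
    s = src_emb[torch.randint(n_src, (64,))]
    t = tgt_emb[torch.randint(n_tgt, (64,))]
    # Discriminator: mapped source -> label 0, real target -> label 1.
    d_loss = bce(disc(mapper(s).detach()), torch.zeros(64, 1)) + \
             bce(disc(t), torch.ones(64, 1))
    opt_d.zero_grad(); d_loss.backward(); opt_d.step()
    # Mapper: fool the discriminator into labelling mapped source as target.
    m_loss = bce(disc(mapper(s)), torch.ones(64, 1))
    opt_m.zero_grad(); m_loss.backward(); opt_m.step()

print(float(d_loss), float(m_loss))
```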
Experiments on real-world datasets prove the effectiveness of our proposed approach in both the weakly-supervised and unsupervised settings.", "keywords": "Knowledge Graph Alignment;Generative Adversarial Network;Weakly Supervised", "primary_area": "", "supplementary_material": "", "author": "Meng Qu;Jian Tang;Yoshua Bengio", "authorids": "qumn123@gmail.com;tangjianpku@gmail.com;yoshua.bengio@mila.quebec", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nqu2019weaklysupervised,\ntitle={Weakly-supervised Knowledge Graph Alignment with Adversarial Learning},\nauthor={Meng Qu and Jian Tang and Yoshua Bengio},\nyear={2019},\nurl={https://openreview.net/forum?id=S14h9sCqYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S14h9sCqYm", "pdf_size": 0, "rating": "5;5;5", "confidence": "3;4;3", "wc_review": "215;272;495", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 327.3333333333333, 120.82034412943689 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15461097099151386427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "L-Shapley and C-Shapley: Efficient Model Interpretation for Structured Data", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1065", "id": "S1E3Ko09F7", "author_site": "Jianbo Chen, Le Song, Martin Wainwright, Michael Jordan", "tldr": "We develop two linear-complexity algorithms for model-agnostic model interpretation based on the Shapley value, in the settings where the contribution of features to the target is well-approximated by a graph-structured factorization.", "abstract": "Instancewise feature scoring is a method for model interpretation, which yields, for each test instance, a vector of importance scores associated with features. Methods based on the Shapley score have been proposed as a fair way of computing feature attributions, but incur an exponential complexity in the number of features. This combinatorial explosion arises from the definition of Shapley value and prevents these methods from being scalable to large data sets and complex models. We focus on settings in which the data have a graph structure, and the contribution of features to the target variable is well-approximated by a graph-structured factorization. In such settings, we develop two algorithms with linear complexity for instancewise feature importance scoring on black-box models. We establish the relationship of our methods to the Shapley value and a closely related concept known as the Myerson value from cooperative game theory. We demonstrate on both language and image data that our algorithms compare favorably with other methods using both quantitative metrics and human evaluation.", "keywords": "Model Interpretation;Feature Selection", "primary_area": "", "supplementary_material": "", "author": "Jianbo Chen;Le Song;Martin J. Wainwright;Michael I. 
Jordan", "authorids": "jianbochen@berkeley.edu;lsong@cc.gatech.edu;wainwrig@berkeley.edu;jordan@cs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchen2018lshapley,\ntitle={L-Shapley and C-Shapley: Efficient Model Interpretation for Structured Data},\nauthor={Jianbo Chen and Le Song and Martin J. Wainwright and Michael I. Jordan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1E3Ko09F7},\n}", "github": "[![github](/images/github_icon.svg) Jianbo-Lab/LCShapley](https://github.com/Jianbo-Lab/LCShapley)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;2;3", "wc_review": "358;165;390", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "893;371;234", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 304.3333333333333, 99.38589213543116 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 499.3333333333333, 283.92761205787804 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 268, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13478206087371335896&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=S1E3Ko09F7", "pdf": "https://openreview.net/pdf?id=S1E3Ko09F7", "email": ";;;", "author_num": 4 }, { "id": "S1E64jC5tm", "title": "The Forward-Backward Embedding of Directed Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce a novel embedding of directed graphs derived from the singular value decomposition (SVD) of the normalized adjacency matrix. Specifically, we show that, after proper normalization of the singular vectors, \n the distances between vectors in the embedding space are proportional to the mean commute times between the corresponding nodes by a forward-backward random walk in the graph, which follows the edges alternately in forward and backward directions. In particular, two nodes having many common successors in the graph tend to be represented by close vectors in the embedding space. More formally, we prove that our representation of the graph is equivalent to the spectral embedding of some co-citation graph, where nodes are linked with respect to their common set of successors in the original graph. The interest of our approach is that it does not require to build this co-citation graph, which is typically much denser than the original graph. Experiments on real datasets show the efficiency of the approach. 
\n", "keywords": "Graph embedding;SVD;random walk;co-clustering", "primary_area": "", "supplementary_material": "", "author": "Thomas Bonald;Nathan De Lara", "authorids": "thomas.bonald@telecom-paristech.fr;nathan.delara@telecom-paristech.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbonald2019the,\ntitle={The Forward-Backward Embedding of Directed Graphs},\nauthor={Thomas Bonald and Nathan De Lara},\nyear={2019},\nurl={https://openreview.net/forum?id=S1E64jC5tm},\n}", "github": "[![github](/images/github_icon.svg) tbonald/directed](https://github.com/tbonald/directed)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S1E64jC5tm", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;5", "wc_review": "119;1191;53", "wc_reply_reviewers": "0;112;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;1;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 454.3333333333333, 521.5983980888831 ], "wc_reply_reviewers_avg": [ 37.333333333333336, 52.797306328595546 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rbPBC--vNvQJ:scholar.google.com/&scioq=The+Forward-Backward+Embedding+of+Directed+Graphs&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "title": "Discovery of Natural Language Concepts in Individual Units of CNNs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1097", "id": "S1EERs09YQ", "author_site": "Seil Na, Yo Joong Choe, Dong-Hyun Lee, Gunhee Kim", "tldr": "We show that individual units in CNN representations learned in NLP tasks are selectively responsive to natural language concepts.", "abstract": "Although deep convolutional networks have achieved improved performance in many natural language tasks, they have been treated as black boxes because they are difficult to interpret. Especially, little is known about how they represent language in their intermediate layers. In an attempt to understand the representations of deep convolutional networks trained on language tasks, we show that individual units are selectively responsive to specific morphemes, words, and phrases, rather than responding to arbitrary and uninterpretable patterns. In order to quantitatively analyze such intriguing phenomenon, we propose a concept alignment method based on how units respond to replicated text. 
We conduct analyses with different architectures on multiple datasets for classification and translation tasks and provide new insights into how deep models understand natural language.", "keywords": "interpretability of deep neural networks;natural language representation", "primary_area": "", "supplementary_material": "", "author": "Seil Na;Yo Joong Choe;Dong-Hyun Lee;Gunhee Kim", "authorids": "seil.na@vision.snu.ac.kr;yj.c@kakaocorp.com;benjamin.lee@kakaobrain.com;gunhee@snu.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nna2018discovery,\ntitle={Discovery of Natural Language Concepts in Individual Units of CNNs},\nauthor={Seil Na and Yo Joong Choe and Dong-Hyun Lee and Gunhee Kim},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1EERs09YQ},\n}", "github": "[![github](/images/github_icon.svg) seilna/CNN-Units-in-NLP](https://github.com/seilna/CNN-Units-in-NLP)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;4", "wc_review": "184;1211;473", "wc_reply_reviewers": "0;0;42", "wc_reply_authors": "225;1065;710", "reply_reviewers": "0;0;1", "reply_authors": "1;3;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 622.6666666666666, 432.4213480185989 ], "wc_reply_reviewers_avg": [ 14.0, 19.79898987322333 ], "wc_reply_authors_avg": [ 666.6666666666666, 344.2947703478647 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16647657304104807726&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1EERs09YQ", "pdf": "https://openreview.net/pdf?id=S1EERs09YQ", "email": ";;;", "author_num": 4 }, { "title": "Towards the first adversarially robust neural network model on MNIST", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1131", "id": "S1EHOsC9tX", "author_site": "Lukas Schott, Jonas Rauber, Matthias Bethge, Wieland Brendel", "tldr": "", "abstract": "Despite much effort, deep neural networks remain highly susceptible to tiny input perturbations and even for MNIST, one of the most common toy datasets in computer vision, no neural network model exists for which adversarial perturbations are large and make semantic sense to humans. We show that even the widely recognized and by far most successful L-inf defense by Madry et~al. (1) has lower L0 robustness than undefended networks and still highly susceptible to L2 perturbations, (2) classifies unrecognizable images with high certainty, (3) performs not much better than simple input binarization and (4) features adversarial perturbations that make little sense to humans. These results suggest that MNIST is far from being solved in terms of adversarial robustness. We present a novel robust classification model that performs analysis by synthesis using learned class-conditional data distributions. 
We derive bounds on the robustness and go to great length to empirically evaluate our model using maximally effective adversarial attacks by (a) applying decision-based, score-based, gradient-based and transfer-based attacks for several different Lp norms, (b) by designing a new attack that exploits the structure of our defended model and (c) by devising a novel decision-based attack that seeks to minimize the number of perturbed pixels (L0). The results suggest that our approach yields state-of-the-art robustness on MNIST against L0, L2 and L-inf perturbations and we demonstrate that most adversarial examples are strongly perturbed towards the perceptual boundary between the original and the adversarial class.", "keywords": "adversarial examples;MNIST;robustness;deep learning;security", "primary_area": "", "supplementary_material": "", "author": "Lukas Schott;Jonas Rauber;Matthias Bethge;Wieland Brendel", "authorids": "lukas.schott@bethgelab.org;jonas.rauber@bethgelab.org;matthias.bethge@bethgelab.org;wieland.brendel@bethgelab.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nschott2018towards,\ntitle={Towards the first adversarially robust neural network model on {MNIST}},\nauthor={Lukas Schott and Jonas Rauber and Matthias Bethge and Wieland Brendel},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1EHOsC9tX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=S1EHOsC9tX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;4", "wc_review": "212;176;345", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "313;181;651", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 244.33333333333334, 72.68348429243667 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 381.6666666666667, 197.92478930700477 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 439, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7711598507862406800&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=S1EHOsC9tX", "pdf": "https://openreview.net/pdf?id=S1EHOsC9tX", "email": ";;;", "author_num": 4 }, { "id": "S1G_cj05YQ", "title": "Activity Regularization for Continual Learning", "track": "main", "status": "Reject", "tldr": "This paper develops a novel regularization for continual learning", "abstract": "While deep neural networks have achieved remarkable successes, they suffer the well-known catastrophic forgetting issue when switching from existing tasks to tackle a new one. In this paper, we study continual learning with deep neural networks that learn from tasks arriving sequentially. We first propose an approximated multi-task learning framework that unifies a family of popular regularization based continual learning methods. 
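The family of regularization-based continual-learning methods just mentioned in the Activity Regularization abstract typically penalizes movement of parameters away from their values after earlier tasks, weighted by some per-parameter importance. The sketch below shows a generic (EWC-style) member of that family with a uniform importance placeholder; it is not the Activity Regularization method the paper proposes.

```python
import torch
import torch.nn as nn

def penalized_loss(model, task_loss, old_params, importance, lam=10.0):
    # New-task loss plus a quadratic penalty anchoring parameters to their
    # previous-task values, weighted by per-parameter importance.
    reg = 0.0
    for name, p in model.named_parameters():
        reg = reg + (importance[name] * (p - old_params[name]) ** 2).sum()
    return task_loss + lam * reg

model = nn.Linear(4, 2)
# Snapshot after "task 1" and a (here uniform) importance weight per parameter.
old_params = {n: p.detach().clone() for n, p in model.named_parameters()}
importance = {n: torch.ones_like(p) for n, p in model.named_parameters()}

x, y = torch.randn(8, 4), torch.randint(0, 2, (8,))
loss = penalized_loss(model, nn.functional.cross_entropy(model(x), y),
                      old_params, importance)
loss.backward()
print(float(loss))
```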
We then analyze the weakness of existing approaches, and propose a novel regularization method named \u201cActivity Regularization\u201d (AR), which alleviates forgetting meanwhile keeping model\u2019s plasticity to acquire new knowledge. Extensive experiments show that our method outperform state-of-the-art methods and effectively overcomes catastrophic forgetting.\n", "keywords": "continual learning;regularization", "primary_area": "", "supplementary_material": "", "author": "Quang H. Pham;Steven C. H. Hoi", "authorids": "hqpham.2017@smu.edu.sg;chhoi@smu.edu.sg", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npham2019activity,\ntitle={Activity Regularization for Continual Learning},\nauthor={Quang H. Pham and Steven C. H. Hoi},\nyear={2019},\nurl={https://openreview.net/forum?id=S1G_cj05YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1G_cj05YQ", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;5", "wc_review": "633;484;214", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 443.6666666666667, 173.41728736073446 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mUwp5RrKqxAJ:scholar.google.com/&scioq=Activity+Regularization+for+Continual+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S1GcHsAqtm", "title": "Adaptive Pruning of Neural Language Models for Mobile Devices", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural language models (NLMs) exist in an accuracy-efficiency tradeoff space where better perplexity typically comes at the cost of greater computation complexity. In a software keyboard application on mobile devices, this translates into higher power consumption and shorter battery life. This paper represents the first attempt, to our knowledge, in exploring accuracy-efficiency tradeoffs for NLMs. Building on quasi-recurrent neural networks (QRNNs), we apply pruning techniques to provide a \"knob\" to select different operating points. In addition, we propose a simple technique to recover some perplexity using a negligible amount of memory. Our empirical evaluations consider both perplexity as well as energy consumption on a Raspberry Pi, where we demonstrate which methods provide the best perplexity-power consumption operating point. 
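The "knob" idea in the Adaptive Pruning abstract above, a single trained model run at several accuracy/efficiency operating points, can be illustrated with plain magnitude pruning at a chosen sparsity level. This is generic magnitude pruning for illustration, not the specific QRNN pruning schemes or the perplexity-recovery technique evaluated in the paper.

```python
import torch

def prune_by_magnitude(weight, sparsity):
    """Zero out the `sparsity` fraction of weights with smallest magnitude."""
    k = int(sparsity * weight.numel())
    if k == 0:
        return weight.clone()
    threshold = weight.abs().flatten().kthvalue(k).values
    return torch.where(weight.abs() > threshold,
                       weight, torch.zeros_like(weight))

w = torch.randn(256, 256)
for sparsity in (0.0, 0.5, 0.9):          # the operating-point "knob"
    pruned = prune_by_magnitude(w, sparsity)
    density = (pruned != 0).float().mean().item()
    print(f"sparsity knob={sparsity:.1f} -> nonzero fraction={density:.2f}")
```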
At one operating point, one of the techniques is able to provide energy savings of 40% over the state of the art with only a 17% relative increase in perplexity.", "keywords": "Inference-time pruning;Neural Language Models", "primary_area": "", "supplementary_material": "", "author": "Raphael Tang;Jimmy Lin", "authorids": "r33tang@uwaterloo.ca;jimmylin@uwaterloo.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntang2019adaptive,\ntitle={Adaptive Pruning of Neural Language Models for Mobile Devices},\nauthor={Raphael Tang and Jimmy Lin},\nyear={2019},\nurl={https://openreview.net/forum?id=S1GcHsAqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1GcHsAqtm", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;4", "wc_review": "145;495;473", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "172;530;294", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 371.0, 160.0583227035279 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 332.0, 148.60237772884614 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13223004971416309761&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Discriminator Rejection Sampling", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/938", "id": "S1GkToR5tm", "author_site": "Samaneh Azadi, Catherine Olsson, Trevor Darrell, Ian Goodfellow, Augustus Odena", "tldr": "We use a GAN discriminator to perform an approximate rejection sampling scheme on the output of the GAN generator.", "abstract": "We propose a rejection sampling scheme using the discriminator of a GAN to\napproximately correct errors in the GAN generator distribution. We show that\nunder quite strict assumptions, this will allow us to recover the data distribution\nexactly. We then examine where those strict assumptions break down and design a\npractical algorithm\u2014called Discriminator Rejection Sampling (DRS)\u2014that can be\nused on real data-sets. Finally, we demonstrate the efficacy of DRS on a mixture of\nGaussians and on the state of the art SAGAN model. On ImageNet, we train an\nimproved baseline that increases the best published Inception Score from 52.52 to\n62.36 and reduces the Frechet Inception Distance from 18.65 to 14.79. 
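The rejection-sampling scheme described in the Discriminator Rejection Sampling abstract above (which continues below) can be stripped down to its core step: if d(x) is the discriminator logit, exp(d(x)) estimates the density ratio between data and generator, so accepting a generated sample with probability exp(d(x) - d_max) shifts the accepted distribution toward the data distribution. The toy discriminator and generator here are stand-ins, and the paper's practical algorithm adds a tunable shift inside a sigmoid that this sketch omits.

```python
import numpy as np

rng = np.random.default_rng(0)

def toy_discriminator_logit(x):
    # Stand-in for a trained GAN discriminator: prefers samples near +2.
    return -0.5 * (x - 2.0) ** 2

def drs_filter(samples, logit_fn):
    d = logit_fn(samples)
    accept_prob = np.exp(d - d.max())          # in (0, 1]; best sample always kept
    keep = rng.random(len(samples)) < accept_prob
    return samples[keep]

generated = rng.normal(loc=0.0, scale=1.5, size=10_000)  # stand-in generator output
accepted = drs_filter(generated, toy_discriminator_logit)
print(f"kept {len(accepted)} of {len(generated)}, "
      f"mean moved from {generated.mean():.2f} to {accepted.mean():.2f}")
```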
We then use\nDRS to further improve on this baseline, improving the Inception Score to 76.08\nand the FID to 13.75.", "keywords": "GANs;rejection sampling", "primary_area": "", "supplementary_material": "", "author": "Samaneh Azadi;Catherine Olsson;Trevor Darrell;Ian Goodfellow;Augustus Odena", "authorids": "sazadi@berkeley.edu;catherio@google.com;trevor@eecs.berkeley.edu;goodfellow@google.com;augustusodena@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nazadi2018discriminator,\ntitle={Discriminator Rejection Sampling},\nauthor={Samaneh Azadi and Catherine Olsson and Trevor Darrell and Ian Goodfellow and Augustus Odena},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1GkToR5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;4", "wc_review": "194;356;201", "wc_reply_reviewers": "59;0;0", "wc_reply_authors": "1069;700;37", "reply_reviewers": "1;0;0", "reply_authors": "2;2;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 250.33333333333334, 74.77224678240509 ], "wc_reply_reviewers_avg": [ 19.666666666666668, 27.812866726670865 ], "wc_reply_authors_avg": [ 602.0, 426.9730670662963 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 168, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17625130787933588736&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S1GkToR5tm", "pdf": "https://openreview.net/pdf?id=S1GkToR5tm", "email": ";;;;", "author_num": 5 }, { "title": "Harmonic Unpaired Image-to-image Translation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/765", "id": "S1M6Z2Cctm", "author_site": "Rui Zhang, Tomas Pfister, Li-Jia Li", "tldr": "Smooth regularization over sample graph for unpaired image-to-image translation results in significantly improved consistency", "abstract": "The recent direction of unpaired image-to-image translation is on one hand very exciting as it alleviates the big burden in obtaining label-intensive pixel-to-pixel supervision, but it is on the other hand not fully satisfactory due to the presence of artifacts and degenerated transformations. In this paper, we take a manifold view of the problem by introducing a smoothness term over the sample graph to attain harmonic functions to enforce consistent mappings during the translation. We develop HarmonicGAN to learn bi-directional translations between the source and the target domains. With the help of similarity-consistency, the inherent self-consistency property of samples can be maintained. Distance metrics defined on two types of features including histogram and CNN are exploited. Under an identical problem setting as CycleGAN, without additional manual inputs and only at a small training-time cost, HarmonicGAN demonstrates a significant qualitative and quantitative improvement over the state of the art, as well as improved interpretability. 
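The smoothness term over the sample graph described in the HarmonicGAN abstract above has the familiar form of a graph regularizer: pairs of inputs that the similarity metric considers close should receive similar translations. The sketch below uses a simple RBF similarity on raw inputs purely to show the regularizer's shape; the paper computes similarities on histogram or CNN features of image patches.

```python
import torch

def smoothness_loss(inputs, outputs, bandwidth=1.0):
    # Pairwise similarity weights w_ij between inputs.
    d_in = torch.cdist(inputs.flatten(1), inputs.flatten(1)) ** 2
    w = torch.exp(-d_in / (2 * bandwidth ** 2))
    # Pairwise squared distances between translated outputs, weighted by w_ij.
    d_out = torch.cdist(outputs.flatten(1), outputs.flatten(1)) ** 2
    return (w * d_out).mean()

x = torch.randn(16, 3 * 8 * 8)                         # stand-in input batch
fake_translation = torch.randn(16, 3 * 8 * 8, requires_grad=True)
loss = smoothness_loss(x, fake_translation)
loss.backward()
print(float(loss))
```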
We show experimental results in a number of applications including medical imaging, object transfiguration, and semantic labeling. We outperform the competing methods in all tasks, and for a medical imaging task in particular our method turns CycleGAN from a failure to a success, halving the mean-squared error, and generating images that radiologists prefer over competing methods in 95% of cases.", "keywords": "unpaired image-to-image translation;cyclegan;smoothness constraint", "primary_area": "", "supplementary_material": "", "author": "Rui Zhang;Tomas Pfister;Jia Li", "authorids": "zhangrui@ict.ac.cn;tpfister@google.com;lijiali@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhang2018harmonic,\ntitle={Harmonic Unpaired Image-to-image Translation},\nauthor={Rui Zhang and Tomas Pfister and Jia Li},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1M6Z2Cctm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;5;5", "wc_review": "453;902;304", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1088;2051;1307", "reply_reviewers": "0;0;0", "reply_authors": "2;3;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 553.0, 254.16661202185205 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1482.0, 412.15773679502854 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9289522450448532503&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1M6Z2Cctm", "pdf": "https://openreview.net/pdf?id=S1M6Z2Cctm", "email": ";;", "author_num": 3 }, { "id": "S1MAriC5F7", "title": "Massively Parallel Hyperparameter Tuning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Modern learning models are characterized by large hyperparameter spaces. In order to adequately explore these large spaces, we must evaluate a large number of configurations, typically orders of magnitude more configurations than available parallel workers. Given the growing costs of model training, we would ideally like to perform this search in roughly the same wall-clock time needed to train a single model. In this work, we tackle this challenge by introducing ASHA, a simple and robust hyperparameter tuning algorithm with solid theoretical underpinnings that exploits parallelism and aggressive early-stopping. 
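The core that ASHA parallelizes, as described in the abstract above, is successive halving: start many configurations on a small budget, repeatedly keep the top fraction, and give the survivors a larger budget. The sketch below is the synchronous version with a toy objective standing in for "validation loss after `budget` epochs"; real ASHA promotes configurations asynchronously as workers free up instead of waiting for each rung to finish.

```python
import random

random.seed(0)

def evaluate(config, budget):
    # Toy objective: quality improves with budget, plus config-dependent noise.
    return (config - 0.7) ** 2 + random.gauss(0, 0.1) / budget

def successive_halving(n_configs=27, min_budget=1, eta=3, rungs=3):
    configs = [random.random() for _ in range(n_configs)]
    budget = min_budget
    for rung in range(rungs):
        scores = sorted((evaluate(c, budget), c) for c in configs)
        keep = max(1, len(configs) // eta)          # keep the top 1/eta
        configs = [c for _, c in scores[:keep]]
        print(f"rung {rung}: budget={budget}, survivors={len(configs)}")
        budget *= eta                               # survivors get more budget
    return configs[0]

print("best configuration found:", round(successive_halving(), 3))
```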
Our extensive empirical results show that ASHA outperforms state-of-the-art hyperparameter tuning methods; scales linearly with the number of workers in distributed settings; converges to a high quality configuration in half the time taken by Vizier, Google's internal hyperparameter tuning service) in an experiment with 500 workers; and beats the published result for a near state-of-the-art LSTM architecture in under $2\\times$ the time to train a single model.", "keywords": "hyperparameter optimization;automl", "primary_area": "", "supplementary_material": "", "author": "Liam Li;Kevin Jamieson;Afshin Rostamizadeh;Ekaterina Gonina;Moritz Hardt;Ben Recht;Ameet Talwalkar", "authorids": ";;;;;;talwalkar@cmu.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nli2019massively,\ntitle={Massively Parallel Hyperparameter Tuning},\nauthor={Liam Li and Kevin Jamieson and Afshin Rostamizadeh and Ekaterina Gonina and Moritz Hardt and Ben Recht and Ameet Talwalkar},\nyear={2019},\nurl={https://openreview.net/forum?id=S1MAriC5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1MAriC5F7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "wc_review": "314;361;633", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 436.0, 140.61531448127073 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 212, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17243713618182045142&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "S1MB-3RcF7", "title": "Multi-objective training of Generative Adversarial Networks with multiple discriminators", "track": "main", "status": "Reject", "tldr": "We introduce hypervolume maximization for training GANs with multiple discriminators, showing performance improvements in terms of sample quality and diversity. ", "abstract": "Recent literature has demonstrated promising results on the training of Generative Adversarial Networks by employing a set of discriminators, as opposed to the traditional game involving one generator against a single adversary. Those methods perform single-objective optimization on some simple consolidation of the losses, e.g. an average. In this work, we revisit the multiple-discriminator approach by framing the simultaneous minimization of losses provided by different models as a multi-objective optimization problem. Specifically, we evaluate the performance of multiple gradient descent and the hypervolume maximization algorithm on a number of different datasets. Moreover, we argue that the previously proposed methods and hypervolume maximization can all be seen as variations of multiple gradient descent in which the update direction computation can be done efficiently. 
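The loss-aggregation idea in the multi-discriminator abstract above can be sketched as follows: with K losses l_1..l_K and a nadir point eta greater than every loss, maximizing the hypervolume amounts to minimizing -sum_k log(eta - l_k), which implicitly weights each loss's gradient by 1/(eta - l_k), so discriminators with higher loss get more weight. The stand-in losses and the slack used to set eta are assumptions, not the paper's exact schedule.

```python
import torch

generator_param = torch.zeros(3, requires_grad=True)

# Three stand-in "generator losses against different discriminators".
losses = torch.stack([
    (generator_param[0] - 1.0) ** 2,
    (generator_param[1] - 0.2) ** 2 + 0.5,
    (generator_param[2] + 0.8) ** 2 + 1.0,
])

eta = losses.max().detach() + 0.5            # nadir point: slack above the worst loss
hv_loss = -torch.log(eta - losses).sum()     # hypervolume-maximization surrogate
hv_loss.backward()

print("implicit per-loss weights:", (1.0 / (eta - losses)).detach().numpy())
print("gradient on generator parameters:", generator_param.grad.numpy())
```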
Our results indicate that hypervolume maximization presents a better compromise between sample quality and diversity, and computational cost than previous methods.", "keywords": "Generative Adversarial Networks;Multi-objective optimization;Generative models", "primary_area": "", "supplementary_material": "", "author": "Isabela Albuquerque;Jo\u00e3o Monteiro;Thang Doan;Breandan Considine;Tiago Falk;Ioannis Mitliagkas", "authorids": "isabelamcalbuquerque@gmail.com;joaomonteirof@gmail.com;thang.doan@mail.mcgill.ca;breandan.considine@gmail.com;falk@emt.inrs.ca;ioannis@iro.umontreal.ca", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nalbuquerque2019multiobjective,\ntitle={Multi-objective training of Generative Adversarial Networks with multiple discriminators},\nauthor={Isabela Albuquerque and Jo\u00e3o Monteiro and Thang Doan and Breandan Considine and Tiago Falk and Ioannis Mitliagkas},\nyear={2019},\nurl={https://openreview.net/forum?id=S1MB-3RcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1MB-3RcF7", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;3", "wc_review": "506;131;759", "wc_reply_reviewers": "290;67;0", "wc_reply_authors": "1586;825;963", "reply_reviewers": "1;1;0", "reply_authors": "4;2;3", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 465.3333333333333, 257.98751046427725 ], "wc_reply_reviewers_avg": [ 119.0, 123.97042658096593 ], "wc_reply_authors_avg": [ 1124.6666666666667, 331.0411186276143 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15572645706146610128&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "S1MQ6jCcK7", "title": "ChoiceNet: Robust Learning by Revealing Output Correlations", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we focus on the supervised learning problem with corrupt training data. We assume that the training dataset is generated from a mixture of a target distribution and other unknown distributions. We estimate the quality of each data by revealing the correlation between the generated distribution and the target distribution. To this end, we present a novel framework referred to here as ChoiceNet that can robustly infer the target distribution in the presence of inconsistent data. We demonstrate that the proposed framework is applicable to both classification and regression tasks. Particularly, ChoiceNet is evaluated in comprehensive experiments, where we show that it constantly outperforms existing baseline methods in the handling of noisy data in synthetic regression tasks as well as behavior cloning problems. 
In the classification tasks, we apply the proposed method to the MNIST and CIFAR-10 datasets and it shows superior performances in terms of robustness to different types of noisy labels.", "keywords": "Robust Deep Learning;weakly supervised learning", "primary_area": "", "supplementary_material": "", "author": "Sungjoon Choi;Sanghoon Hong;Kyungjae Lee;Sungbin Lim", "authorids": "sungjoon.s.choi@gmail.com;sanghoon.hong@kakaobrain.com;kyungjae.lee@cpslab.snu.ac.kr;sungbin.lim@kakaobrain.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchoi2019choicenet,\ntitle={ChoiceNet: Robust Learning by Revealing Output Correlations},\nauthor={Sungjoon Choi and Sanghoon Hong and Kyungjae Lee and Sungbin Lim},\nyear={2019},\nurl={https://openreview.net/forum?id=S1MQ6jCcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1MQ6jCcK7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;4", "wc_review": "507;465;745", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1434;869;1198", "reply_reviewers": "0;0;0", "reply_authors": "3;2;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 572.3333333333334, 123.29188492714714 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1167.0, 231.69951805445487 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12241024736417026025&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "S1MeM2RcFm", "title": "BlackMarks: Black-box Multi-bit Watermarking for Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "Proposing the first watermarking framework for multi-bit signature embedding and extraction using the outputs of the DNN. ", "abstract": "Deep Neural Networks (DNNs) are increasingly deployed in cloud servers and autonomous agents due to their superior performance. The deployed DNN is either leveraged in a white-box setting (model internals are publicly known) or a black-box setting (only model outputs are known) depending on the application. A practical concern in the rush to adopt DNNs is protecting the models against Intellectual Property (IP) infringement. We propose BlackMarks, the first end-to-end multi-bit watermarking framework that is applicable in the black-box scenario. BlackMarks takes the pre-trained unmarked model and the owner\u2019s binary signature as inputs. The output is the corresponding marked model with specific keys that can be later used to trigger the embedded watermark. To do so, BlackMarks first designs a model-dependent encoding scheme that maps all possible classes in the task to bit \u20180\u2019 and bit \u20181\u2019. Given the owner\u2019s watermark signature (a binary string), a set of key image and label pairs is designed using targeted adversarial attacks. The watermark (WM) is then encoded in the distribution of output activations of the DNN by fine-tuning the model with a WM-specific regularized loss. 
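The class-to-bit encoding in the BlackMarks abstract above, and the signature read-out it enables (elaborated as the abstract continues below), can be sketched with placeholders: assign each class a bit, query the suspect model on the key images, and decode the owner's signature from the predicted classes. The random class split and the simulated "predictions" are stand-ins; the paper constructs both the encoding and the key images carefully, e.g. via targeted adversarial perturbations.

```python
import numpy as np

rng = np.random.default_rng(1)

num_classes = 10
class_to_bit = {c: c % 2 for c in range(num_classes)}   # placeholder encoding
owner_signature = rng.integers(0, 2, size=20)

def decode_signature(predicted_classes):
    return np.array([class_to_bit[c] for c in predicted_classes])

# Stand-in for "query the marked model with the 20 key images":
predicted = np.array([rng.choice([c for c in range(num_classes)
                                  if class_to_bit[c] == bit])
                      for bit in owner_signature])

decoded = decode_signature(predicted)
ber = np.mean(decoded != owner_signature)    # bit error rate of the extracted WM
print("decoded bits:", decoded, " bit error rate:", ber)
```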
To extract the WM, BlackMarks queries the model with the WM key images and decodes the owner\u2019s signature from the corresponding predictions using the designed encoding scheme. We perform a comprehensive evaluation of BlackMarks\u2019 performance on MNIST, CIFAR-10, ImageNet datasets and corroborate its effectiveness and robustness. BlackMarks preserves the functionality of the original DNN and incurs negligible WM embedding overhead as low as 2.054%.", "keywords": "Digital Watermarking;IP Protection;Deep Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Huili Chen;Bita Darvish Rouhani;Farinaz Koushanfar", "authorids": "huc044@ucsd.edu;bita@ucsd.edu;farinaz@ucsd.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchen2019blackmarks,\ntitle={BlackMarks: Black-box Multi-bit Watermarking for Deep Neural Networks},\nauthor={Huili Chen and Bita Darvish Rouhani and Farinaz Koushanfar},\nyear={2019},\nurl={https://openreview.net/forum?id=S1MeM2RcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1MeM2RcFm", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "wc_review": "448;775;212", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 478.3333333333333, 230.84241859377195 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13904849225780897192&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Universal Successor Features Approximators", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/815", "id": "S1VWjiRcKX", "author_site": "Diana Borsa, Andre Barreto, John Quan, Daniel J. Mankowitz, Hado van Hasselt, Remi Munos, David Silver, Tom Schaul", "tldr": "", "abstract": "The ability of a reinforcement learning (RL) agent to learn about many reward functions at the same time has many potential benefits, such as the decomposition of complex tasks into simpler ones, the exchange of information between tasks, and the reuse of skills. We focus on one aspect in particular, namely the ability to generalise to unseen tasks. Parametric generalisation relies on the interpolation power of a function approximator that is given the task description as input; one of its most common form are universal value function approximators (UVFAs). Another way to generalise to new tasks is to exploit structure in the RL problem itself. Generalised policy improvement (GPI) combines solutions of previous tasks into a policy for the unseen task; this relies on instantaneous policy evaluation of old policies under the new reward function, which is made possible through successor features (SFs). Our proposed \\emph{universal successor features approximators} (USFAs) combine the advantages of all of these, namely the scalability of UVFAs, the instant inference of SFs, and the strong generalisation of GPI. 
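The two ingredients named in the USFA abstract above combine very simply at decision time: successor features make the value of policy i under a task vector w a dot product, Q_i(s, a; w) = psi_i(s, a) . w, and generalised policy improvement acts greedily with respect to the maximum of those values over the stored policies. The psi table below is random placeholder data for a single state; a USFA would instead be a network producing psi as a function of state, action, and a policy-conditioning vector.

```python
import numpy as np

rng = np.random.default_rng(0)

n_policies, n_actions, d = 4, 3, 5
psi = rng.normal(size=(n_policies, n_actions, d))   # psi_i(s, a) at a fixed state s
w_new_task = rng.normal(size=d)                     # description of an unseen task

q_values = psi @ w_new_task                         # Q_i(s, a; w), shape (policies, actions)
gpi_action = int(q_values.max(axis=0).argmax())     # max over policies, then over actions

print("Q(s, a) per stored policy:\n", q_values.round(2))
print("GPI action for the new task:", gpi_action)
```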
We discuss the challenges involved in training a USFA, its generalisation properties and demonstrate its practical benefits and transfer abilities on a large-scale domain in which the agent has to navigate in a first-person perspective three-dimensional environment. ", "keywords": "reinforcement learning;zero-shot transfer;successor features;universal value functions;general value functions", "primary_area": "", "supplementary_material": "", "author": "Diana Borsa;Andre Barreto;John Quan;Daniel J. Mankowitz;Hado van Hasselt;Remi Munos;David Silver;Tom Schaul", "authorids": "borsa@google.com;andrebarreto@google.com;johnquan@google.com;dmankowitz@google.com;hado@google.com;munos@google.com;davidsilver@google.com;schaul@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nborsa2018universal,\ntitle={Universal Successor Features Approximators},\nauthor={Diana Borsa and Andre Barreto and John Quan and Daniel J. Mankowitz and Hado van Hasselt and Remi Munos and David Silver and Tom Schaul},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1VWjiRcKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "2;4;3", "wc_review": "111;554;349", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 338.0, 181.02117739829964 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 160, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14497638041903171358&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1VWjiRcKX", "pdf": "https://openreview.net/pdf?id=S1VWjiRcKX", "email": ";;;;;;;", "author_num": 8 }, { "id": "S1VeG309Fm", "title": "Teaching Machine How to Think by Natural Language: A study on Machine Reading Comprehension", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep learning ends up as a black box, in which how it makes the decision cannot be directly understood by humans, let alone guide the reasoning process of deep network. In this work, we seek the possibility to guide the learning of network in reading comprehension task by natural language. Two approaches are proposed. In the first approach, the latent representation in the neural network is deciphered into text by a decoder; in the second approach, deep network uses text as latent representation. Human tutor provides ground truth for the output of the decoder or latent representation represented by text. 
On the bAbI QA tasks, we found that with the guidance on a few examples, the model can achieve the same performance with remarkably less training examples.", "keywords": "Machine Reading Comprehension", "primary_area": "", "supplementary_material": "", "author": "Tsung Han Wu;Hung-yi Lee;Yu Tsao;ChaoI;Tuan", "authorids": ";;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=S1VeG309Fm", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9aD30mtnkY0J:scholar.google.com/&scioq=Teaching+Machine+How+to+Think+by+Natural+Language:+A+study+on+Machine+Reading+Comprehension&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "S1eB3sRqtm", "title": "Exploring Curvature Noise in Large-Batch Stochastic Optimization", "track": "main", "status": "Reject", "tldr": "Engineer large-batch training such that we retain fast training while achieving better generalization.", "abstract": "Using stochastic gradient descent (SGD) with large batch-sizes to train deep neural networks is an increasingly popular technique. By doing so, one can improve parallelization by scaling to multiple workers (GPUs) and hence leading to significant reductions in training time. Unfortunately, a major drawback is the so-called generalization gap: large-batch training typically leads to a degradation in generalization performance of the model as compared to small-batch training. In this paper, we propose to correct this generalization gap by adding diagonal Fisher curvature noise to large-batch gradient updates. We provide a theoretical analysis of our method in the convex quadratic setting. Our empirical study with state-of-the-art deep learning models shows that our method not only improves the generalization performance in large-batch training but furthermore, does so in a way where the training convergence remains desirable and the training duration is not elongated. We additionally connect our method to recent works on loss surface landscape in the experimental section. 
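The correction proposed in the curvature-noise abstract above amounts to perturbing the large-batch gradient with zero-mean Gaussian noise whose per-parameter variance tracks a diagonal Fisher estimate. In the sketch below the Fisher proxy is a running average of squared gradients and the noise scale `rho` is an arbitrary illustrative constant; the paper derives the appropriate scaling from the small-batch gradient covariance.

```python
import numpy as np

rng = np.random.default_rng(0)

def noisy_update(params, grad, fisher_diag, lr=0.1, rho=0.01):
    # Add diagonal-Fisher-shaped Gaussian noise to the large-batch gradient.
    noise = rng.normal(size=params.shape) * np.sqrt(rho * fisher_diag)
    return params - lr * (grad + noise)

params = rng.normal(size=10)
fisher_diag = np.zeros_like(params)

for step in range(100):
    grad = 2 * (params - 1.0)                         # toy quadratic loss (x - 1)^2
    fisher_diag = 0.9 * fisher_diag + 0.1 * grad ** 2  # diagonal Fisher proxy
    params = noisy_update(params, grad, fisher_diag)

print("distance to optimum after training:", np.abs(params - 1.0).mean().round(4))
```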
", "keywords": "optimization;large-batch training;generalization;noise covariance", "primary_area": "", "supplementary_material": "", "author": "Yeming Wen;Kevin Luk;Maxime Gazeau;Guodong Zhang;Harris Chan;Jimmy Ba", "authorids": "ywen@cs.toronto.edu;kevin.luk@borealisai.com;maxime.gazeau@borealisai.com;gdzhang.cs@gmail.com;hchan@cs.toronto.edu;jba@cs.toronto.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nwen2019exploring,\ntitle={Exploring Curvature Noise in Large-Batch Stochastic Optimization},\nauthor={Yeming Wen and Kevin Luk and Maxime Gazeau and Guodong Zhang and Harris Chan and Jimmy Ba},\nyear={2019},\nurl={https://openreview.net/forum?id=S1eB3sRqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1eB3sRqtm", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;4;4", "wc_review": "810;433;183", "wc_reply_reviewers": "228;577;0", "wc_reply_authors": "1008;2061;76", "reply_reviewers": "2;5;0", "reply_authors": "4;6;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 475.3333333333333, 257.7160366674056 ], "wc_reply_reviewers_avg": [ 268.3333333333333, 237.27948827396682 ], "wc_reply_authors_avg": [ 1048.3333333333333, 810.8745621583869 ], "reply_reviewers_avg": [ 2.3333333333333335, 2.0548046676563256 ], "reply_authors_avg": [ 3.6666666666666665, 2.0548046676563256 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ddX80AwX730J:scholar.google.com/&scioq=Exploring+Curvature+Noise+in+Large-Batch+Stochastic+Optimization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "S1eBzhRqK7", "title": "Evolutionary-Neural Hybrid Agents for Architecture Search", "track": "main", "status": "Reject", "tldr": "We propose a class of Evolutionary-Neural hybrid agents, that retain the best qualities of the two approaches.", "abstract": "Neural Architecture Search has recently shown potential to automate the design of Neural Networks. The use of Neural Network agents trained with Reinforcement Learning can offer the possibility to learn complex patterns, as well as the ability to explore a vast and compositional search space. On the other hand, evolutionary algorithms offer the greediness and sample efficiency needed for such an application, as each sample requires a considerable amount of resources. We propose a class of Evolutionary-Neural hybrid agents (Evo-NAS), that retain the best qualities of the two approaches. 
We show that the Evo-NAS agent can outperform both Neural and Evolutionary agents, both on a synthetic task, and on architecture search for a suite of text classification datasets.", "keywords": "Evolutionary;Architecture Search;NAS", "primary_area": "", "supplementary_material": "", "author": "Krzysztof Maziarz;Andrey Khorlin;Quentin de Laroussilhe;Andrea Gesmundo", "authorids": "kmaziarz@google.com;akhorlin@google.com;underflow@google.com;agesmundo@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmaziarz2019evolutionaryneural,\ntitle={Evolutionary-Neural Hybrid Agents for Architecture Search},\nauthor={Krzysztof Maziarz and Andrey Khorlin and Quentin de Laroussilhe and Andrea Gesmundo},\nyear={2019},\nurl={https://openreview.net/forum?id=S1eBzhRqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1eBzhRqK7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;2", "wc_review": "253;395;99", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "44;0;0", "reply_reviewers": "0;0;0", "reply_authors": "1;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 249.0, 120.87459065770054 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 14.666666666666666, 20.741798914805393 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18074572902824934344&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "S1eEdj0cK7", "title": "On the Relationship between Neural Machine Translation and Word Alignment", "track": "main", "status": "Reject", "tldr": "It proposes methods to induce word alignment for neural machine translation (NMT) and uses them to interpret the relationship between NMT and word alignment.", "abstract": "Prior researches suggest that attentional neural machine translation (NMT) is able to capture word alignment by attention, however, to our surprise, it almost fails for NMT models with multiple attentional layers except for those with a single layer. This paper introduce two methods to induce word alignment from general neural machine translation models. Experiments verify that both methods obtain much better word alignment than the method by attention. Furthermore, based on one of the proposed method, we design a criterion to divide target words into two categories (i.e. those mostly contributed from source \"CFS\" words and the other words mostly contributed from target \"CFT\" words), and analyze word alignment under these two categories in depth. We find that although NMT models are difficult to capture word alignment for CFT words but these words do not sacrifice translation quality significantly, which provides an explanation why NMT is more successful for translation yet worse for word alignment compared to statistical machine translation. 
We further demonstrate that word alignment errors for CFS words are responsible for translation errors in some extent by measuring the correlation between word alignment and translation for several NMT systems.", "keywords": "Neural Machine Translation;Word Alignment;Neural Network;Pointwise Mutual Information", "primary_area": "", "supplementary_material": "", "author": "Xintong Li;Lemao Liu;Guanlin Li;Max Meng;Shuming Shi", "authorids": "znculee@gmail.com;redmondliu@tencent.com;epsilonlee.green@gmail.com;max.meng@ieee.org;shumingshi@tencent.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019on,\ntitle={On the Relationship between Neural Machine Translation and Word Alignment},\nauthor={Xintong Li and Lemao Liu and Guanlin Li and Max Meng and Shuming Shi},\nyear={2019},\nurl={https://openreview.net/forum?id=S1eEdj0cK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1eEdj0cK7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "wc_review": "267;398;664", "wc_reply_reviewers": "145;0;35", "wc_reply_authors": "623;751;506", "reply_reviewers": "2;0;1", "reply_authors": "3;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 443.0, 165.1686007286696 ], "wc_reply_reviewers_avg": [ 60.0, 61.77917664283546 ], "wc_reply_authors_avg": [ 626.6666666666666, 100.0544296315205 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9BR-jUhoFtgJ:scholar.google.com/&scioq=On+the+Relationship+between+Neural+Machine+Translation+and+Word+Alignment&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S1eEmn05tQ", "title": "Uncertainty in Multitask Transfer Learning", "track": "main", "status": "Reject", "tldr": "A scalable method for learning an expressive prior over neural networks across multiple tasks.", "abstract": "Using variational Bayes neural networks, we develop an algorithm capable of accumulating knowledge into a prior from multiple different tasks. This results in a rich prior capable of few-shot learning on new tasks. The posterior can go beyond the mean field approximation and yields good uncertainty on the performed experiments. Analysis on toy tasks show that it can learn from significantly different tasks while finding similarities among them. Experiments on Mini-Imagenet reach state of the art with 74.5% accuracy on 5 shot learning. 
Finally, we provide two new benchmarks, each showing a failure mode of existing meta learning algorithms such as MAML and prototypical Networks.", "keywords": "Multi Task;Transfer Learning;Hierarchical Bayes;Variational Bayes;Meta Learning;Few Shot learning", "primary_area": "", "supplementary_material": "", "author": "Alexandre Lacoste;Boris Oreshkin;Wonchang Chung;Thomas Boquet;Negar Rostamzadeh;David Krueger", "authorids": "alex.lacoste.shmu@gmail.com;boris@elementai.com;wonchang@elementai.com;thomas@elementai.com;negar.rostamzadeh@gmail.com;david.scott.krueger@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nlacoste2019uncertainty,\ntitle={Uncertainty in Multitask Transfer Learning},\nauthor={Alexandre Lacoste and Boris Oreshkin and Wonchang Chung and Thomas Boquet and Negar Rostamzadeh and David Krueger},\nyear={2019},\nurl={https://openreview.net/forum?id=S1eEmn05tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1eEmn05tQ", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;5;4", "wc_review": "862;500;725", "wc_reply_reviewers": "0;0;135", "wc_reply_authors": "352;587;304", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 695.6666666666666, 149.23434219002303 ], "wc_reply_reviewers_avg": [ 45.0, 63.63961030678928 ], "wc_reply_authors_avg": [ 414.3333333333333, 123.65633380012885 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1112742765270124998&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "S1eFtj0cKQ", "title": "Generative Models from the perspective of Continual Learning", "track": "main", "status": "Reject", "tldr": "A comparative study of generative models on Continual Learning scenarios.", "abstract": "Which generative model is the most suitable for Continual Learning? This paper aims at evaluating and comparing generative models on disjoint sequential image generation tasks. We investigate how several models learn and forget, considering various strategies: rehearsal, regularization, generative replay and fine-tuning. We used two quantitative metrics to estimate the generation quality and memory ability. We experiment with sequential tasks on three commonly used benchmarks for Continual Learning (MNIST, Fashion MNIST and CIFAR10). We found that among all models, the original GAN performs best and among Continual Learning strategies, generative replay outperforms all other methods. 
Even if we found satisfactory combinations on MNIST and Fashion MNIST, training generative models sequentially on CIFAR10 is particularly instable, and remains a challenge.", "keywords": "Generative Models;Continual Learning", "primary_area": "", "supplementary_material": "", "author": "Timoth\u00e9e Lesort;Hugo Caselles-Dupr\u00e9;Michael Garcia-Ortiz;Jean-Fran\u00e7ois Goudou;David Filliat", "authorids": "timothee.lesort@thalesgroup.com;caselles@ensta.fr;mgarciaortiz@softbankrobotics.com;jean-francois.goudou@thalesgroup.com;david.filliat@ensta.fr", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nlesort2019generative,\ntitle={Generative Models from the perspective of Continual Learning},\nauthor={Timoth\u00e9e Lesort and Hugo Caselles-Dupr\u00e9 and Michael Garcia-Ortiz and Jean-Fran\u00e7ois Goudou and David Filliat},\nyear={2019},\nurl={https://openreview.net/forum?id=S1eFtj0cKQ},\n}", "github": "[![github](/images/github_icon.svg) TLESORT/Generative_Continual_Learning](https://github.com/TLESORT/Generative_Continual_Learning)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1eFtj0cKQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "wc_review": "396;209;854", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "753;219;637", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 486.3333333333333, 270.95674111480514 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 536.3333333333334, 229.33139534064867 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 203, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16761246148327543504&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 13 }, { "title": "Gradient Descent Provably Optimizes Over-parameterized Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/956", "id": "S1eK3i09YQ", "author_site": "Simon Du, Xiyu Zhai, Barnab\u00e1s P\u00f3czos, Aarti Singh", "tldr": "We prove gradient descent achieves zero training loss with a linear rate on over-parameterized neural networks.", "abstract": "One of the mysteries in the success of neural networks is randomly initialized first order methods like gradient descent can achieve zero training loss even though the objective function is non-convex and non-smooth. This paper demystifies this surprising phenomenon for two-layer fully connected ReLU activated neural networks. For an $m$ hidden node shallow neural network with ReLU activation and $n$ training data, we show as long as $m$ is large enough and no two inputs are parallel, randomly initialized gradient descent converges to a globally optimal solution at a linear convergence rate for the quadratic loss function.\n\nOur analysis relies on the following observation: over-parameterization and random initialization jointly restrict every weight vector to be close to its initialization for all iterations, which allows us to exploit a strong convexity-like property to show that gradient descent converges at a global linear rate to the global optimum. 
We believe these insights are also useful in analyzing deep models and other first order methods.", "keywords": "theory;non-convex optimization;overparameterization;gradient descent", "primary_area": "", "supplementary_material": "", "author": "Simon S. Du;Xiyu Zhai;Barnabas Poczos;Aarti Singh", "authorids": "ssdu@cs.cmu.edu;xiyuzhai@mit.edu;bapoczos@cs.cmu.edu;aartisingh@cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ndu2018gradient,\ntitle={Gradient Descent Provably Optimizes Over-parameterized Neural Networks},\nauthor={Simon S. Du and Xiyu Zhai and Barnabas Poczos and Aarti Singh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1eK3i09YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "3;7;8;8", "confidence": "5;4;4;4", "wc_review": "1814;419;363;359", "wc_reply_reviewers": "0;0;0;73", "wc_reply_authors": "768;192;72;192", "reply_reviewers": "0;0;0;1", "reply_authors": "2;1;1;2", "rating_avg": [ 6.5, 2.0615528128088303 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "wc_review_avg": [ 738.75, 621.2488933591753 ], "wc_reply_reviewers_avg": [ 18.25, 31.60992723813201 ], "wc_reply_authors_avg": [ 306.0, 271.19734511974855 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 42, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9801960588196067, "gs_citation": 888, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8128763459913409987&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1eK3i09YQ", "pdf": "https://openreview.net/pdf?id=S1eK3i09YQ", "email": ";;;", "author_num": 4 }, { "id": "S1eKJ3R5KQ", "title": "Answer-based Adversarial Training for Generating Clarification Questions", "track": "main", "status": "Withdraw", "tldr": "We propose an adversarial training approach to the problem of clarification question generation which uses the answer to the question to model the reward. ", "abstract": "We propose a generative adversarial training approach for the problem of clarification question generation. Our approach generates clarification questions with the goal of eliciting new information that would make the given context more complete. We develop a Generative Adversarial Network (GAN) where the generator is a sequence-to-sequence model and the discriminator is a utility function that models the value of updating the context with the answer to the clarification question. 
We evaluate on two datasets, using both automatic metrics and human judgments of usefulness, specificity and relevance, showing that our approach outperforms both a retrieval-based model and ablations that exclude the utility model and the adversarial training.\n", "keywords": "natural language processing;text generation;generative adversarial network", "primary_area": "", "supplementary_material": "", "author": "Sudha Rao;Hal Daum\u00e9 III", "authorids": "raosudha@cs.umd.edu;hal@umiacs.umd.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1eKJ3R5KQ", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;5;4", "wc_review": "496;614;258", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 456.0, 148.06304963314335 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 129, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=502436189650788664&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "S1eL8i0cFm", "title": "Cosine similarity-based Adversarial process", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "An adversarial process between two deep neural networks is a promising approach to train robust networks. In this study, we propose a framework for training networks that eliminates subsidiary information via the adversarial process. The objective of the proposed framework is to train a primary model that is robust to existing subsidiary information. This primary model can be used for various recognition tasks, such as digit recognition and speaker identification. Subsidiary information refers to the factors that might decrease the performance of the primary model such as channel information in speaker recognition and noise information in digit recognition.\nOur proposed framework comprises two discriminative models for the primary and subsidiary task, as well as an encoder network for feature representation. A subsidiary task is an operation associated with subsidiary information such as identifying the noise type. The discriminative model for the subsidiary task is trained for modeling the dependency of subsidiary class labels on codes from the encoder. Therefore, we expect that subsidiary information could be eliminated by training the encoder to reduce the dependency between the class labels and codes. In order to do so, we train the weight parameters of the subsidiary model; then, we develop the codes and the parameters of subsidiary model to make them orthogonal. For this purpose, we design a loss function to train the encoder based on cosine similarity between the weight parameters of the subsidiary model and codes. Finally, the proposed framework involves repeatedly performing the adversarial process of modeling the subsidiary information and eliminating it. 
Furthermore, we discuss possible applications of the proposed framework: reducing channel information for speaker identification and domain information for unsupervised domain adaptation. ", "keywords": "adversarial process;cosine similarity;speaker identification;domain adaptation", "primary_area": "", "supplementary_material": "", "author": "Hee-Soo Heo;Hye-Jin Shim;Jee-Weon Jung;IL-Ho Yang;Sung-Hyun Yoon;Ha-Jin Yu", "authorids": "zhasgone@naver.com;shimhyejin930615@gmail.com;aberforth19@naver.com;heisco@hanmail.net;ysh901108@naver.com;hjyu@uos.ac.kr", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1eL8i0cFm", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;3;4", "wc_review": "648;284;233", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 388.3333333333333, 184.7887682974506 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8130853108164881070&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Opportunistic Learning: Budgeted Cost-Sensitive Learning from Data Streams", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/758", "id": "S1eOHo09KX", "author_site": "Mohammad Kachuee, Orpaz Goldstein, Kimmo K\u00e4rkk\u00e4inen, Sajad Darabi, Majid Sarrafzadeh", "tldr": "An online algorithm for cost-aware feature acquisition and prediction", "abstract": "In many real-world learning scenarios, features are only acquirable at a cost constrained under a budget. In this paper, we propose a novel approach for cost-sensitive feature acquisition at the prediction-time. The suggested method acquires features incrementally based on a context-aware feature-value function. We formulate the problem in the reinforcement learning paradigm, and introduce a reward function based on the utility of each feature. Specifically, MC dropout sampling is used to measure expected variations of the model uncertainty which is used as a feature-value function. Furthermore, we suggest sharing representations between the class predictor and value function estimator networks. The suggested approach is completely online and is readily applicable to stream learning setups. The solution is evaluated on three different datasets including the well-known MNIST dataset as a benchmark as well as two cost-sensitive datasets: Yahoo Learning to Rank and a dataset in the medical domain for diabetes classification. According to the results, the proposed method is able to efficiently acquire features and make accurate predictions. 
", "keywords": "Cost-Aware Learning;Feature Acquisition;Reinforcement Learning;Stream Learning;Deep Q-Learning", "primary_area": "", "supplementary_material": "", "author": "Mohammad Kachuee;Orpaz Goldstein;Kimmo K\u00e4rkk\u00e4inen;Sajad Darabi;Majid Sarrafzadeh", "authorids": "mkachuee@cs.ucla.edu;orpgol@cs.ucla.edu;kimmo@cs.ucla.edu;sajad.darabi@cs.ucla.edu;majid@cs.ucla.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nkachuee2018opportunistic,\ntitle={Opportunistic Learning: Budgeted Cost-Sensitive Learning from Data Streams},\nauthor={Mohammad Kachuee and Orpaz Goldstein and Kimmo K\u00e4rkk\u00e4inen and Majid Sarrafzadeh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1eOHo09KX},\n}", "github": "[![github](/images/github_icon.svg) mkachuee/Opportunistic](https://github.com/mkachuee/Opportunistic)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "wc_review": "355;302;755", "wc_reply_reviewers": "351;115;0", "wc_reply_authors": "1267;1153;1165", "reply_reviewers": "3;1;0", "reply_authors": "5;2;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 470.6666666666667, 202.21495713445356 ], "wc_reply_reviewers_avg": [ 155.33333333333334, 146.10574557110644 ], "wc_reply_authors_avg": [ 1195.0, 51.146847410177685 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=926797319762361897&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=S1eOHo09KX", "pdf": "https://openreview.net/pdf?id=S1eOHo09KX", "email": ";;;;", "author_num": 5 }, { "id": "S1eVe2AqKX", "title": "PCNN: Environment Adaptive Model Without Finetuning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Convolutional Neural Networks (CNNs) have achieved tremendous success for many computer vision tasks, which shows a promising perspective of deploying CNNs on mobile platforms. An obstacle to this promising perspective is the tension between intensive resource consumption of CNNs and limited resource budget on mobile platforms. Existing works generally utilize a simpler architecture with lower accuracy for a higher energy-efficiency, \\textit{i.e.}, trading accuracy for resource consumption. An emerging opportunity to both increasing accuracy and decreasing resource consumption is \\textbf{class skew}, \\textit{i.e.}, the strong temporal and spatial locality of the appearance of classes. However, it is challenging to efficiently utilize the class skew due to both the frequent switches and the huge number of class skews. Existing works use transfer learning to adapt the model towards the class skew during runtime, which consumes resource intensively. In this paper, we propose \\textbf{probability layer}, an \\textit{easily-implemented and highly flexible add-on module} to adapt the model efficiently during runtime \\textit{without any fine-tuning} and achieving an \\textit{equivalent or better} performance than transfer learning. 
Further, both \\textit{increasing accuracy} and \\textit{decreasing resource consumption} can be achieved during runtime through the combination of probability layer and pruning methods.", "keywords": "Class skew;Runtime adaption", "primary_area": "", "supplementary_material": "", "author": "Boyuan Feng;Kun Wan;Shu Yang;Yufei Ding", "authorids": "boyuan@cs.ucsb.edu;kun@cs.ucsb.edu;shuyang1995@ucsb.edu;yufeiding@cs.ucsb.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nfeng2019pcnn,\ntitle={{PCNN}: Environment Adaptive Model Without Finetuning},\nauthor={Boyuan Feng and Kun Wan and Shu Yang and Yufei Ding},\nyear={2019},\nurl={https://openreview.net/forum?id=S1eVe2AqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1eVe2AqKX", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "wc_review": "167;449;539", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 385.0, 158.46766231632245 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7LyOTCVaDiEJ:scholar.google.com/&scioq=PCNN:+Environment+Adaptive+Model+Without+Finetuning&hl=en&as_sdt=0,14", "gs_version_total": 0 }, { "id": "S1eX-nA5KX", "title": "VHEGAN: Variational Hetero-Encoder Randomized GAN for Zero-Shot Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "To extract and relate visual and linguistic concepts from images and textual descriptions for text-based zero-shot learning (ZSL), we develop variational hetero-encoder (VHE) that decodes text via a deep probabilisitic topic model, the variational posterior of whose local latent variables is encoded from an image via a Weibull distribution based inference network. To further improve VHE and add an image generator, we propose VHE randomized generative adversarial net (VHEGAN) that exploits the synergy between VHE and GAN through their shared latent space. After training with a hybrid stochastic-gradient MCMC/variational inference/stochastic gradient descent inference algorithm, VHEGAN can be used in a variety of settings, such as text generation/retrieval conditioning on an image, image generation/retrieval conditioning on a document/image, and generation of text-image pairs. 
The efficacy of VHEGAN is demonstrated quantitatively with experiments on both conventional and generalized ZSL tasks, and qualitatively on (conditional) image and/or text generation/retrieval.", "keywords": "Deep generative models;deep topic modeling;generative adversarial learning;variational encoder;zero-short learning", "primary_area": "", "supplementary_material": "", "author": "Hao Zhang;Bo Chen;Long Tian;Zhengjue Wang;Mingyuan Zhou", "authorids": "zhanghao_xidian@163.com;bchen@mail.xidian.edu.cn;zhengjuewang@163.com;tianlong_xidian@163.com;mingyuan.zhou@mccombs.utexas.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhang2019vhegan,\ntitle={{VHEGAN}: Variational Hetero-Encoder Randomized {GAN} for Zero-Shot Learning},\nauthor={Hao Zhang and Bo Chen and Long Tian and Zhengjue Wang and Mingyuan Zhou},\nyear={2019},\nurl={https://openreview.net/forum?id=S1eX-nA5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1eX-nA5KX", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;5;4", "wc_review": "366;379;158", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "704;476;393", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 301.0, 101.25545252808199 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 524.3333333333334, 131.48468436370155 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FxpXc5OJq7gJ:scholar.google.com/&scioq=VHEGAN:+Variational+Hetero-Encoder+Randomized+GAN+for+Zero-Shot+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "DARTS: Differentiable Architecture Search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1030", "id": "S1eYHoC5FX", "author_site": "Hanxiao Liu, Karen Simonyan, Yiming Yang", "tldr": "We propose a differentiable architecture search algorithm for both convolutional and recurrent networks, achieving competitive performance with the state of the art using orders of magnitude less computation resources.", "abstract": "This paper addresses the scalability challenge of architecture search by formulating the task in a differentiable manner. Unlike conventional approaches of applying evolution or reinforcement learning over a discrete and non-differentiable search space, our method is based on the continuous relaxation of the architecture representation, allowing efficient search of the architecture using gradient descent. 
Extensive experiments on CIFAR-10, ImageNet, Penn Treebank and WikiText-2 show that our algorithm excels in discovering high-performance convolutional architectures for image classification and recurrent architectures for language modeling, while being orders of magnitude faster than state-of-the-art non-differentiable techniques.", "keywords": "deep learning;autoML;neural architecture search;image classification;language modeling", "primary_area": "", "supplementary_material": "", "author": "Hanxiao Liu;Karen Simonyan;Yiming Yang", "authorids": "hanxiaol@cs.cmu.edu;simonyan@google.com;yiming@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nliu2018darts,\ntitle={{DARTS}: Differentiable Architecture Search},\nauthor={Hanxiao Liu and Karen Simonyan and Yiming Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1eYHoC5FX},\n}", "github": "[![github](/images/github_icon.svg) quark0/darts](https://github.com/quark0/darts) + [![Papers with Code](/images/pwc_icon.svg) 56 community implementations](https://paperswithcode.com/paper/?openreview=S1eYHoC5FX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "2;5;3", "wc_review": "442;1425;122", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "284;1013;82", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 663.0, 554.4264303464137 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 459.6666666666667, 399.8619206120477 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 44, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.32732683535398854, "gs_citation": 5782, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=895422516420751823&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=S1eYHoC5FX", "pdf": "https://openreview.net/pdf?id=S1eYHoC5FX", "email": ";;", "author_num": 3 }, { "id": "S1e_H3AqYQ", "title": "Exploiting Cross-Lingual Subword Similarities in Low-Resource Document Classification", "track": "main", "status": "Reject", "tldr": "We propose a cross-lingual document classification framework for related language pairs.", "abstract": "Text classification must sometimes be applied in situations with no training data in a target language. However, training data may be available in a related language. We introduce a cross-lingual document classification framework CACO between related language pairs. To best use limited training data, our transfer learning scheme exploits cross-lingual subword similarity by jointly training a character-based embedder and a word-based classifier. The embedder derives vector representations for input words from their written forms, and the classifier makes predictions based on the word vectors. We use a joint character representation for both the source language and the target language, which allows the embedder to generalize knowledge about source language words to target language words with similar forms. 
We propose a multi-task objective that can further improve the model if additional cross-lingual or monolingual resources are available. CACO models trained under low-resource settings rival cross-lingual word embedding models trained under high-resource settings on related language pairs.\n", "keywords": "cross-lingual transfer;character-based method;low-resource language", "primary_area": "", "supplementary_material": "", "author": "Mozhi Zhang;Yoshinari Fujinuma;Jordan Boyd-Graber", "authorids": "mozhi@cs.umd.edu;yoshinari.fujinuma@colorado.edu;jbg@umiacs.umd.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2019exploiting,\ntitle={Exploiting Cross-Lingual Subword Similarities in Low-Resource Document Classification},\nauthor={Mozhi Zhang and Yoshinari Fujinuma and Jordan Boyd-Graber},\nyear={2019},\nurl={https://openreview.net/forum?id=S1e_H3AqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S1e_H3AqYQ", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;3;4", "wc_review": "279;370;383", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "198;118;520", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 344.0, 46.26733909213568 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 278.6666666666667, 173.74566341510672 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10837355471581326985&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "S1e_ssC5F7", "title": "Hyper-Regularization: An Adaptive Choice for the Learning Rate in Gradient Descent", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a novel approach for adaptively selecting the learning rate in gradient descent methods. Specifically, we impose a regularization term on the learning rate via a generalized distance, and cast the joint updating process of the parameter and the learning rate into a maxmin problem. Some existing schemes such as AdaGrad (diagonal version) and WNGrad can be rederived from our approach. Based on our approach, the updating rules for the learning rate do not rely on the smoothness constant of optimization problems and are robust to the initial learning rate. 
We theoretically analyze our approach in full batch and online learning settings, which achieves comparable performances with other first-order gradient-based algorithms in terms of accuracy as well as convergence rate.", "keywords": "Adaptive learning rate;novel framework", "primary_area": "", "supplementary_material": "", "author": "Guangzeng Xie;Hao Jin;Dachao Lin;Zhihua Zhang", "authorids": "smsxgz@pku.edu.cn;jin.hao@pku.edu.cn;lindachao@pku.edu.cn;zhzhang@math.pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nxie2019hyperregularization,\ntitle={Hyper-Regularization: An Adaptive Choice for the Learning Rate in Gradient Descent},\nauthor={Guangzeng Xie and Hao Jin and Dachao Lin and Zhihua Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=S1e_ssC5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1e_ssC5F7", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;5", "wc_review": "186;702;282", "wc_reply_reviewers": "12;0;0", "wc_reply_authors": "111;726;186", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 390.0, 224.07141718657468 ], "wc_reply_reviewers_avg": [ 4.0, 5.656854249492381 ], "wc_reply_authors_avg": [ 341.0, 273.95255063605447 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:riW_fZKgY5EJ:scholar.google.com/&scioq=Hyper-Regularization:+An+Adaptive+Choice+for+the+Learning+Rate+in+Gradient+Descent&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Feature-Wise Bias Amplification", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1127", "id": "S1ecm2C9K7", "author_site": "Klas Leino, Matt Fredrikson, Emily Black, Shayak Sen, Anupam Datta", "tldr": "", "abstract": "We study the phenomenon of bias amplification in classifiers, wherein a machine learning model learns to predict classes with a greater disparity than the underlying ground truth. We demonstrate that bias amplification can arise via inductive bias in gradient descent methods resulting in overestimation of importance of moderately-predictive ``weak'' features if insufficient training data is available. This overestimation gives rise to feature-wise bias amplification -- a previously unreported form of bias that can be traced back to the features of a trained model. Through analysis and experiments, we show that the while some bias cannot be mitigated without sacrificing accuracy, feature-wise bias amplification can be mitigated through targeted feature selection. We present two new feature selection algorithms for mitigating bias amplification in linear models, and show how they can be adapted to convolutional neural networks efficiently. 
Our experiments on synthetic and real data demonstrate that these algorithms consistently lead to reduced bias without harming accuracy, in some cases eliminating predictive bias altogether while providing modest gains in accuracy.", "keywords": "bias;bias amplification;classification", "primary_area": "", "supplementary_material": "", "author": "Klas Leino;Emily Black;Matt Fredrikson;Shayak Sen;Anupam Datta", "authorids": "kleino@cs.cmu.edu;emilybla@cs.cmu.edu;mfredrik@cs.cmu.edu;shayaks@cs.cmu.edu;danupam@cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nleino2018featurewise,\ntitle={Feature-Wise Bias Amplification},\nauthor={Klas Leino and Matt Fredrikson and Emily Black and Shayak Sen and Anupam Datta},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1ecm2C9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;4;4", "wc_review": "313;337;724", "wc_reply_reviewers": "85;372;603", "wc_reply_authors": "435;871;1083", "reply_reviewers": "1;1;3", "reply_authors": "2;2;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 458.0, 188.3454273402994 ], "wc_reply_reviewers_avg": [ 353.3333333333333, 211.88414024860114 ], "wc_reply_authors_avg": [ 796.3333333333334, 269.76203505229483 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17718568382375744404&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=S1ecm2C9K7", "pdf": "https://openreview.net/pdf?id=S1ecm2C9K7", "email": ";;;;", "author_num": 5 }, { "id": "S1ej8o05tm", "title": "Object detection deep learning networks for Optical Character Recognition", "track": "main", "status": "Reject", "tldr": "Yolo / RCNN neural network for object detection adapted to the task of OCR", "abstract": "In this article, we show how we applied a simple approach coming from deep learning networks for object detection to the task of optical character recognition in order to build image features taylored for documents. In contrast to scene text reading in natural images using networks pretrained on ImageNet, our document reading is performed with small networks inspired by MNIST digit recognition challenge, at a small computational budget and a small stride. The object detection modern frameworks allow a direct end-to-end training, with no other algorithm than the deep learning and the non-max-suppression algorithm to filter the duplicate predictions. 
The trained weights can be used for higher level models, such as, for example, document classification, or document segmentation.\n", "keywords": "OCR;object detection;RCNN;Yolo", "primary_area": "", "supplementary_material": "", "author": "Christopher Bourez;Aurelien Coquard", "authorids": "christopher.bourez@gmail.com;acq@ivalua.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbourez2019object,\ntitle={Object detection deep learning networks for Optical Character Recognition},\nauthor={Christopher Bourez and Aurelien Coquard},\nyear={2019},\nurl={https://openreview.net/forum?id=S1ej8o05tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1ej8o05tm", "pdf_size": 0, "rating": "1;1;2;2", "confidence": "5;5;5;5", "wc_review": "558;238;135;84", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 1.5, 0.5 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 253.75, 184.20962922713895 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MuzOZWpJDfYJ:scholar.google.com/&scioq=Object+detection+deep+learning+networks+for+Optical+Character+Recognition&hl=en&as_sdt=0,14", "gs_version_total": 0 }, { "id": "S1en0sRqKm", "title": "On the Computational Inefficiency of Large Batch Sizes for Stochastic Gradient Descent", "track": "main", "status": "Reject", "tldr": "Large batch training results in rapidly diminishing returns in wall-clock time to convergence to find a good model.", "abstract": "Increasing the mini-batch size for stochastic gradient descent offers significant opportunities to reduce wall-clock training time, but there are a variety of theoretical and systems challenges that impede the widespread success of this technique (Daset al., 2016; Keskar et al., 2016). We investigate these issues, with an emphasis on time to convergence and total computational cost, through an extensive empirical analysis of network training across several architectures and problem domains, including image classification, image segmentation, and language modeling. Although it is common practice to increase the batch size in order to fully exploit available computational resources, we find a substantially more nuanced picture. Our main finding is that across a wide range of network architectures and problem domains, increasing the batch size beyond a certain point yields no decrease in wall-clock time to convergence for either train or test loss. This batch size is usually substantially below the capacity of current systems. 
We show that popular training strategies for large batch size optimization begin to fail before we can populate all available compute resources, and we show that the point at which these methods break down depends more on attributes like model architecture and data complexity than it does directly on the size of the dataset.", "keywords": "Deep learning;large batch training;scaling rules;stochastic gradient descent", "primary_area": "", "supplementary_material": "", "author": "Noah Golmant;Nikita Vemuri;Zhewei Yao;Vladimir Feinberg;Amir Gholami;Kai Rothauge;Michael Mahoney;Joseph Gonzalez", "authorids": "noah.golmant@berkeley.edu;nikitavemuri@berkeley.edu;zheweiy@berkeley.edu;vladf@berkeley.edu;amirgh@berkeley.edu;kai.rothauge@berkeley.edu;mmahoney@stat.berkeley.edu;jegonzal@cs.berkeley.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\ngolmant2019on,\ntitle={On the Computational Inefficiency of Large Batch Sizes for Stochastic Gradient Descent},\nauthor={Noah Golmant and Nikita Vemuri and Zhewei Yao and Vladimir Feinberg and Amir Gholami and Kai Rothauge and Michael Mahoney and Joseph Gonzalez},\nyear={2019},\nurl={https://openreview.net/forum?id=S1en0sRqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1en0sRqKm", "pdf_size": 0, "rating": "5;5;8", "confidence": "3;3;4", "wc_review": "302;336;266", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "547;713;207", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 301.3333333333333, 28.581268146968025 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 489.0, 210.60547634538534 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=614261227121747028&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "The relativistic discriminator: a key element missing from standard GAN", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/728", "id": "S1erHoR5t7", "tldr": "Improving the quality and stability of GANs using a relativistic discriminator; IPM GANs (such as WGAN-GP) are a special case.", "abstract": "In standard generative adversarial network (SGAN), the discriminator estimates the probability that the input data is real. The generator is trained to increase the probability that fake data is real. We argue that it should also simultaneously decrease the probability that real data is real because 1) this would account for a priori knowledge that half of the data in the mini-batch is fake, 2) this would be observed with divergence minimization, and 3) in optimal settings, SGAN would be equivalent to integral probability metric (IPM) GANs. \n\nWe show that this property can be induced by using a relativistic discriminator which estimate the probability that the given real data is more realistic than a randomly sampled fake data. We also present a variant in which the discriminator estimate the probability that the given real data is more realistic than fake data, on average. 
We generalize both approaches to non-standard GAN loss functions and we refer to them respectively as Relativistic GANs (RGANs) and Relativistic average GANs (RaGANs). We show that IPM-based GANs are a subset of RGANs which use the identity function. \n\nEmpirically, we observe that 1) RGANs and RaGANs are significantly more stable and generate higher quality data samples than their non-relativistic counterparts, 2) Standard RaGAN with gradient penalty generate data of better quality than WGAN-GP while only requiring a single discriminator update per generator update (reducing the time taken for reaching the state-of-the-art by 400%), and 3) RaGANs are able to generate plausible high resolutions images (256x256) from a very small sample (N=2011), while GAN and LSGAN cannot; these images are of significantly better quality than the ones generated by WGAN-GP and SGAN with spectral normalization.\n\nThe code is freely available on https://github.com/AlexiaJM/RelativisticGAN.", "keywords": "AI;deep learning;generative models;GAN", "primary_area": "", "supplementary_material": "", "author": "Alexia Jolicoeur-Martineau", "authorids": "alexia.jolicoeur-martineau@mail.mcgill.ca", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\njolicoeur-martineau2018,\ntitle={ The relativistic discriminator: a key element missing from standard {GAN}},\nauthor={Alexia Jolicoeur-Martineau},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1erHoR5t7},\n}", "github": "[![github](/images/github_icon.svg) AlexiaJM/RelativisticGAN](https://github.com/AlexiaJM/RelativisticGAN) + [![Papers with Code](/images/pwc_icon.svg) 9 community implementations](https://paperswithcode.com/paper/?openreview=S1erHoR5t7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;2;3", "wc_review": "209;268;209", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "405;695;66", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 228.66666666666666, 27.81286672667087 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 388.6666666666667, 257.04776901493534 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1409, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9348243398459465041&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=S1erHoR5t7", "pdf": "https://openreview.net/pdf?id=S1erHoR5t7", "email": "", "author_num": 1 }, { "id": "S1ey2sRcYQ", "title": "Direct Optimization through $\\arg \\max$ for Discrete Variational Auto-Encoder", "track": "main", "status": "Reject", "tldr": "", "abstract": "Reparameterization of variational auto-encoders is an effective method for reducing the variance of their gradient estimates. However, when the latent variables are discrete, a reparameterization is problematic due to discontinuities in the discrete space. In this work, we extend the direct loss minimization technique to discrete variational auto-encoders. 
We first reparameterize a discrete random variable using the $\\arg \\max$ function of the Gumbel-Max perturbation model. We then use direct optimization to propagate gradients through the non-differentiable $\\arg \\max$ using two perturbed $\\arg \\max$ operations.\n", "keywords": "discrete variational auto encoders;generative models;perturbation models", "primary_area": "", "supplementary_material": "", "author": "Guy Lorberbom;Tamir Hazan", "authorids": "guy_lorber@campus.technion.ac.il;tamir.hazan@technion.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlorberbom2019direct,\ntitle={Direct Optimization through $\\arg \\max$ for Discrete Variational Auto-Encoder},\nauthor={Guy Lorberbom and Tamir Hazan},\nyear={2019},\nurl={https://openreview.net/forum?id=S1ey2sRcYQ},\n}", "github": "[![github](/images/github_icon.svg) GuyLor/direct_vae](https://github.com/GuyLor/direct_vae) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=S1ey2sRcYQ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1ey2sRcYQ", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;4", "wc_review": "507;228;1081", "wc_reply_reviewers": "0;0;1128", "wc_reply_authors": "697;0;445", "reply_reviewers": "0;0;2", "reply_authors": "1;0;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 605.3333333333334, 355.1096857153982 ], "wc_reply_reviewers_avg": [ 376.0, 531.7442994522837 ], "wc_reply_authors_avg": [ 380.6666666666667, 288.16237706466046 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-dDK4HIbfCsJ:scholar.google.com/&scioq=Direct+Optimization+through+%24%5Carg+%5Cmax%24+for+Discrete+Variational+Auto-Encoder&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S1fDssA5Y7", "title": "Distributionally Robust Optimization Leads to Better Generalization: on SGD and Beyond", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we adopt distributionally robust optimization (DRO) (Ben-Tal et al., 2013) in hope to achieve a better generalization in deep learning tasks. We establish the generalization guarantees and analyze the localized Rademacher complexity for DRO, and conduct experiments to show that DRO obtains a better performance. We reveal the profound connection between SGD and DRO, i.e., selecting a batch can be viewed as choosing a distribution over the training set. From this perspective, we prove that SGD is prone to escape from bad stationary points and small batch SGD outperforms large batch SGD. We give an upper bound for the robust loss when SGD converges and keeps stable. We propose a novel Weighted SGD (WSGD) algorithm framework, which assigns high-variance weights to the data of the current batch. We devise a practical implement of WSGD that can directly optimize the robust loss. 
We test our algorithm on CIFAR-10 and CIFAR-100, and WSGD achieves significant improvements over the conventional SGD.", "keywords": "distributionally robust optimization;deep learning;SGD;learning theory", "primary_area": "", "supplementary_material": "", "author": "Jikai Hou;Kaixuan Huang;Zhihua Zhang", "authorids": "houjikai@pku.edu.cn;hackyhuang@pku.edu.cn;zhzhang@math.pku.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhou2019distributionally,\ntitle={Distributionally Robust Optimization Leads to Better Generalization: on {SGD} and Beyond},\nauthor={Jikai Hou and Kaixuan Huang and Zhihua Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=S1fDssA5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1fDssA5Y7", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;4;4", "wc_review": "836;1050;278", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "700;1402;38", "reply_reviewers": "0;0;0", "reply_authors": "1;3;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 721.3333333333334, 325.43031341423756 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 713.3333333333334, 556.9304764111546 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12268882465310058672&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "S1fNJhRqFX", "title": "Exploration using Distributional RL and UCB", "track": "main", "status": "Withdraw", "tldr": "Exploration using Distributional RL and truncagted variance.", "abstract": " We establish the relation between Distributional RL and the Upper Confidence Bound (UCB) approach to exploration.\n In this paper we show that the density of the Q function estimated by Distributional RL can be successfully used for the estimation of UCB. This approach does not require counting and, therefore, generalizes well to the Deep RL. We also point to the asymmetry of the empirical densities estimated by the Distributional RL algorithms like QR-DQN. This observation leads to the reexamination of the variance's performance in the UCB type approach to exploration. We introduce truncated variance as an alternative estimator of the UCB and a novel algorithm based on it. We empirically show that newly introduced algorithm achieves better performance in multi-armed bandits setting. Finally, we extend this approach to high-dimensional setting and test it on the Atari 2600 games. 
New approach achieves better performance compared to QR-DQN in 26 of games, 13 ties out of 49 games.", "keywords": "Distributional RL;UCB;exploration;Atari 2600;multi-armed bandits", "primary_area": "", "supplementary_material": "", "author": "Borislav Mavrin;Hengshuai Yao;Linglong Kong;ShangtongZhang", "authorids": "mavrin@ualberta.ca;hengshuai.yao@huawei.com;lkong@ualberta.ca;zhangshangtong.cpp@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1fNJhRqFX", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;3", "wc_review": "411;367;291", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "94;123;179", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 356.3333333333333, 49.56701412117628 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 132.0, 35.27983371087039 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3F9NV04023gJ:scholar.google.com/&scioq=Exploration+using+Distributional+RL+and+UCB&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Understanding and Improving Interpolation in Autoencoders via an Adversarial Regularizer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/799", "id": "S1fQSiCcYm", "author_site": "David Berthelot, Colin Raffel, Aurko Roy, Ian Goodfellow", "tldr": "We propose a regularizer that improves interpolation and autoencoders and show that it also improves the learned representation for downstream tasks.", "abstract": "Autoencoders provide a powerful framework for learning compressed representations by encoding all of the information needed to reconstruct a data point in a latent code. In some cases, autoencoders can \"interpolate\": By decoding the convex combination of the latent codes for two datapoints, the autoencoder can produce an output which semantically mixes characteristics from the datapoints. In this paper, we propose a regularization procedure which encourages interpolated outputs to appear more realistic by fooling a critic network which has been trained to recover the mixing coefficient from interpolated data. We then develop a simple benchmark task where we can quantitatively measure the extent to which various autoencoders can interpolate and show that our regularizer dramatically improves interpolation in this setting. 
We also demonstrate empirically that our regularizer produces latent codes which are more effective on downstream tasks, suggesting a possible link between interpolation abilities and learning useful representations.", "keywords": "autoencoders;interpolation;unsupervised learning;representation learning;adversarial learning", "primary_area": "", "supplementary_material": "", "author": "David Berthelot*;Colin Raffel*;Aurko Roy;Ian Goodfellow", "authorids": "dberth@google.com;craffel@gmail.com;aurkor@google.com;goodfellow@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nberthelot*2018understanding,\ntitle={Understanding and Improving Interpolation in Autoencoders via an Adversarial Regularizer},\nauthor={David Berthelot* and Colin Raffel* and Aurko Roy and Ian Goodfellow},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1fQSiCcYm},\n}", "github": "[![github](/images/github_icon.svg) brain-research/acai](https://github.com/brain-research/acai) + [![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=S1fQSiCcYm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;8;9", "confidence": "4;3;4", "wc_review": "289;379;190", "wc_reply_reviewers": "951;24;0", "wc_reply_authors": "1960;363;358", "reply_reviewers": "3;1;0", "reply_authors": "6;1;1", "rating_avg": [ 8.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 286.0, 77.18808198161165 ], "wc_reply_reviewers_avg": [ 325.0, 442.7572698443245 ], "wc_reply_authors_avg": [ 893.6666666666666, 754.0142939994942 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.6666666666666665, 2.3570226039551585 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 338, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4790513317265776731&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=S1fQSiCcYm", "pdf": "https://openreview.net/pdf?id=S1fQSiCcYm", "email": ";;;", "author_num": 4 }, { "title": "Quasi-hyperbolic momentum and Adam for deep learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/716", "id": "S1fUpoR5FQ", "author_site": "Jerry Ma, Denis Yarats", "tldr": "Mix plain SGD and momentum (or do something similar with Adam) for great profit.", "abstract": "Momentum-based acceleration of stochastic gradient descent (SGD) is widely used in deep learning. We propose the quasi-hyperbolic momentum algorithm (QHM) as an extremely simple alteration of momentum SGD, averaging a plain SGD step with a momentum step. We describe numerous connections to and identities with other algorithms, and we characterize the set of two-state optimization algorithms that QHM can recover. Finally, we propose a QH variant of Adam called QHAdam, and we empirically demonstrate that our algorithms lead to significantly improved training in a variety of settings, including a new state-of-the-art result on WMT16 EN-DE. We hope that these empirical results, combined with the conceptual and practical simplicity of QHM and QHAdam, will spur interest from both practitioners and researchers. 
Code is immediately available.", "keywords": "sgd;momentum;nesterov;adam;qhm;qhadam;optimization", "primary_area": "", "supplementary_material": "", "author": "Jerry Ma;Denis Yarats", "authorids": "maj@fb.com;denisy@fb.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nma2018quasihyperbolic,\ntitle={Quasi-hyperbolic momentum and Adam for deep learning},\nauthor={Jerry Ma and Denis Yarats},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1fUpoR5FQ},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/qhoptim](https://github.com/facebookresearch/qhoptim) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=S1fUpoR5FQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;3", "wc_review": "362;792;233", "wc_reply_reviewers": "0;480;22", "wc_reply_authors": "374;1058;250", "reply_reviewers": "0;1;1", "reply_authors": "1;3;2", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 462.3333333333333, 238.98442534097398 ], "wc_reply_reviewers_avg": [ 167.33333333333334, 221.27107558126272 ], "wc_reply_authors_avg": [ 560.6666666666666, 355.29268060134433 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 189, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4018448922538302075&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S1fUpoR5FQ", "pdf": "https://openreview.net/pdf?id=S1fUpoR5FQ", "email": ";", "author_num": 2 }, { "id": "S1fcnoR9K7", "title": "Learning with Random Learning Rates.", "track": "main", "status": "Reject", "tldr": "We test stochastic gradient descent with random per-feature learning rates in neural networks, and find performance comparable to using SGD with the optimal learning rate, alleviating the need for learning rate tuning.", "abstract": "Hyperparameter tuning is a bothersome step in the training of deep learning models. One of the most sensitive hyperparameters is the learning rate of the gradient descent. We present the All Learning Rates At Once (Alrao) optimization method for neural networks: each unit or feature in the network gets its own learning rate sampled from a random distribution spanning several orders of magnitude. This comes at practically no computational cost. Perhaps surprisingly, stochastic gradient descent (SGD) with Alrao performs close to SGD with an optimally tuned learning rate, for various architectures and problems. Alrao could save time when testing deep learning models: a range of models could be quickly assessed with Alrao, and the most promising models could then be trained more extensively. 
This text comes with a PyTorch implementation of the method, which can be plugged on an existing PyTorch model.", "keywords": "step size;stochastic gradient descent;hyperparameter tuning", "primary_area": "", "supplementary_material": "", "author": "L\u00e9onard Blier;Pierre Wolinski;Yann Ollivier", "authorids": "leonardb@fb.com;pierre.wolinski@u-psud.fr;yol@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nblier2019learning,\ntitle={Learning with Random Learning Rates.},\nauthor={L\u00e9onard Blier and Pierre Wolinski and Yann Ollivier},\nyear={2019},\nurl={https://openreview.net/forum?id=S1fcnoR9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1fcnoR9K7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "wc_review": "747;303;488", "wc_reply_reviewers": "0;0;76", "wc_reply_authors": "533;264;618", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 512.6666666666666, 182.09948440954525 ], "wc_reply_reviewers_avg": [ 25.333333333333332, 35.82674358011841 ], "wc_reply_authors_avg": [ 471.6666666666667, 150.8870070247564 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6456266193345141498&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Local SGD Converges Fast and Communicates Little", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/979", "id": "S1g2JnRcFX", "author_site": "Sebastian Stich", "tldr": "We prove that parallel local SGD achieves linear speedup with much lesser communication than parallel mini-batch SGD.", "abstract": "Mini-batch stochastic gradient descent (SGD) is state of the art in large scale distributed training. The scheme can reach a linear speed-up with respect to the number of workers, but this is rarely seen in practice as the scheme often suffers from large network delays and bandwidth limits. To overcome this communication bottleneck recent works propose to reduce the communication frequency. An algorithm of this type is local SGD that runs SGD independently in parallel on different workers and averages the sequences only once in a while. This scheme shows promising results in practice, but eluded thorough theoretical analysis.\n \nWe prove concise convergence rates for local SGD on convex problems and show that it converges at the same rate as mini-batch SGD in terms of number of evaluated gradients, that is, the scheme achieves linear speed-up in the number of workers and mini-batch size. The number of communication rounds can be reduced up to a factor of T^{1/2}---where T denotes the number of total steps---compared to mini-batch SGD. This also holds for asynchronous implementations.\n\nLocal SGD can also be used for large scale training of deep learning models. 
The results shown here aim to serve as a guideline for further exploring the theoretical and practical aspects of local SGD in these applications.", "keywords": "optimization;communication;theory;stochastic gradient descent;SGD;mini-batch;local SGD;parallel restart SGD;distributed training", "primary_area": "", "supplementary_material": "", "author": "Sebastian U. Stich", "authorids": "sebastian.stich@epfl.ch", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nstich2018local,\ntitle={Local {SGD} Converges Fast and Communicates Little},\nauthor={Sebastian U. Stich},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1g2JnRcFX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=S1g2JnRcFX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;8;8", "confidence": "5;4;5", "wc_review": "416;154;463", "wc_reply_reviewers": "449;61;0", "wc_reply_authors": "707;114;370", "reply_reviewers": "1;1;0", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 344.3333333333333, 135.94688505278654 ], "wc_reply_reviewers_avg": [ 170.0, 198.84835092770234 ], "wc_reply_authors_avg": [ 397.0, 242.84288473551507 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 1278, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17405656068558747431&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=S1g2JnRcFX", "pdf": "https://openreview.net/pdf?id=S1g2JnRcFX", "email": "", "author_num": 1 }, { "id": "S1g2V3Cct7", "title": "Experience replay for continual learning", "track": "main", "status": "Reject", "tldr": "We show that, in continual learning settings, catastrophic forgetting can be avoided by applying off-policy RL to a mixture of new and replay experience, with a behavioral cloning loss.", "abstract": "Continual learning is the problem of learning new tasks or knowledge while protecting old knowledge and ideally generalizing from old experience to learn new tasks faster. Neural networks trained by stochastic gradient descent often degrade on old tasks when trained successively on new tasks with different data distributions. This phenomenon, referred to as catastrophic forgetting, is considered a major hurdle to learning with non-stationary data or sequences of new tasks, and prevents networks from continually accumulating knowledge and skills. We examine this issue in the context of reinforcement learning, in a setting where an agent is exposed to tasks in a sequence. Unlike most other work, we do not provide an explicit indication to the model of task boundaries, which is the most general circumstance for a learning agent exposed to continuous experience. While various methods to counteract catastrophic forgetting have recently been proposed, we explore a straightforward, general, and seemingly overlooked solution - that of using experience replay buffers for all past events - with a mixture of on- and off-policy learning, leveraging behavioral cloning. 
We show that this strategy can still learn new tasks quickly yet can substantially reduce catastrophic forgetting in both Atari and DMLab domains, even matching the performance of methods that require task identities. When buffer storage is constrained, we confirm that a simple mechanism for randomly discarding data allows a limited size buffer to perform almost as well as an unbounded one.", "keywords": "continual learning;catastrophic forgetting;lifelong learning;behavioral cloning;reinforcement learning;interference;stability-plasticity", "primary_area": "", "supplementary_material": "", "author": "David Rolnick;Arun Ahuja;Jonathan Schwarz;Timothy P. Lillicrap;Greg Wayne", "authorids": "drolnick@mit.edu;arahuja@google.com;schwarzjn@google.com;countzero@google.com;gregwayne@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nrolnick2019experience,\ntitle={Experience replay for continual learning},\nauthor={David Rolnick and Arun Ahuja and Jonathan Schwarz and Timothy P. Lillicrap and Greg Wayne},\nyear={2019},\nurl={https://openreview.net/forum?id=S1g2V3Cct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S1g2V3Cct7", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;4;5", "wc_review": "195;317;367", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "316;616;435", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 293.0, 72.24033960791344 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 455.6666666666667, 123.34324284514152 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1369, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6568395747360267252&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "S1g6zn09tm", "title": "Latent Transformations for Object View Points Synthesis", "track": "main", "status": "Withdraw", "tldr": "We introduce an effective, general framework for incorporating conditioning information into inference-based generative models.", "abstract": "We propose a fully-convolutional conditional generative model, the latent transformation neural network (LTNN), capable of view synthesis using a light-weight neural network suited for real-time applications. In contrast to existing conditional\ngenerative models which incorporate conditioning information via concatenation, we introduce a dedicated network component, the conditional transformation unit (CTU), designed to learn the latent space transformations corresponding to specified target views. In addition, a consistency loss term is defined to guide the network toward learning the desired latent space mappings, a task-divided decoder is constructed to refine the quality of generated views, and an adaptive discriminator is introduced to improve the adversarial training process. The generality of the proposed methodology is demonstrated on a collection of three diverse tasks: multi-view reconstruction on real hand depth images, view synthesis of real and synthetic faces, and the rotation of rigid objects. 
The proposed model is shown to exceed state-of-the-art results in each category while simultaneously achieving a reduction in the computational demand required for inference by 30% on average.", "keywords": "conditional generative model;deep learning;fully-convolutional network;image attribute modification;multi-view reconstruction;view sythesis", "primary_area": "", "supplementary_material": "", "author": "Sangpil Kim;Nick Winovich;Hyung-gun Chi;Guang Lin;Karthik Ramani", "authorids": "kim2030@purdue.edu;nwinovic@purdue.edu;chi45@purdue.edu;guanglin@purdue.edu;ramani@purdue.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S1g6zn09tm", "pdf_size": 0, "rating": "2;4;5", "confidence": "4;4;2", "wc_review": "289;477;180", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 315.3333333333333, 122.6711955685695 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RzUHwl2elyAJ:scholar.google.com/&scioq=Latent+Transformations+for+Object+View+Points+Synthesis&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "id": "S1g9N2A5FX", "title": "Interpretable Continual Learning", "track": "main", "status": "Reject", "tldr": "The paper develops an interpretable continual learning framework where explanations of the finished tasks are used to enhance the attention of the learner during the future tasks, and where an explanation metric is proposed too. ", "abstract": "We present a framework for interpretable continual learning (ICL). We show that explanations of previously performed tasks can be used to improve performance on future tasks. ICL generates a good explanation of a finished task, then uses this to focus attention on what is important when facing a new task. The ICL idea is general and may be applied to many continual learning approaches. Here we focus on the variational continual learning framework to take advantage of its flexibility and efficacy in overcoming catastrophic forgetting. We use saliency maps to provide explanations of performed tasks and propose a new metric to assess their quality. Experiments show that ICL achieves state-of-the-art results in terms of overall continual learning performance as measured by average classification accuracy, and also in terms of its explanations, which are assessed qualitatively and quantitatively using the proposed metric.", "keywords": "Interpretability;Continual Learning", "primary_area": "", "supplementary_material": "", "author": "Tameem Adel;Cuong V. Nguyen;Richard E. 
Turner;Zoubin Ghahramani;Adrian Weller", "authorids": "tah47@cam.ac.uk;nvcuong92@gmail.com;ret26@cam.ac.uk;zoubin@eng.cam.ac.uk;aw665@cam.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nadel2019interpretable,\ntitle={Interpretable Continual Learning},\nauthor={Tameem Adel and Cuong V. Nguyen and Richard E. Turner and Zoubin Ghahramani and Adrian Weller},\nyear={2019},\nurl={https://openreview.net/forum?id=S1g9N2A5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1g9N2A5FX", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;3", "wc_review": "555;729;323", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;550;342", "reply_reviewers": "0;0;0", "reply_authors": "0;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 535.6666666666666, 166.3116218294908 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 297.3333333333333, 226.7470445721889 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4990327797967791382&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "S1gARiAcFm", "title": "Modeling Dynamics of Biological Systems with Deep Generative Neural Networks", "track": "main", "status": "Reject", "tldr": "Dynamics Modeling Networks (DyMoN) offer advantages in representation, generation, visualization and feature extraction over shallow learning techniques for modeling stochastic dynamical systems in biology.", "abstract": "Biological data often contains measurements of dynamic entities such as cells or organisms in various states of progression. However, biological systems are notoriously difficult to describe analytically due to their many interacting components, and in many cases, the technical challenge of taking longitudinal measurements. This leads to difficulties in studying the features of the dynamics, for examples the drivers of the transition. To address this problem, we present a deep neural network framework we call Dynamics Modeling Network or DyMoN. DyMoN is a neural network framework trained as a deep generative Markov model whose next state is a probability distribution based on the current state. DyMoN is well-suited to the idiosyncrasies of biological data, including noise, sparsity, and the lack of longitudinal measurements in many types of systems. Thus, DyMoN can be trained using probability distributions derived from the data in any way, such as trajectories derived via dimensionality reduction methods, and does not require longitudinal measurements. We show the advantage of learning deep models over shallow models such as Kalman filters and hidden Markov models that do not learn representations of the data, both in terms of learning embeddings of the data and also in terms training efficiency, accuracy and ability to multitask. We perform three case studies of applying DyMoN to different types of biological systems and extracting features of the dynamics in each case by examining the learned model. 
", "keywords": "neural networks;markovian dynamics;single-cell biology;calcium imaging;stochastic dynamics;generative models", "primary_area": "", "supplementary_material": "", "author": "Scott Gigante;David van Dijk;Kevin R. Moon;Alexander Strzalkowski;Katie Ferguson;Guy Wolf;Smita Krishnaswamy", "authorids": "scott.gigante@yale.edu;david.vandijk@yale.edu;kevin.moon@usu.edu;alexander.strzalkowski@yale.edu;katie.ferguson@yale.edu;guy.wolf@yale.edu;smita.krishnaswamy@yale.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\ngigante2019modeling,\ntitle={Modeling Dynamics of Biological Systems with Deep Generative Neural Networks},\nauthor={Scott Gigante and David van Dijk and Kevin R. Moon and Alexander Strzalkowski and Katie Ferguson and Guy Wolf and Smita Krishnaswamy},\nyear={2019},\nurl={https://openreview.net/forum?id=S1gARiAcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S1gARiAcFm", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;5;2", "wc_review": "227;184;372", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "wc_review_avg": [ 261.0, 80.4280216508318 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.9449111825230683, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10406413558257145556&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "S1gBgnR9Y7", "title": "End-to-end learning of pharmacological assays from high-resolution microscopy images", "track": "main", "status": "Reject", "tldr": "", "abstract": "Predicting the outcome of pharmacological assays based on high-resolution microscopy\nimages of treated cells is a crucial task in drug discovery which tremendously\nincreases discovery rates. However, end-to-end learning on these images\nwith convolutional neural networks (CNNs) has not been ventured for this task\nbecause it has been considered infeasible and overly complex. On the largest\navailable public dataset, we compare several state-of-the-art CNNs trained in an\nend-to-end fashion with models based on a cell-centric approach involving segmentation.\nWe found that CNNs operating on full images containing hundreds\nof cells perform significantly better at assay prediction than networks operating\non a single-cell level. Surprisingly, we could predict 29% of the 209 pharmacological\nassays at high predictive performance (AUC > 0.9). We compared a\nnovel CNN architecture called \u201cGapNet\u201d against four competing CNN architectures\nand found that it performs on par with the best methods and at the same time\nhas the lowest training time. Our results demonstrate that end-to-end learning on\nhigh-resolution imaging data is not only possible but even outperforms cell-centric\nand segmentation-dependent approaches. 
Hence, the costly cell segmentation and\nfeature extraction steps are not necessary, in fact they even hamper predictive performance.\nOur work further suggests that many pharmacological assays could\nbe replaced by high-resolution microscopy imaging together with convolutional\nneural networks.", "keywords": "Convolutional Neural Networks;High-resolution images;Multiple-Instance Learning;Drug Discovery;Molecular Biology", "primary_area": "", "supplementary_material": "", "author": "Markus Hofmarcher;Elisabeth Rumetshofer;Sepp Hochreiter;G\u00fcnter Klambauer", "authorids": "hofmarcher@ml.jku.at;rumetshofer@ml.jku.at;hochreit@ml.jku.at;klambauer@ml.jku.at", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhofmarcher2019endtoend,\ntitle={End-to-end learning of pharmacological assays from high-resolution microscopy images},\nauthor={Markus Hofmarcher and Elisabeth Rumetshofer and Sepp Hochreiter and G\u00fcnter Klambauer},\nyear={2019},\nurl={https://openreview.net/forum?id=S1gBgnR9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1gBgnR9Y7", "pdf_size": 0, "rating": "3;5;6", "confidence": "3;4;5", "wc_review": "721;874;175", "wc_reply_reviewers": "51;0;0", "wc_reply_authors": "637;295;326", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 590.0, 300.0233324259965 ], "wc_reply_reviewers_avg": [ 17.0, 24.041630560342615 ], "wc_reply_authors_avg": [ 419.3333333333333, 154.433013166083 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9819805060619659, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:rpEKjNB3qp8J:scholar.google.com/&scioq=End-to-end+learning+of+pharmacological+assays+from+high-resolution+microscopy+images&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "S1gBz2C9tX", "title": "Importance Resampling for Off-policy Policy Evaluation", "track": "main", "status": "Reject", "tldr": "A resampling approach for off-policy policy evaluation in reinforcement learning.", "abstract": "Importance sampling is a common approach to off-policy learning in reinforcement learning. While it is consistent and unbiased, it can result in high variance updates to the parameters for the value function. Weighted importance sampling (WIS) has been explored to reduce variance for off-policy policy evaluation, but only for linear value function approximation. In this work, we explore a resampling strategy to reduce variance, rather than a reweighting strategy. We propose Importance Resampling (IR) for off-policy learning, that resamples experience from the replay buffer and applies a standard on-policy update. The approach avoids using importance sampling ratios directly in the update, instead correcting the distribution over transitions before the update. We characterize the bias and consistency of the our estimator, particularly compared to WIS. We then demonstrate in several toy domains that IR has improved sample efficiency and parameter sensitivity, as compared to several baseline WIS estimators and to IS. 
We conclude with a demonstration showing IR improves over IS for learning a value function from images in a racing car simulator.", "keywords": "Reinforcement Learning;Off-policy policy evaluation;importance resampling;importance sampling", "primary_area": "", "supplementary_material": "", "author": "Matthew Schlegel;Wesley Chung;Daniel Graves;Martha White", "authorids": "mkschleg@ualberta.ca;wchung@ualberta.ca;daniel.graves@huawei.com;whitem@ualberta.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nschlegel2019importance,\ntitle={Importance Resampling for Off-policy Policy Evaluation},\nauthor={Matthew Schlegel and Wesley Chung and Daniel Graves and Martha White},\nyear={2019},\nurl={https://openreview.net/forum?id=S1gBz2C9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer5", "site": "https://openreview.net/forum?id=S1gBz2C9tX", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;4", "wc_review": "296;867;344", "wc_reply_reviewers": "0;271;0", "wc_reply_authors": "861;669;584", "reply_reviewers": "0;2;0", "reply_authors": "2;3;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 502.3333333333333, 258.6017959892949 ], "wc_reply_reviewers_avg": [ 90.33333333333333, 127.7506251343696 ], "wc_reply_authors_avg": [ 704.6666666666666, 115.86294585510169 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qhEGOvFAv2oJ:scholar.google.com/&scioq=Importance+Resampling+for+Off-policy+Policy+Evaluation&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S1gDCiCqtQ", "title": "Learning Representations in Model-Free Hierarchical Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We offer an original approach to model-free deep hierarchical reinforcement learning, including unsupervised subgoal discovery and unified temporal abstraction and intrinsic motivation learning. ", "abstract": "Common approaches to Reinforcement Learning (RL) are seriously challenged by large-scale applications involving huge state spaces and sparse delayed reward feedback. Hierarchical Reinforcement Learning (HRL) methods attempt to address this scalability issue by learning action selection policies at multiple levels of temporal abstraction. Abstraction can be had by identifying a relatively small set of states that are likely to be useful as subgoals, in concert with the learning of corresponding skill policies to achieve those subgoals. Many approaches to subgoal discovery in HRL depend on the analysis of a model of the environment, but the need to learn such a model introduces its own problems of scale. Once subgoals are identified, skills may be learned through intrinsic motivation, introducing an internal reward signal marking subgoal attainment. In this paper, we present a novel model-free method for subgoal discovery using incremental unsupervised learning over a small memory of the most recent experiences of the agent. 
When combined with an intrinsic motivation learning mechanism, this method learns subgoals and skills together, based on experiences in the environment. Thus, we offer an original approach to HRL that does not require the acquisition of a model of the environment, suitable for large-scale applications. We demonstrate the efficiency of our method on two RL problems with sparse delayed feedback: a variant of the rooms environment and the ATARI 2600 game called Montezuma's Revenge.\n", "keywords": "Reinforcement Learning;Model-Free Hierarchical Reinforcement Learning;Subgoal Discovery;Unsupervised Learning;Temporal Difference;Temporal Abstraction;Intrinsic Motivation;Markov Decision Processes;Deep Reinforcement Learning;Optimization", "primary_area": "", "supplementary_material": "", "author": "Jacob Rafati;David Noelle", "authorids": "jrafatiheravi@ucmerced.edu;dnoelle@ucmerced.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nrafati2019learning,\ntitle={Learning Representations in Model-Free Hierarchical Reinforcement Learning},\nauthor={Jacob Rafati and David Noelle},\nyear={2019},\nurl={https://openreview.net/forum?id=S1gDCiCqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1gDCiCqtQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "wc_review": "594;582;231", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 469.0, 168.36270370839262 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5935041430616141795&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13 }, { "title": "Learning Finite State Representations of Recurrent Policy Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1062", "id": "S1gOpsCctm", "author_site": "Anurag Koul, Alan Fern, Samuel Greydanus", "tldr": "Extracting a finite state machine from a recurrent neural network via quantization for the purpose of interpretability with experiments on Atari.", "abstract": "Recurrent neural networks (RNNs) are an effective representation of control policies for a wide range of reinforcement and imitation learning problems. RNN policies, however, are particularly difficult to explain, understand, and analyze due to their use of continuous-valued memory vectors and observation features. In this paper, we introduce a new technique, Quantized Bottleneck Insertion, to learn finite representations of these vectors and features. The result is a quantized representation of the RNN that can be analyzed to improve our understanding of memory use and general behavior. We present results of this approach on synthetic environments and six Atari games. The resulting finite representations are surprisingly small in some cases, using as few as 3 discrete memory states and 10 observations for a perfect Pong policy. We also show that these finite policy representations lead to improved interpretability. 
", "keywords": "recurrent neural networks;finite state machine;quantization;interpretability;autoencoder;moore machine;reinforcement learning;imitation learning;representation;Atari;Tomita", "primary_area": "", "supplementary_material": "", "author": "Anurag Koul;Alan Fern;Sam Greydanus", "authorids": "koula@oregonstate.edu;alan.fern@oregonstate.edu;sgrey@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkoul2018learning,\ntitle={Learning Finite State Representations of Recurrent Policy Networks},\nauthor={Anurag Koul and Alan Fern and Sam Greydanus},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1gOpsCctm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;5;3", "wc_review": "315;491;567", "wc_reply_reviewers": "0;284;192", "wc_reply_authors": "728;976;1049", "reply_reviewers": "0;2;1", "reply_authors": "1;2;2", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 457.6666666666667, 105.54409294487094 ], "wc_reply_reviewers_avg": [ 158.66666666666666, 118.31408294122141 ], "wc_reply_authors_avg": [ 917.6666666666666, 137.38591226500952 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9225081332116410200&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=S1gOpsCctm", "pdf": "https://openreview.net/pdf?id=S1gOpsCctm", "email": ";;", "author_num": 3 }, { "id": "S1gQ5sRcFm", "title": "Consistent Jumpy Predictions for Videos and Scenes", "track": "main", "status": "Reject", "tldr": "We present a model for consistent 3D reconstruction and jumpy video prediction e.g. producing image frames multiple time-steps in the future without generating intermediate frames.", "abstract": "Stochastic video prediction models take in a sequence of image frames, and generate a sequence of consecutive future image frames. These models typically generate future frames in an autoregressive fashion, which is slow and requires the input and output frames to be consecutive. We introduce a model that overcomes these drawbacks by generating a latent representation from an arbitrary set of frames that can then be used to simultaneously and efficiently sample temporally consistent frames at arbitrary time-points. For example, our model can \"jump\" and directly sample frames at the end of the video, without sampling intermediate frames. Synthetic video evaluations confirm substantial gains in speed and functionality without loss in fidelity. We also apply our framework to a 3D scene reconstruction dataset. Here, our model is conditioned on camera location and can sample consistent sets of images for what an occluded region of a 3D scene might look like, even if there are multiple possibilities for what that region might contain. 
Reconstructions and videos are available at https://bit.ly/2O4Pc4R.\n", "keywords": "jumpy predictions;generative models;scene reconstruction;video prediction;variational auto-encoders;DRAW", "primary_area": "", "supplementary_material": "", "author": "Ananya Kumar;S. M. Ali Eslami;Danilo Rezende;Marta Garnelo;Fabio Viola;Edward Lockhart;Murray Shanahan", "authorids": "skywalker94@gmail.com;aeslami@google.com;danilor@google.com;garnelo@google.com;fviola@google.com;locked@google.com;mshanahan@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nkumar2019consistent,\ntitle={Consistent Jumpy Predictions for Videos and Scenes},\nauthor={Ananya Kumar and S. M. Ali Eslami and Danilo Rezende and Marta Garnelo and Fabio Viola and Edward Lockhart and Murray Shanahan},\nyear={2019},\nurl={https://openreview.net/forum?id=S1gQ5sRcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=S1gQ5sRcFm", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;2", "wc_review": "304;544;332", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "597;1309;598", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 393.3333333333333, 107.14890988194368 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 834.6666666666666, 335.4045649990802 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10915540989840821833&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "S1gUVjCqKm", "title": "Unsupervised classification into unknown number of classes", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a novel unsupervised classification method based on the graph Laplacian. Unlike widely used classification methods, this architecture requires neither data labels nor the number of classes. Our key idea is to introduce an approximate linear map and spectral clustering theory on the dimension-reduced spaces into generative adversarial networks. Inspired by the human visual recognition system, the proposed framework can classify and also generate images as the human brain does. We build an approximate linear connector network $C$, analogous to the cerebral cortex, between the discriminator $D$ and the generator $G$. The connector network allows us to estimate the unknown number of classes. Estimating the number of classes is one of the challenging problems in unsupervised learning, especially in spectral clustering. The proposed method can also classify images using the estimated number of classes. 
Therefore, we define our method as an unsupervised classification method.", "keywords": "unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Sungyeob Han;Daeyoung Kim;Jungwoo Lee", "authorids": "syhan@cml.snu.ac.kr;kimdy7@snu.ac.kr;junglee@snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhan2019unsupervised,\ntitle={Unsupervised classification into unknown number of classes},\nauthor={Sungyeob Han and Daeyoung Kim and Jungwoo Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=S1gUVjCqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=S1gUVjCqKm", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;2;4", "wc_review": "802;169;54", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "838;1086;182", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 341.6666666666667, 328.8731602845625 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 702.0, 381.3799505305263 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:akh7Wz7q0Q0J:scholar.google.com/&scioq=Unsupervised+classification+into+unknown+number+of+classes&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Multilingual Neural Machine Translation with Knowledge Distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/797", "id": "S1gUsoR9YX", "author_site": "Xu Tan, Yi Ren, Di He, Tao Qin, Zhou Zhao, Tie-Yan Liu", "tldr": "We proposed a knowledge distillation based method to boost the accuracy of multilingual neural machine translation.", "abstract": "Multilingual machine translation, which translates multiple languages with a single model, has attracted much attention due to its efficiency of offline training and online serving. However, traditional multilingual translation usually yields inferior accuracy compared with the counterpart using individual models for each language pair, due to language diversity and model capacity limitations. In this paper, we propose a distillation-based approach to boost the accuracy of multilingual machine translation. Specifically, individual models are first trained and regarded as teachers, and then the multilingual model is trained to fit the training data and match the outputs of individual models simultaneously through knowledge distillation. Experiments on IWSLT, WMT and Ted talk translation datasets demonstrate the effectiveness of our method. 
Particularly, we show that one model is enough to handle multiple languages (up to 44 languages in our experiment), with comparable or even better accuracy than individual models.", "keywords": "NMT;Multilingual NMT;Knowledge Distillation", "primary_area": "", "supplementary_material": "", "author": "Xu Tan;Yi Ren;Di He;Tao Qin;Zhou Zhao;Tie-Yan Liu", "authorids": "xuta@microsoft.com;rayeren613@gmail.com;dihe@microsoft.com;taoqin@microsoft.com;zhaozhou@zju.edu.cn;tyliu@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ntan2018multilingual,\ntitle={Multilingual Neural Machine Translation with Knowledge Distillation},\nauthor={Xu Tan and Yi Ren and Di He and Tao Qin and Tie-Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1gUsoR9YX},\n}", "github": "[![github](/images/github_icon.svg) RayeRen/multilingual-kd-pytorch](https://github.com/RayeRen/multilingual-kd-pytorch)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;3", "wc_review": "388;629;229", "wc_reply_reviewers": "35;250;44", "wc_reply_authors": "817;1852;864", "reply_reviewers": "1;3;1", "reply_authors": "4;4;3", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 415.3333333333333, 164.43911402772218 ], "wc_reply_reviewers_avg": [ 109.66666666666667, 99.29865166366672 ], "wc_reply_authors_avg": [ 1177.6666666666667, 477.2115766501153 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 3.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 280, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5753623392275205285&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S1gUsoR9YX", "pdf": "https://openreview.net/pdf?id=S1gUsoR9YX", "email": ";;;;;", "author_num": 6 }, { "id": "S1gWz2CcKX", "title": "Neural MMO: A massively multiplayer game environment for intelligent agents", "track": "main", "status": "Reject", "tldr": "An MMO-inspired research game platform for studying emergent behaviors of large populations in a complex environment", "abstract": "We present an artificial intelligence research platform inspired by the human game genre of MMORPGs (Massively Multiplayer Online Role-Playing Games, a.k.a. MMOs). We demonstrate how this platform can be used to study behavior and learning in large populations of neural agents. Unlike currently popular game environments, our platform supports persistent environments, with variable number of agents, and open-ended task descriptions. The emergence of complex life on Earth is often attributed to the arms race that ensued from a huge number of organisms all competing for finite resources. Our platform aims to simulate this setting in microcosm: we conduct a series of experiments to test how large-scale multiagent competition can incentivize the development of skillful behavior. 
We find that population size magnifies the complexity of the behaviors that emerge and results in agents that out-compete agents trained in smaller populations.", "keywords": "MMO;Multiagent;Game;Reinforcement Learning;Platform;Framework;Niche Formation;Exploration", "primary_area": "", "supplementary_material": "", "author": "Joseph Suarez;Yilun Du;Phillip Isola;Igor Mordatch", "authorids": "joseph15@stanford.edu;yilundu@gmail.com;phillipi@mit.edu;mordatch@openai.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsuarez2019neural,\ntitle={Neural {MMO}: A massively multiplayer game environment for intelligent agents},\nauthor={Joseph Suarez and Yilun Du and Phillip Isola and Igor Mordatch},\nyear={2019},\nurl={https://openreview.net/forum?id=S1gWz2CcKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=S1gWz2CcKX", "pdf_size": 0, "rating": "5;6;7", "confidence": "2;4;5", "wc_review": "893;510;1190", "wc_reply_reviewers": "0;378;0", "wc_reply_authors": "321;151;328", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 864.3333333333334, 278.3479038102417 ], "wc_reply_reviewers_avg": [ 126.0, 178.19090885900997 ], "wc_reply_authors_avg": [ 266.6666666666667, 81.83859453555856 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9819805060619659, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14527600469711127715&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "S1g_EsActm", "title": "ATTENTION INCORPORATE NETWORK: A NETWORK CAN ADAPT VARIOUS DATA SIZE", "track": "main", "status": "Reject", "tldr": "", "abstract": "In traditional neural networks for image processing, the inputs of the network must all have the same size, such as 224\u00d7224\u00d73. But how can we train a neural network on inputs of different sizes? A common workaround is image deformation (e.g. cropping or warping), which incurs information loss. In this paper we propose a new network structure called Attention Incorporate Network (AIN). It handles input images of different sizes and extracts the key features of the inputs via an attention mechanism, allocating attention according to the importance of each feature rather than the data size. 
Experimentally, AIN achieves higher accuracy and better convergence compared to other network structures of the same size.", "keywords": "attention mechanism;various image size", "primary_area": "", "supplementary_material": "", "author": "Liangbo He;Hao Sun", "authorids": "heliangbo@tsinghua.edu.cn;sh759811581@tsinghua.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhe2019attention,\ntitle={{ATTENTION} {INCORPORATE} {NETWORK}: A {NETWORK} {CAN} {ADAPT} {VARIOUS} {DATA} {SIZE}},\nauthor={Liangbo He and Hao Sun},\nyear={2019},\nurl={https://openreview.net/forum?id=S1g_EsActm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1g_EsActm", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;5;4", "wc_review": "195;536;114", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 281.6666666666667, 182.85574156209103 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:t1kCUqiPKb4J:scholar.google.com/&scioq=ATTENTION+INCORPORATE+NETWORK:+A+NETWORK+CAN+ADAPT+VARIOUS+DATA+SIZE&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "id": "S1gd7nCcF7", "title": "Self-Supervised Generalisation with Meta Auxiliary Learning", "track": "main", "status": "Reject", "tldr": "We propose Meta AuXiliary Learning (MAXL), a learning framework which can automatically generate auxiliary tasks to improve generalisation of the principal task in a self-supervised manner. ", "abstract": "Auxiliary learning has been shown to improve the generalisation performance of a principal task. But typically, this requires manually-defined auxiliary tasks based on domain knowledge. In this paper, we consider that it may be possible to automatically learn these auxiliary tasks to best suit the principal task, towards optimum auxiliary tasks without any human knowledge. We propose a novel method, Meta Auxiliary Learning (MAXL), which we design for the task of image classification, where the auxiliary task is hierarchical sub-class image classification. The role of the meta learner is to determine sub-class target labels to train a multi-task evaluator, such that these labels improve the generalisation performance on the principal task. Experiments on three different CIFAR datasets show that MAXL outperforms baseline auxiliary learning methods, and is competitive even with a method which uses human-defined sub-class hierarchies. 
MAXL is self-supervised and general, and therefore offers a promising new direction towards automated generalisation.", "keywords": "meta learning;auxiliary learning;multi-task learning;self-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Shikun Liu;Edward Johns;Andrew Davison", "authorids": "shikun.liu17@imperial.ac.uk;e.johns@imperial.ac.uk;a.davison@imperial.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nliu2019selfsupervised,\ntitle={Self-Supervised Generalisation with Meta Auxiliary Learning},\nauthor={Shikun Liu and Edward Johns and Andrew Davison},\nyear={2019},\nurl={https://openreview.net/forum?id=S1gd7nCcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1gd7nCcF7", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;3;4", "wc_review": "737;265;226", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "573;381;177", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 409.3333333333333, 232.24173230111384 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 377.0, 161.6910634512619 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 213, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18242502085163121025&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "S1geJhC9Km", "title": "Feature quantization for parsimonious and interpretable predictive models", "track": "main", "status": "Reject", "tldr": "We tackle discretization of continuous features and grouping of factor levels as a representation learning problem and provide a rigorous way of estimating the best quantization to yield good performance and interpretability.", "abstract": "For regulatory and interpretability reasons, the logistic regression is still widely used by financial institutions to learn the refunding probability of a loan from applicant's historical data. To improve prediction accuracy and interpretability, a preprocessing step quantizing both continuous and categorical data is usually performed: continuous features are discretized by assigning factor levels to intervals and, if numerous, levels of categorical features are grouped. However, a better predictive accuracy can be reached by embedding this quantization estimation step directly into the predictive estimation step itself. By doing so, the predictive loss has to be optimized on a huge and untractable discontinuous quantization set. To overcome this difficulty, we introduce a specific two-step optimization strategy: first, the optimization problem is relaxed by approximating discontinuous quantization functions by smooth functions; second, the resulting relaxed optimization problem is solved via a particular neural network and stochastic gradient descent. The strategy gives then access to good candidates for the original optimization problem after a straightforward maximum a posteriori procedure to obtain cutpoints. 
The good performance of this approach, which we call glmdisc, is illustrated on simulated and real data from the UCI library and Cr\u00e9dit Agricole Consumer Finance (a major European historic player in the consumer credit market). The results show that practitioners finally have an automatic all-in-one tool that answers their recurring need for quantization for predictive tasks.", "keywords": "discretization;grouping;interpretability;shallow neural networks", "primary_area": "", "supplementary_material": "", "author": "Adrien EHRHARDT;Vincent VANDEWALLE;Christophe BIERNACKI;Philippe HEINRICH", "authorids": "adrien.ehrhardt@inria.fr;vincent.vandewalle@inria.fr;christophe.biernacki@inria.fr;philippe.heinrich@univ-lille.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nehrhardt2019feature,\ntitle={Feature quantization for parsimonious and interpretable predictive models},\nauthor={Adrien EHRHARDT and Vincent VANDEWALLE and Christophe BIERNACKI and Philippe HEINRICH},\nyear={2019},\nurl={https://openreview.net/forum?id=S1geJhC9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1geJhC9Km", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;2", "wc_review": "245;139;132", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "657;339;184", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 172.0, 51.697840058039816 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 393.3333333333333, 196.8863180168247 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4382739908498556790&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "S1giVsRcYm", "title": "Count-Based Exploration with the Successor Representation", "track": "main", "status": "Reject", "tldr": "We propose the idea of using the norm of the successor representation as an exploration bonus in reinforcement learning. In hard exploration Atari games, our deep RL algorithm matches the performance of recent pseudo-count-based methods.", "abstract": "The problem of exploration in reinforcement learning is well-understood in the tabular case and many sample-efficient algorithms are known. Nevertheless, it is often unclear how the algorithms in the tabular setting can be extended to tasks with large state-spaces where generalization is required. Recent promising developments generally depend on problem-specific density models or handcrafted features. In this paper we introduce a simple approach for exploration that allows us to develop theoretically justified algorithms in the tabular case but that also gives us intuitions for new algorithms applicable to settings where function approximation is required. Our approach and its underlying theory are based on the substochastic successor representation, a concept we develop here. 
While the traditional successor representation is a representation that defines state generalization by the similarity of successor states, the substochastic successor representation is also able to implicitly count the number of times each state (or feature) has been observed. This extension connects two until now disjoint areas of research. We show in traditional tabular domains (RiverSwim and SixArms) that our algorithm empirically performs as well as other sample-efficient algorithms. We then describe a deep reinforcement learning algorithm inspired by these ideas and show that it matches the performance of recent pseudo-count-based methods in hard exploration Atari 2600 games.", "keywords": "reinforcement learning;successor representation;exploration;atari", "primary_area": "", "supplementary_material": "", "author": "Marlos C. Machado;Marc G. Bellemare;Michael Bowling", "authorids": "machado@ualberta.ca;bellemare@google.com;mbowling@ualberta.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmachado2019countbased,\ntitle={Count-Based Exploration with the Successor Representation},\nauthor={Marlos C. Machado and Marc G. Bellemare and Michael Bowling},\nyear={2019},\nurl={https://openreview.net/forum?id=S1giVsRcYm},\n}", "github": "[![github](/images/github_icon.svg) mcmachado/count_based_exploration_sr](https://github.com/mcmachado/count_based_exploration_sr) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=S1giVsRcYm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S1giVsRcYm", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;2;4", "wc_review": "367;417;313", "wc_reply_reviewers": "0;0;178", "wc_reply_authors": "510;688;1308", "reply_reviewers": "0;0;1", "reply_authors": "1;1;3", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 365.6666666666667, 42.4682888230213 ], "wc_reply_reviewers_avg": [ 59.333333333333336, 83.91000470080364 ], "wc_reply_authors_avg": [ 835.3333333333334, 342.0344362130158 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 245, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8794785472185004564&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "S1giro05t7", "title": "Reducing Overconfident Errors outside the Known Distribution", "track": "main", "status": "Reject", "tldr": "Deep networks are more likely to be confidently wrong when testing on unexpected data. We propose an experimental methodology to study the problem, and two methods to reduce confident errors on unknown input distributions.", "abstract": "Intuitively, unfamiliarity should lead to lack of confidence. In reality, current algorithms often make highly confident yet wrong predictions when faced with unexpected test samples from an unknown distribution different from training. Unlike domain adaptation methods, we cannot gather an \"unexpected dataset\" prior to test, and unlike novelty detection methods, a best-effort original task prediction is still expected. 
We compare a number of methods from related fields such as calibration and epistemic uncertainty modeling, as well as two proposed methods that reduce overconfident errors of samples from an unknown novel distribution without drastically increasing evaluation time: (1) G-distillation, training an ensemble of classifiers and then distilling it into a single model using both labeled and unlabeled examples, or (2) NCR, reducing prediction confidence based on its novelty detection score. Experimentally, we investigate the overconfidence problem and evaluate our solution by creating \"familiar\" and \"novel\" test splits, where \"familiar\" are identically distributed with training and \"novel\" are not. We discover that calibrating using temperature scaling on familiar data is the best single-model method for improving novel confidence, followed by our proposed methods. In addition, some methods' NLL performance is roughly equivalent to that of a regularly trained model with a certain degree of smoothing. Calibrating can also reduce confident errors, for example, in gender recognition by 95% on demographic groups different from the training data.", "keywords": "Machine learning safety;confidence;overconfidence;unknown domain;novel distribution;generalization;distillation;ensemble;underrepresentation", "primary_area": "", "supplementary_material": "", "author": "Zhizhong Li;Derek Hoiem", "authorids": "zli115@illinois.edu;dhoiem@illinois.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nli2019reducing,\ntitle={Reducing Overconfident Errors outside the Known Distribution},\nauthor={Zhizhong Li and Derek Hoiem},\nyear={2019},\nurl={https://openreview.net/forum?id=S1giro05t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1giro05t7", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;4", "wc_review": "241;369;303", "wc_reply_reviewers": "0;89;0", "wc_reply_authors": "380;558;464", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 304.3333333333333, 52.264285660052366 ], "wc_reply_reviewers_avg": [ 29.666666666666668, 41.95500235040182 ], "wc_reply_authors_avg": [ 467.3333333333333, 72.70641114937679 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15742451222566014112&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "id": "S1grRoR9tQ", "title": "Bayesian Deep Learning via Stochastic Gradient MCMC with a Stochastic Approximation Adaptation", "track": "main", "status": "Reject", "tldr": "a robust Bayesian deep learning algorithm to infer complex posteriors with latent variables", "abstract": "We propose a robust Bayesian deep learning algorithm to infer complex posteriors with latent variables. Inspired by dropout, a popular tool for regularization and model ensemble, we assign sparse priors to the weights in deep neural networks (DNN) in order to achieve automatic \u201cdropout\u201d and avoid over-fitting. 
By alternatively sampling from posterior distribution through stochastic gradient Markov Chain Monte Carlo (SG-MCMC) and optimizing latent variables via stochastic approximation (SA), the trajectory of the target weights is proved to converge to the true posterior distribution conditioned on optimal latent variables. This ensures a stronger regularization on the over-fitted parameter space and more accurate uncertainty quantification on the decisive variables. Simulations from large-p-small-n regressions showcase the robustness of this method when applied to models with latent variables. Additionally, its application on the convolutional neural networks (CNN) leads to state-of-the-art performance on MNIST and Fashion MNIST datasets and improved resistance to adversarial attacks. ", "keywords": "generalized stochastic approximation;stochastic gradient Markov chain Monte Carlo;adaptive algorithm;EM algorithm;convolutional neural networks;Bayesian inference;sparse prior;spike and slab prior;local trap", "primary_area": "", "supplementary_material": "", "author": "Wei Deng;Xiao Zhang;Faming Liang;Guang Lin", "authorids": "deng106@purdue.edu;zhang923@purdue.edu;fmliang@purdue.edu;guanglin@purdue.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndeng2019bayesian,\ntitle={Bayesian Deep Learning via Stochastic Gradient {MCMC} with a Stochastic Approximation Adaptation},\nauthor={Wei Deng and Xiao Zhang and Faming Liang and Guang Lin},\nyear={2019},\nurl={https://openreview.net/forum?id=S1grRoR9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1grRoR9tQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "2;4;5", "wc_review": "278;235;215", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 242.66666666666666, 26.284765338288427 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9819805060619659, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6727721667562075169&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "S1lCbhAqKX", "title": "Structured Content Preservation for Unsupervised Text Style Transfer", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Text style transfer aims to modify the style of a sentence while keeping its content unchanged. Recent style transfer systems often fail to faithfully preserve the content after changing the style. This paper proposes a structured content preserving model that leverages linguistic information in the structured fine-grained supervisions to better preserve the style-independent content \\footnote{Henceforth, we refer to style-independent content as content, for simplicity.} during style transfer. In particular, we achieve the goal by devising rich model objectives based on both the sentence's lexical information and a language model that conditions on content. The resulting model therefore is encouraged to retain the semantic meaning of the target sentences. 
We perform extensive experiments that compare our model to other existing approaches in the tasks of sentiment and political slant transfer. Our model achieves significant improvement in terms of both content preservation and style transfer in automatic and human evaluation.", "keywords": "Unsupervised text style transfer", "primary_area": "", "supplementary_material": "", "author": "Youzhi Tian;Zhiting Hu;Zhou Yu", "authorids": "yztian@ucdavis.edu;zhitingh@cs.cmu.edu;joyu@ucdavis.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1lCbhAqKX", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;3", "wc_review": "518;685;512", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 571.6666666666666, 80.17619485995966 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11482628433107496292&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "MisGAN: Learning from Incomplete Data with Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1096", "id": "S1lDV3RcKm", "author_site": "Steven Cheng-Xian Li, Bo Jiang, Benjamin M Marlin", "tldr": "This paper presents a GAN-based framework for learning the distribution from high-dimensional incomplete data.", "abstract": "Generative adversarial networks (GANs) have been shown to provide an effective way to model complex distributions and have obtained impressive results on various challenging tasks. However, typical GANs require fully-observed data during training. In this paper, we present a GAN-based framework for learning from complex, high-dimensional incomplete data. The proposed framework learns a complete data generator along with a mask generator that models the missing data distribution. We further demonstrate how to impute missing data by equipping our framework with an adversarially trained imputer. 
We evaluate the proposed framework using a series of experiments with several types of missing data processes under the missing completely at random assumption.", "keywords": "generative models;missing data", "primary_area": "", "supplementary_material": "", "author": "Steven Cheng-Xian Li;Bo Jiang;Benjamin Marlin", "authorids": "cxl@cs.umass.edu;bjiang@sjtu.edu.cn;marlin@cs.umass.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nli2018learning,\ntitle={Learning from Incomplete Data with Generative Adversarial Networks},\nauthor={Steven Cheng-Xian Li and Bo Jiang and Benjamin Marlin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lDV3RcKm},\n}", "github": "[![github](/images/github_icon.svg) steveli/misgan](https://github.com/steveli/misgan)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;4", "wc_review": "232;205;280", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "455;413;721", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 239.0, 31.016124838541646 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 529.6666666666666, 136.37529916455628 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 295, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4415656656646533426&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=S1lDV3RcKm", "pdf": "https://openreview.net/pdf?id=S1lDV3RcKm", "email": ";;", "author_num": 3 }, { "title": "A Direct Approach to Robust Deep Learning Using Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/908", "id": "S1lIMn05F7", "author_site": "huaxia wang, Chun-Nam Yu", "tldr": "Jointly train an adversarial noise generating network with a classification network to provide better robustness to adversarial attacks.", "abstract": "Deep neural networks have been shown to perform well in many classical machine learning problems, especially in image classification tasks. However, researchers have found that neural networks can be easily fooled, and they are surprisingly sensitive to small perturbations imperceptible to humans. Carefully crafted input images (adversarial examples) can force a well-trained neural network to provide arbitrary outputs. Including adversarial examples during training is a popular defense mechanism against adversarial attacks. In this paper we propose a new defensive mechanism under the generative adversarial network~(GAN) framework. We model the adversarial noise using a generative network, trained jointly with a classification discriminative network as a minimax game. 
We show empirically that our adversarial network approach works well against black box attacks, with performance on par with state-of-art methods such as ensemble adversarial training and adversarial training with projected gradient descent.\n", "keywords": "deep learning;adversarial learning;generative adversarial networks", "primary_area": "", "supplementary_material": "", "author": "Huaxia Wang;Chun-Nam Yu", "authorids": "hwang38@stevens.edu;cnyu@cs.cornell.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nwang2018a,\ntitle={A Direct Approach to Robust Deep Learning Using Adversarial Networks},\nauthor={Huaxia Wang and Chun-Nam Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lIMn05F7},\n}", "github": "[![github](/images/github_icon.svg) whxbergkamp/RobustDL_GAN](https://github.com/whxbergkamp/RobustDL_GAN)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;3", "wc_review": "302;217;261", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "538;159;197", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 260.0, 34.708308323320324 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 298.0, 170.41322327409534 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 82, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2332293430655643076&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S1lIMn05F7", "pdf": "https://openreview.net/pdf?id=S1lIMn05F7", "email": ";", "author_num": 2 }, { "id": "S1lKSjRcY7", "title": "Improved Gradient Estimators for Stochastic Discrete Variables", "track": "main", "status": "Reject", "tldr": "We propose simple ways to reduce bias and complexity of stochastic gradient estimators used for learning distributions over discrete variables.", "abstract": "In many applications we seek to optimize an expectation with respect to a distribution over discrete variables. Estimating gradients of such objectives with respect to the distribution parameters is a challenging problem. We analyze existing solutions including finite-difference (FD) estimators and continuous relaxation (CR) estimators in terms of bias and variance. We show that the commonly used Gumbel-Softmax estimator is biased and propose a simple method to reduce it. We also derive a simpler piece-wise linear continuous relaxation that also possesses reduced bias. 
We demonstrate empirically that reduced bias leads to a better performance in variational inference and on binary optimization tasks.", "keywords": "continuous relaxation;discrete stochastic variables;reparameterization trick;variational inference;discrete optimization;stochastic gradient estimation", "primary_area": "", "supplementary_material": "", "author": "Evgeny Andriyash;Arash Vahdat;Bill Macready", "authorids": "eandriyash@dwavesys.com;avahdat@dwavesys.com;wgm@dwavesys.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nandriyash2019improved,\ntitle={Improved Gradient Estimators for Stochastic Discrete Variables},\nauthor={Evgeny Andriyash and Arash Vahdat and Bill Macready},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lKSjRcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1lKSjRcY7", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;4", "wc_review": "368;125;312", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "524;304;73", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 268.3333333333333, 103.89845469923452 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 300.3333333333333, 184.13823309918254 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZO2PVahN480J:scholar.google.com/&scioq=Improved+Gradient+Estimators+for+Stochastic+Discrete+Variables&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S1lPShAqFm", "title": "Empirically Characterizing Overparameterization Impact on Convergence", "track": "main", "status": "Reject", "tldr": "Empirically shows that larger models train in fewer training steps, because all factors in weight space traversal improve.", "abstract": "A long-held conventional wisdom states that larger models train more slowly when using gradient descent. This work challenges this widely-held belief, showing that larger models can potentially train faster despite the increasing computational requirements of each training step. In particular, we study the effect of network structure (depth and width) on halting time and show that larger models---wider models in particular---take fewer training steps to converge.\n\nWe design simple experiments to quantitatively characterize the effect of overparametrization on weight space traversal. 
Results show that halting time improves when growing model's width for three different applications, and the improvement comes from each factor: The distance from initialized weights to converged weights shrinks with a power-law-like relationship, the average step size grows with a power-law-like relationship, and gradient vectors become more aligned with each other during traversal.\n", "keywords": "gradient descent;optimization;convergence time;halting time;characterization", "primary_area": "", "supplementary_material": "", "author": "Newsha Ardalani;Joel Hestness;Gregory Diamos", "authorids": "newsha@baidu.com;joel@baidu.com;gregdiamos@baidu.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nardalani2019empirically,\ntitle={Empirically Characterizing Overparameterization Impact on Convergence},\nauthor={Newsha Ardalani and Joel Hestness and Gregory Diamos},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lPShAqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1lPShAqFm", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;3", "wc_review": "254;675;469", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 466.0, 171.8856208839665 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10585144281883707086&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Combinatorial Attacks on Binarized Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/860", "id": "S1lTEh09FQ", "author_site": "Elias Khalil, Amrita Gupta, Bistra Dilkina", "tldr": "Gradient-based attacks on binarized neural networks are not effective due to the non-differentiability of such networks; Our IPROP algorithm solves this problem using integer optimization", "abstract": "Binarized Neural Networks (BNNs) have recently attracted significant interest due to their computational efficiency. Concurrently, it has been shown that neural networks may be overly sensitive to ``attacks\" -- tiny adversarial changes in the input -- which may be detrimental to their use in safety-critical domains. Designing attack algorithms that effectively fool trained models is a key step towards learning robust neural networks.\nThe discrete, non-differentiable nature of BNNs, which distinguishes them from their full-precision counterparts, poses a challenge to gradient-based attacks. In this work, we study the problem of attacking a BNN through the lens of combinatorial and integer optimization. We propose a Mixed Integer Linear Programming (MILP) formulation of the problem. While exact and flexible, the MILP quickly becomes intractable as the network and perturbation space grow. To address this issue, we propose IProp, a decomposition-based algorithm that solves a sequence of much smaller MILP problems. 
Experimentally, we evaluate both proposed methods against the standard gradient-based attack (PGD) on MNIST and Fashion-MNIST, and show that IProp performs favorably compared to PGD, while scaling beyond the limits of the MILP.", "keywords": "binarized neural networks;combinatorial optimization;integer programming", "primary_area": "", "supplementary_material": "", "author": "Elias B Khalil;Amrita Gupta;Bistra Dilkina", "authorids": "lyes@gatech.edu;agupta375@gatech.edu;dilkina@usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkhalil2018combinatorial,\ntitle={Combinatorial Attacks on Binarized Neural Networks},\nauthor={Elias B Khalil and Amrita Gupta and Bistra Dilkina},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lTEh09FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "wc_review": "595;226;414", "wc_reply_reviewers": "123;0;0", "wc_reply_authors": "842;454;89", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 411.6666666666667, 150.65265421565667 ], "wc_reply_reviewers_avg": [ 41.0, 57.982756057296896 ], "wc_reply_authors_avg": [ 461.6666666666667, 307.45875965548436 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16660057879161292403&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1lTEh09FQ", "pdf": "https://openreview.net/pdf?id=S1lTEh09FQ", "email": ";;", "author_num": 3 }, { "id": "S1lTg3RcFm", "title": "Perception-Aware Point-Based Value Iteration for Partially Observable Markov Decision Processes", "track": "main", "status": "Reject", "tldr": "We develop a point-based value iteration solver for POMDPs with active perception and planning tasks.", "abstract": "Partially observable Markov decision processes (POMDPs) are a widely-used framework to model decision-making with uncertainty about the environment and under stochastic outcome. In conventional POMDP models, the observations that the agent receives originate from fixed known distribution. However, in a variety of real-world scenarios the agent has an active role in its perception by selecting which observations to receive. Due to combinatorial nature of such selection process, it is computationally intractable to integrate the perception decision with the planning decision. To prevent such expansion of the action space, we propose a greedy strategy for observation selection that aims to minimize the uncertainty in state. \nWe develop a novel point-based value iteration algorithm that incorporates the greedy strategy to achieve near-optimal uncertainty reduction for sampled belief points. 
This in turn enables the solver to efficiently approximate the reachable subspace of belief simplex by essentially separating computations related to perception from planning.\nLastly, we implement the proposed solver and demonstrate its performance and computational advantage in a range of robotic scenarios where the robot simultaneously performs active perception and planning.", "keywords": "partially observable Markov decision processes;active perception;submodular optimization;point-based value iteration;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Mahsa Ghasemi;Ufuk Topcu", "authorids": "mahsa.ghasemi@utexas.edu;utopcu@utexas.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nghasemi2019perceptionaware,\ntitle={Perception-Aware Point-Based Value Iteration for Partially Observable Markov Decision Processes},\nauthor={Mahsa Ghasemi and Ufuk Topcu},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lTg3RcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=S1lTg3RcFm", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;2", "wc_review": "729;591;465", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "588;293;276", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 595.0, 107.81465577554843 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 385.6666666666667, 143.23950417240195 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.7559289460184546, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11040388236141120656&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Exemplar Guided Unsupervised Image-to-Image Translation with Semantic Consistency", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1027", "id": "S1lTg3RqYQ", "author_site": "Liqian Ma, Xu Jia, Stamatios Georgoulis, Tinne Tuytelaars, Luc Van Gool", "tldr": "We propose the Exemplar Guided & Semantically Consistent Image-to-image Translation (EGSC-IT) network which conditions the translation process on an exemplar image in the target domain.", "abstract": "Image-to-image translation has recently received significant attention due to advances in deep learning. Most works focus on learning either a one-to-one mapping in an unsupervised way or a many-to-many mapping in a supervised way. However, a more practical setting is many-to-many mapping in an unsupervised way, which is harder due to the lack of supervision and the complex inner- and cross-domain variations. To alleviate these issues, we propose the Exemplar Guided & Semantically Consistent Image-to-image Translation (EGSC-IT) network which conditions the translation process on an exemplar image in the target domain. We assume that an image comprises of a content component which is shared across domains, and a style component specific to each domain. Under the guidance of an exemplar from the target domain we apply Adaptive Instance Normalization to the shared content component, which allows us to transfer the style information of the target domain to the source domain. 
To avoid semantic inconsistencies during translation that naturally appear due to the large inner- and cross-domain variations, we introduce the concept of feature masks that provide coarse semantic guidance without requiring the use of any semantic labels. Experimental results on various datasets show that EGSC-IT does not only translate the source image to diverse instances in the target domain, but also preserves the semantic consistency during the process. ", "keywords": "image-to-image translation;image generation;domain adaptation", "primary_area": "", "supplementary_material": "", "author": "Liqian Ma;Xu Jia;Stamatios Georgoulis;Tinne Tuytelaars;Luc Van Gool", "authorids": "liqian.ma@esat.kuleuven.be;xu.jia@esat.kuleuven.be;georgous@ee.ethz.ch;tinne.tuytelaars@esat.kuleuven.be;luc.vangool@esat.kuleuven.be", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nma2018exemplar,\ntitle={Exemplar Guided Unsupervised Image-to-Image Translation with Semantic Consistency},\nauthor={Liqian Ma and Xu Jia and Stamatios Georgoulis and Tinne Tuytelaars and Luc Van Gool},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lTg3RqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;8", "confidence": "5;4;4", "wc_review": "250;193;161", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "895;728;196", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 201.33333333333334, 36.80881536926839 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 606.3333333333334, 298.05182248878947 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 165, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6219807346238873083&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=S1lTg3RqYQ", "pdf": "https://openreview.net/pdf?id=S1lTg3RqYQ", "email": ";;;;", "author_num": 5 }, { "id": "S1lVniC5Y7", "title": "From Nodes to Networks: Evolving Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "Genetic programming to evolve new recurrent nodes for language and music. Uses a LSTM model to predict the performance of the recurrent node. ", "abstract": "Gated recurrent networks such as those composed of Long Short-Term Memory\n(LSTM) nodes have recently been used to improve state of the art in many sequential\nprocessing tasks such as speech recognition and machine translation. However,\nthe basic structure of the LSTM node is essentially the same as when it was\nfirst conceived 25 years ago. Recently, evolutionary and reinforcement learning\nmechanisms have been employed to create new variations of this structure. This\npaper proposes a new method, evolution of a tree-based encoding of the gated\nmemory nodes, and shows that it makes it possible to explore new variations more\neffectively than other methods. 
The method discovers nodes with multiple recurrent\npaths and multiple memory cells, which lead to significant improvement in the\nstandard language modeling benchmark task. Remarkably, this node did not perform\nwell in another task, music modeling, but it was possible to evolve a different\nnode that did, demonstrating that the approach discovers customized structure for\neach task. The paper also shows how the search process can be speeded up by\ntraining an LSTM network to estimate performance of candidate structures, and\nby encouraging exploration of novel solutions. Thus, evolutionary design of complex\nneural network structures promises to improve performance of deep learning\narchitectures beyond human ability to do so.", "keywords": "Recurrent neural networks;evolutionary algorithms;genetic programming", "primary_area": "", "supplementary_material": "", "author": "Aditya Rawal;Jason Liang;Risto Miikkulainen", "authorids": "aditya@cs.utexas.edu;jasonzliang@utexas.edu;risto@cs.utexas.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nrawal2019from,\ntitle={From Nodes to Networks: Evolving Recurrent Neural Networks},\nauthor={Aditya Rawal and Jason Liang and Risto Miikkulainen},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lVniC5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1lVniC5Y7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "wc_review": "484;276;198", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 319.3333333333333, 120.71269840226238 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6305287926646951237&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "title": "ARM: Augment-REINFORCE-Merge Gradient for Stochastic Binary Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/942", "id": "S1lg0jAcYm", "author_site": "Mingzhang Yin, Mingyuan Zhou", "tldr": "An unbiased and low-variance gradient estimator for discrete latent variable models", "abstract": "To backpropagate the gradients through stochastic binary layers, we propose the augment-REINFORCE-merge (ARM) estimator that is unbiased, exhibits low variance, and has low computational complexity. Exploiting variable augmentation, REINFORCE, and reparameterization, the ARM estimator achieves adaptive variance reduction for Monte Carlo integration by merging two expectations via common random numbers. The variance-reduction mechanism of the ARM estimator can also be attributed to either antithetic sampling in an augmented space, or the use of an optimal anti-symmetric \"self-control\" baseline function together with the REINFORCE estimator in that augmented space. Experimental results show the ARM estimator provides state-of-the-art performance in auto-encoding variational inference and maximum likelihood estimation, for discrete latent variable models with one or multiple stochastic binary layers. 
Python code for reproducible research is publicly available.", "keywords": "Antithetic sampling;variable augmentation;deep discrete latent variable models;variance reduction;variational auto-encoder", "primary_area": "", "supplementary_material": "", "author": "Mingzhang Yin;Mingyuan Zhou", "authorids": "mzyin@utexas.edu;mingyuan.zhou@mccombs.utexas.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nyin2018arm,\ntitle={{ARM}: Augment-{REINFORCE}-Merge Gradient for Stochastic Binary Networks},\nauthor={Mingzhang Yin and Mingyuan Zhou},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lg0jAcYm},\n}", "github": "[![github](/images/github_icon.svg) mingzhang-yin/ARM-gradient](https://github.com/mingzhang-yin/ARM-gradient)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;3;4", "wc_review": "858;205;828", "wc_reply_reviewers": "0;0;248", "wc_reply_authors": "2059;447;1241", "reply_reviewers": "0;0;1", "reply_authors": "4;1;3", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 630.3333333333334, 301.0053524810185 ], "wc_reply_reviewers_avg": [ 82.66666666666667, 116.90832115617586 ], "wc_reply_authors_avg": [ 1249.0, 658.1205563319434 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1199474822347449770&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=S1lg0jAcYm", "pdf": "https://openreview.net/pdf?id=S1lg0jAcYm", "email": ";", "author_num": 2 }, { "title": "Building Dynamic Knowledge Graphs from Text using Machine Reading Comprehension", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1066", "id": "S1lhbnRqF7", "author_site": "Rajarshi Das, Tsendsuren Munkhdalai, Eric Yuan, Adam Trischler, Andrew McCallum", "tldr": "", "abstract": "We propose a neural machine-reading model that constructs dynamic knowledge graphs from procedural text. It builds these graphs recurrently for each step of the described procedure, and uses them to track the evolving states of participant entities. We harness and extend a recently proposed machine reading comprehension(MRC) model to query for entity states, since these states are generally communicated in spans of text and MRC models perform well in extracting entity-centric spans. The explicit, structured, and evolving knowledge graph representations that our model constructs can be used in downstream question answering tasks to improve machine comprehension of text, as we demonstrate empirically. On two comprehension tasks from the recently proposed ProPara dataset, our model achieves state-of-the-art results. 
We further show that our model is competitive on the Recipes dataset, suggesting it may be generally applicable.", "keywords": "recurrent graph networks;dynamic knowledge base construction;entity state tracking;machine reading comprehension", "primary_area": "", "supplementary_material": "", "author": "Rajarshi Das;Tsendsuren Munkhdalai;Xingdi Yuan;Adam Trischler;Andrew McCallum", "authorids": "rajarshi@cs.umass.edu;tsmunkhd@microsoft.com;eric.yuan@microsoft.com;adam.trischler@microsoft.com;mccallum@cs.umass.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ndas2018building,\ntitle={Building Dynamic Knowledge Graphs from Text using Machine Reading Comprehension},\nauthor={Rajarshi Das and Tsendsuren Munkhdalai and Xingdi Yuan and Adam Trischler and Andrew McCallum},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lhbnRqF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "309;143;499", "wc_reply_reviewers": "0;0;35", "wc_reply_authors": "1018;322;531", "reply_reviewers": "0;0;1", "reply_authors": "2;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 317.0, 145.4464391680548 ], "wc_reply_reviewers_avg": [ 11.666666666666666, 16.49915822768611 ], "wc_reply_authors_avg": [ 623.6666666666666, 291.5982776964836 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6748557175668250759&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S1lhbnRqF7", "pdf": "https://openreview.net/pdf?id=S1lhbnRqF7", "email": ";;;;", "author_num": 5 }, { "id": "S1llBiR5YX", "title": "Accidental exploration through value predictors", "track": "main", "status": "Reject", "tldr": "We study the biases introduced in common value predictors by the fact that trajectories are, in practice, finite.", "abstract": "Infinite length of trajectories is an almost universal assumption in the theoretical foundations of reinforcement learning. In practice learning occurs on finite trajectories. In this paper we examine a specific result of this disparity, namely a strong bias of the time-bounded Every-visit Monte Carlo value estimator. This manifests as a vastly different learning dynamic for algorithms that use value predictors, including encouraging or discouraging exploration.\n\nWe investigate these claims theoretically for a one dimensional random walk, and empirically on a number of simple environments. 
We use GAE as an algorithm involving a value predictor and evolution strategies as a reference point.", "keywords": "reinforcement learning;value predictors;exploration", "primary_area": "", "supplementary_material": "", "author": "Tomasz Kisielewski;Damian Le\u015bniak;Maia Pasek", "authorids": "tymorl@gmail.com;damian.lesniak@doctoral.uj.edu.pl;maiapasek@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkisielewski2019accidental,\ntitle={Accidental exploration through value predictors},\nauthor={Tomasz Kisielewski and Damian Le\u015bniak and Maia Pasek},\nyear={2019},\nurl={https://openreview.net/forum?id=S1llBiR5YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1llBiR5YX", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "wc_review": "601;177;751", "wc_reply_reviewers": "0;0;596", "wc_reply_authors": "780;457;1049", "reply_reviewers": "0;0;2", "reply_authors": "1;1;2", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 509.6666666666667, 243.07109156696433 ], "wc_reply_reviewers_avg": [ 198.66666666666666, 280.95709439145486 ], "wc_reply_authors_avg": [ 762.0, 242.01790567366427 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:N0vtVpvBd4oJ:scholar.google.com/&scioq=Accidental+exploration+through+value+predictors&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "title": "Information asymmetry in KL-regularized RL", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1046", "id": "S1lqMn05Ym", "author_site": "Alexandre Galashov, Siddhant Jayakumar, Leonard Hasenclever, Dhruva Tirumala, Jonathan Schwarz, Guillaume Desjardins, Wojciech M Czarnecki, Yee Whye Teh, Razvan Pascanu, Nicolas Heess", "tldr": "Limiting state information for the default policy can improvement performance, in a KL-regularized RL framework where both agent and default policy are optimized together", "abstract": "Many real world tasks exhibit rich structure that is repeated across different parts of the state space or in time. In this work we study the possibility of leveraging such repeated structure to speed up and regularize learning. We start from the KL regularized expected reward objective which introduces an additional component, a default policy. Instead of relying on a fixed default policy, we learn it from data. But crucially, we restrict the amount of information the default policy receives, forcing it to learn reusable behaviors that help the policy learn faster. We formalize this strategy and discuss connections to information bottleneck approaches and to the variational EM algorithm. 
We present empirical results in both discrete and continuous action domains and demonstrate that, for certain tasks, learning a default policy alongside the policy can significantly speed up and improve learning.\nPlease watch the video demonstrating learned experts and default policies on several continuous control tasks ( https://youtu.be/U2qA3llzus8 ).", "keywords": "Deep Reinforcement Learning;Continuous Control;RL as Inference", "primary_area": "", "supplementary_material": "", "author": "Alexandre Galashov;Siddhant M. Jayakumar;Leonard Hasenclever;Dhruva Tirumala;Jonathan Schwarz;Guillaume Desjardins;Wojciech M. Czarnecki;Yee Whye Teh;Razvan Pascanu;Nicolas Heess", "authorids": "agalashov@google.com;sidmj@google.com;leonardh@google.com;dhruvat@google.com;schwarzjn@google.com;gdesjardins@google.com;lejlot@google.com;ywteh@google.com;razp@google.com;heess@google.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\ngalashov2018information,\ntitle={Information asymmetry in {KL}-regularized {RL}},\nauthor={Alexandre Galashov and Siddhant Jayakumar and Leonard Hasenclever and Dhruva Tirumala and Jonathan Schwarz and Guillaume Desjardins and Wojtek M. Czarnecki and Yee Whye Teh and Razvan Pascanu and Nicolas Heess},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lqMn05Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;7;7", "confidence": "5;4;3", "wc_review": "268;404;208", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "771;590;447", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 293.3333333333333, 81.99728992811626 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 602.6666666666666, 132.5753454539049 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 10, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12116543825498538093&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=S1lqMn05Ym", "pdf": "https://openreview.net/pdf?id=S1lqMn05Ym", "email": ";;;;;;;;;", "author_num": 10 }, { "title": "TimbreTron: A WaveNet(CycleGAN(CQT(Audio))) Pipeline for Musical Timbre Transfer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/994", "id": "S1lvm305YQ", "author_site": "Sicong(Sheldon) Huang, Qiyang Li, Cem Anil, Xuchan Bao, Sageev Oore, Roger Grosse", "tldr": "We present the TimbreTron, a pipeline for perfoming high-quality timbre transfer on musical waveforms using CQT-domain style transfer.", "abstract": "In this work, we address the problem of musical timbre transfer, where the goal is to manipulate the timbre of a sound sample from one instrument to match another instrument while preserving other musical content, such as pitch, rhythm, and loudness. 
In principle, one could apply image-based style transfer techniques to a time-frequency representation of an audio signal, but this depends on having a representation that allows independent manipulation of timbre as well as high-quality waveform generation. We introduce TimbreTron, a method for musical timbre transfer which applies \u201cimage\u201d domain style transfer to a time-frequency representation of the audio signal, and then produces a high-quality waveform using a conditional WaveNet synthesizer. We show that the Constant Q Transform (CQT) representation is particularly well-suited to convolutional architectures due to its approximate pitch equivariance. Based on human perceptual evaluations, we confirmed that TimbreTron recognizably transferred the timbre while otherwise preserving the musical content, for both monophonic and polyphonic samples. We made an accompanying demo video here: https://www.cs.toronto.edu/~huang/TimbreTron/index.html which we strongly encourage you to watch before reading the paper.", "keywords": "Generative models;Timbre Transfer;Wavenet;CycleGAN", "primary_area": "", "supplementary_material": "", "author": "Sicong Huang;Qiyang Li;Cem Anil;Xuchan Bao;Sageev Oore;Roger B. Grosse", "authorids": "huang@cs.toronto.edu;colinli@cs.toronto.edu;anilcem@cs.toronto.edu;jennybao@cs.toronto.edu;sageev@dal.ca;rgrosse@cs.toronto.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nhuang2018timbretron,\ntitle={TimbreTron: A WaveNet(Cycle{GAN}({CQT}(Audio))) Pipeline for Musical Timbre Transfer},\nauthor={Sicong Huang and Qiyang Li and Cem Anil and Xuchan Bao and Sageev Oore and Roger B. Grosse},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lvm305YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;7;8", "confidence": "5;4;4", "wc_review": "756;542;314", "wc_reply_reviewers": "557;0;0", "wc_reply_authors": "1475;274;207", "reply_reviewers": "1;0;0", "reply_authors": "4;1;1", "rating_avg": [ 6.333333333333333, 1.699673171197595 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 537.3333333333334, 180.47591405934355 ], "wc_reply_reviewers_avg": [ 185.66666666666666, 262.57231808060465 ], "wc_reply_authors_avg": [ 652.0, 582.5913376172587 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9707253433941508, "gs_citation": 146, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11196022310662002190&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=S1lvm305YQ", "pdf": "https://openreview.net/pdf?id=S1lvm305YQ", "email": ";;;;;", "author_num": 6 }, { "id": "S1lwRjR9YX", "title": "Stability of Stochastic Gradient Method with Momentum for Strongly Convex Loss Functions", "track": "main", "status": "Reject", "tldr": "Stochastic gradient method with momentum generalizes.", "abstract": "While momentum-based methods, in conjunction with the stochastic gradient descent, are widely used when training machine learning models, there is little theoretical understanding on the generalization error of such methods. 
In practice, the momentum parameter is often chosen in a heuristic fashion with little theoretical guidance. In this work, we use the framework of algorithmic stability to provide an upper-bound on the generalization error for the class of strongly convex loss functions, under mild technical assumptions. Our bound decays to zero inversely with the size of the training set, and increases as the momentum parameter is increased. We also develop an upper-bound on the expected true risk, in terms of the number of training steps, the size of the training set, and the momentum parameter.", "keywords": "Generalization Error;Stochastic Gradient Descent;Uniform Stability", "primary_area": "", "supplementary_material": "", "author": "Ali Ramezani-Kebrya;Ashish Khisti;and Ben Liang", "authorids": "aramezani@ece.utoronto.ca;akhisti@ece.utoronto.ca;liang@ece.utoronto.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nramezani-kebrya2019stability,\ntitle={Stability of Stochastic Gradient Method with Momentum for Strongly Convex Loss Functions},\nauthor={Ali Ramezani-Kebrya and Ashish Khisti and and Ben Liang},\nyear={2019},\nurl={https://openreview.net/forum?id=S1lwRjR9YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1lwRjR9YX", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;5;4", "wc_review": "221;634;93", "wc_reply_reviewers": "312;394;131", "wc_reply_authors": "596;874;166", "reply_reviewers": "4;1;2", "reply_authors": "5;2;2", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 316.0, 230.85204496964428 ], "wc_reply_reviewers_avg": [ 279.0, 109.87568733194193 ], "wc_reply_authors_avg": [ 545.3333333333334, 291.2517048114149 ], "reply_reviewers_avg": [ 2.3333333333333335, 1.247219128924647 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12114780272557196651&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Whitening and Coloring Batch Transform for GANs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/992", "id": "S1x2Fj0qKQ", "author_site": "Aliaksandr Siarohin, Enver Sangineto, Nicu Sebe", "tldr": "", "abstract": "Batch Normalization (BN) is a common technique used to speed-up and stabilize training. On the other hand, the learnable parameters of BN are commonly used in conditional Generative Adversarial Networks (cGANs) for representing class-specific information using conditional Batch Normalization (cBN). In this paper we propose to generalize both BN and cBN using a Whitening and Coloring based batch normalization. We show that our conditional Coloring can represent categorical conditioning information which largely helps the cGAN qualitative results. Moreover, we show that full-feature whitening is important in a general GAN scenario in which the training process is known to be highly unstable. We test our approach on different datasets and using different GAN networks and training protocols, showing a consistent improvement in all the tested frameworks. 
Our CIFAR-10 conditioned results are higher than all previous works on this dataset.", "keywords": "Generative Adversarial Networks;conditional GANs;Batch Normalization", "primary_area": "", "supplementary_material": "", "author": "Aliaksandr Siarohin;Enver Sangineto;Nicu Sebe", "authorids": "aliaksandr.siarohin@unitn.it;enver.sangineto@unitn.it;niculae.sebe@unitn.it", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsiarohin2018whitening,\ntitle={Whitening and Coloring transform for {GAN}s},\nauthor={Aliaksandr Siarohin and Enver Sangineto and Nicu Sebe},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1x2Fj0qKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;2", "wc_review": "958;180;236", "wc_reply_reviewers": "235;0;0", "wc_reply_authors": "3385;435;932", "reply_reviewers": "3;0;0", "reply_authors": "7;1;2", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 458.0, 354.2917818220833 ], "wc_reply_reviewers_avg": [ 78.33333333333333, 110.78006238589245 ], "wc_reply_authors_avg": [ 1584.0, 1289.5614241542225 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 3.3333333333333335, 2.6246692913372702 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8343777033924906329&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=S1x2Fj0qKQ", "pdf": "https://openreview.net/pdf?id=S1x2Fj0qKQ", "email": ";;", "author_num": 3 }, { "id": "S1x2aiRqFX", "title": "Differentiable Expected BLEU for Text Generation", "track": "main", "status": "Reject", "tldr": "A new differentiable expected BLEU objective that is end-to-end trainable with gradient descent for neural text generation models", "abstract": "Neural text generation models such as recurrent networks are typically trained by maximizing data log-likelihood based on cross entropy. Such training objective shows a discrepancy from test criteria like the BLEU metric. Recent work optimizes expected BLEU under the model distribution using policy gradient, while such algorithm can suffer from high variance and become impractical. In this paper, we propose a new Differentiable Expected BLEU (DEBLEU) objective that permits direct optimization of neural generation models with gradient descent. We leverage the decomposability and sparsity of BLEU, and reformulate it with moderate approximations, making the evaluation of the objective and its gradient efficient, comparable to common cross-entropy loss. We further devise a simple training procedure with ground-truth masking and annealing for stable optimization. Experiments on neural machine translation and image captioning show our method significantly improves over both cross-entropy and policy gradient training.", "keywords": "text generation;BLEU;differentiable;gradient descent;maximum likelihood learning;policy gradient;machine translation", "primary_area": "", "supplementary_material": "", "author": "Wentao Wang;Zhiting Hu;Zichao Yang;Haoran Shi;Eric P. 
Xing", "authorids": "wwt10@pku.edu.cn;zhitinghu@gmail.com;yangtze2301@gmail.com;shr970423@gmail.com;epxing@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nwang2019differentiable,\ntitle={Differentiable Expected {BLEU} for Text Generation},\nauthor={Wentao Wang and Zhiting Hu and Zichao Yang and Haoran Shi and Eric P. Xing},\nyear={2019},\nurl={https://openreview.net/forum?id=S1x2aiRqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1x2aiRqFX", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;4", "wc_review": "743;343;243", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 443.0, 216.02468994692867 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:g88D9eBdudIJ:scholar.google.com/&scioq=Differentiable+Expected+BLEU+for+Text+Generation&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Temporal Difference Variational Auto-Encoder", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1084", "id": "S1x4ghC9tQ", "author_site": "Karol Gregor, George Papamakarios, Frederic Besse, Lars Buesing, Theophane Weber", "tldr": "Generative model of temporal data, that builds online belief state, operates in latent space, does jumpy predictions and rollouts of states.", "abstract": "To act and plan in complex environments, we posit that agents should have a mental simulator of the world with three characteristics: (a) it should build an abstract state representing the condition of the world; (b) it should form a belief which represents uncertainty on the world; (c) it should go beyond simple step-by-step simulation, and exhibit temporal abstraction. Motivated by the absence of a model satisfying all these requirements, we propose TD-VAE, a generative sequence model that learns representations containing explicit beliefs about states several steps into the future, and that can be rolled out directly without single-step transitions. 
TD-VAE is trained on pairs of temporally separated time points, using an analogue of temporal difference learning used in reinforcement learning.", "keywords": "generative models;variational auto-encoders;state space models;temporal difference learning", "primary_area": "", "supplementary_material": "", "author": "Karol Gregor;George Papamakarios;Frederic Besse;Lars Buesing;Theophane Weber", "authorids": "karol.gregor@gmail.com;g.papamakarios@ed.ac.uk;fbesse@google.com;lbuesing@google.com;theophane@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngregor2018temporal,\ntitle={Temporal Difference Variational Auto-Encoder},\nauthor={Karol Gregor and George Papamakarios and Frederic Besse and Lars Buesing and Theophane Weber},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1x4ghC9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;8;9", "confidence": "5;4;4", "wc_review": "211;188;165", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "147;214;112", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 8.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 188.0, 18.7794213613377 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 157.66666666666666, 42.3188951756646 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 161, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2182933855734309592&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=S1x4ghC9tQ", "pdf": "https://openreview.net/pdf?id=S1x4ghC9tQ", "email": ";;;;", "author_num": 5 }, { "id": "S1x8WnA5Ym", "title": "Learning Diverse Generations using Determinantal Point Processes", "track": "main", "status": "Reject", "tldr": "The addition of a diversity criterion inspired from DPP in the GAN objective avoids mode collapse and leads to better generations. ", "abstract": "Generative models have proven to be an outstanding tool for representing high-dimensional probability distributions and generating realistic looking images. A fundamental characteristic of generative models is their ability to produce multi-modal outputs. However, while training, they are often susceptible to mode collapse, which means that the model is limited in mapping the input noise to only a few modes of the true data distribution. In this paper, we draw inspiration from Determinantal Point Process (DPP) to devise a generative model that alleviates mode collapse while producing higher quality samples. DPP is an elegant probabilistic measure used to model negative correlations within a subset and hence quantify its diversity. We use DPP kernel to model the diversity in real data as well as in synthetic data. Then, we devise a generation penalty term that encourages the generator to synthesize data with a similar diversity to real data. In contrast to previous state-of-the-art generative models that tend to use additional trainable parameters or complex training paradigms, our method does not change the original training scheme. 
Embedded in an adversarial training and variational autoencoder, our Generative DPP approach shows a consistent resistance to mode-collapse on a wide-variety of synthetic data and natural image datasets including MNIST, CIFAR10, and CelebA, while outperforming state-of-the-art methods for data-efficiency, convergence-time, and generation quality. Our code will be made publicly available.", "keywords": "Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Mohamed Elfeki;Camille Couprie;Mohamed Elhoseiny", "authorids": "m.elfeki11@gmail.com;coupriec@fb.com;elhoseiny@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nelfeki2019learning,\ntitle={Learning Diverse Generations using Determinantal Point Processes},\nauthor={Mohamed Elfeki and Camille Couprie and Mohamed Elhoseiny},\nyear={2019},\nurl={https://openreview.net/forum?id=S1x8WnA5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1x8WnA5Ym", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;5;4", "wc_review": "587;300;441", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "618;277;443", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 442.6666666666667, 117.17318616285706 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 446.0, 139.2288284324287 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Y0_M09Be1WoJ:scholar.google.com/&scioq=Learning+Diverse+Generations+using+Determinantal+Point+Processes&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "S1xBioR5KX", "title": "Parameter efficient training of deep convolutional neural networks by dynamic sparse reparameterization", "track": "main", "status": "Reject", "tldr": "We describe a dynamic sparse reparameterization technique that allow training of a small sparse network to generalize on par with, or better than, a full-sized dense model compressed to the same size. ", "abstract": "Modern deep neural networks are highly overparameterized, and often of huge sizes. A number of post-training model compression techniques, such as distillation, pruning and quantization, can reduce the size of network parameters by a substantial fraction with little loss in performance. However, training a small network of the post-compression size de novo typically fails to reach the same level of accuracy achieved by compression of a large network, leading to a widely-held belief that gross overparameterization is essential to effective learning. In this work, we argue that this is not necessarily true. We describe a dynamic sparse reparameterization technique that closed the performance gap between a model compressed through iterative pruning and a model of the post-compression size trained de novo. We applied our method to training deep residual networks and showed that it outperformed existing reparameterization techniques, yielding the best accuracy for a given parameter budget for training. 
Compared to existing dynamic reparameterization methods that reallocate non-zero parameters during training, our approach achieved better performance at lower computational cost. Our method is not only of practical value for training under stringent memory constraints, but also potentially informative to theoretical understanding of generalization properties of overparameterized deep neural networks. \n\n", "keywords": "sparse;reparameterization;overparameterization;convolutional neural network;training;compression;pruning", "primary_area": "", "supplementary_material": "", "author": "Hesham Mostafa;Xin Wang", "authorids": "hesham.mostafa@intel.com;xin3.wang@intel.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmostafa2019parameter,\ntitle={Parameter efficient training of deep convolutional neural networks by dynamic sparse reparameterization},\nauthor={Hesham Mostafa and Xin Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xBioR5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1xBioR5KX", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;4", "wc_review": "657;286;276", "wc_reply_reviewers": "416;0;25", "wc_reply_authors": "2039;694;341", "reply_reviewers": "1;0;1", "reply_authors": "6;2;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 406.3333333333333, 177.29510866224018 ], "wc_reply_reviewers_avg": [ 147.0, 190.48534501810542 ], "wc_reply_authors_avg": [ 1024.6666666666667, 731.5765320335407 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 2.160246899469287 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 416, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3342252922777294975&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "title": "Learnable Embedding Space for Efficient Neural Architecture Compression", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/834", "id": "S1xLN3C9YX", "author_site": "Shengcao Cao, Xiaofang Wang, Kris M Kitani", "tldr": "We propose a method to incrementally learn an embedding space over the domain of network architectures, to enable the careful selection of architectures for evaluation during compressed architecture search.", "abstract": "We propose a method to incrementally learn an embedding space over the domain of network architectures, to enable the careful selection of architectures for evaluation during compressed architecture search. Given a teacher network, we search for a compressed network architecture by using Bayesian Optimization (BO) with a kernel function defined over our proposed embedding space to select architectures for evaluation. We demonstrate that our search algorithm can significantly outperform various baseline methods, such as random search and reinforcement learning (Ashok et al., 2018). The compressed architectures found by our method are also better than the state-of-the-art manually-designed compact architecture ShuffleNet (Zhang et al., 2018). 
We also demonstrate that the learned embedding space can be transferred to new settings for architecture search, such as a larger teacher network or a teacher network in a different architecture family, without any training.", "keywords": "Network Compression;Neural Architecture Search;Bayesian Optimization;Architecture Embedding", "primary_area": "", "supplementary_material": "", "author": "Shengcao Cao;Xiaofang Wang;Kris M. Kitani", "authorids": "caoshengcao@pku.edu.cn;xiaofan2@cs.cmu.edu;kkitani@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ncao2018learnable,\ntitle={Learnable Embedding Space for Efficient Neural Architecture Compression},\nauthor={Shengcao Cao and Xiaofang Wang and Kris M. Kitani},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xLN3C9YX},\n}", "github": "[![github](/images/github_icon.svg) Friedrich1006/ESNAC](https://github.com/Friedrich1006/ESNAC) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=S1xLN3C9YX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;3;4", "wc_review": "442;434;805", "wc_reply_reviewers": "185;27;166", "wc_reply_authors": "1293;489;2356", "reply_reviewers": "1;1;3", "reply_authors": "2;1;7", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 560.3333333333334, 173.03628392783853 ], "wc_reply_reviewers_avg": [ 126.0, 70.43200030289263 ], "wc_reply_authors_avg": [ 1379.3333333333333, 764.6403657203097 ], "reply_reviewers_avg": [ 1.6666666666666667, 0.9428090415820634 ], "reply_authors_avg": [ 3.3333333333333335, 2.6246692913372702 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=117198627951999316&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=S1xLN3C9YX", "pdf": "https://openreview.net/pdf?id=S1xLN3C9YX", "email": ";;", "author_num": 3 }, { "id": "S1xLZ2R5KQ", "title": "Maximum a Posteriori on a Submanifold: a General Image Restoration Method with GAN", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a general method for various image restoration problems, such as denoising, deblurring, super-resolution and inpainting. The problem is formulated as a constrained optimization problem. Its objective is to maximize a posteriori probability of latent variables, and its constraint is that the image generated by these latent variables must be the same as the degraded image. We use a Generative Adversarial Network (GAN) as our density estimation model. 
Convincing results are obtained on MNIST dataset.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Fangzhou Luo;Xiaolin Wu", "authorids": "fluo1993@gmail.com;xwu510@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nluo2019maximum,\ntitle={Maximum a Posteriori on a Submanifold: a General Image Restoration Method with {GAN}},\nauthor={Fangzhou Luo and Xiaolin Wu},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xLZ2R5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=S1xLZ2R5KQ", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;5;4", "wc_review": "223;335;684", "wc_reply_reviewers": "146;0;0", "wc_reply_authors": "358;172;115", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 414.0, 196.31776961514885 ], "wc_reply_reviewers_avg": [ 48.666666666666664, 68.82506003549064 ], "wc_reply_authors_avg": [ 215.0, 103.75933692926145 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5044439315458759341&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "On the Sensitivity of Adversarial Robustness to Input Data Distributions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1112", "id": "S1xNEhR9KX", "author_site": "Gavin Ding, Yik Chau Lui, Xiaomeng Jin, Luyu Wang, Ruitong Huang", "tldr": "Robustness performance of PGD trained models are sensitive to semantics-preserving transformation of image datasets, which implies the trickiness of evaluation of robust learning algorithms in practice.", "abstract": "Neural networks are vulnerable to small adversarial perturbations. Existing literature largely focused on understanding and mitigating the vulnerability of learned models. In this paper, we demonstrate an intriguing phenomenon about the most popular robust training method in the literature, adversarial training: Adversarial robustness, unlike clean accuracy, is sensitive to the input data distribution. Even a semantics-preserving transformations on the input data distribution can cause a significantly different robustness for the adversarial trained model that is both trained and evaluated on the new distribution. Our discovery of such sensitivity on data distribution is based on a study which disentangles the behaviors of clean accuracy and robust accuracy of the Bayes classifier. Empirical investigations further confirm our finding. We construct semantically-identical variants for MNIST and CIFAR10 respectively, and show that standardly trained models achieve comparable clean accuracies on them, but adversarially trained models achieve significantly different robustness accuracies. This counter-intuitive phenomenon indicates that input data distribution alone can affect the adversarial robustness of trained neural networks, not necessarily the tasks themselves. 
Lastly, we discuss the practical implications on evaluating adversarial robustness, and make initial attempts to understand this complex phenomenon.", "keywords": "adversarial robustness;adversarial training;PGD training;adversarial perturbation;input data distribution", "primary_area": "", "supplementary_material": "", "author": "Gavin Weiguang Ding;Kry Yik Chau Lui;Xiaomeng Jin;Luyu Wang;Ruitong Huang", "authorids": "gavin.ding@borealisai.com;yikchau.y.lui@borealisai.com;tracy.jin@mail.utoronto.ca;luyu.wang@borealisai.com;ruitong.huang@borealisai.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nding2018on,\ntitle={On the Sensitivity of Adversarial Robustness to Input Data Distributions},\nauthor={Gavin Weiguang Ding and Kry Yik-Chau Lui and Xiaomeng Jin and Luyu Wang and Ruitong Huang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xNEhR9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;2;3", "wc_review": "205;155;163", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "660;119;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 174.33333333333334, 21.9291789378647 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 259.6666666666667, 287.216913769986 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 68, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=925121948530123203&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=S1xNEhR9KX", "pdf": "https://openreview.net/pdf?id=S1xNEhR9KX", "email": ";;;;", "author_num": 5 }, { "title": "Minimal Images in Deep Neural Networks: Fragile Object Recognition in Natural Images", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/679", "id": "S1xNb2A9YX", "author_site": "Sanjana Srivastava, Guy Ben-Yosef, Xavier Boix", "tldr": "", "abstract": "The human ability to recognize objects is impaired when the object is not shown in full. \"Minimal images\" are the smallest regions of an image that remain recognizable for humans. Ullman et al. (2016) show that a slight modification of the location and size of the visible region of the minimal image produces a sharp drop in human recognition accuracy. In this paper, we demonstrate that such drops in accuracy due to changes of the visible region are a common phenomenon between humans and existing state-of-the-art deep neural networks (DNNs), and are much more prominent in DNNs. We found many cases where DNNs classified one region correctly and the other incorrectly, though they only differed by one row or column of pixels, and were often bigger than the average human minimal image size. We show that this phenomenon is independent from previous works that have reported lack of invariance to minor modifications in object location in DNNs. Our results thus reveal a new failure mode of DNNs that also affects humans to a much lesser degree. 
They expose how fragile DNN recognition ability is in natural images even without adversarial patterns being introduced. Bringing the robustness of DNNs in natural images to the human level remains an open challenge for the community. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sanjana Srivastava;Guy Ben-Yosef;Xavier Boix", "authorids": "sanjanas@mit.edu;gby@csail.mit.edu;xboix@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsrivastava2018minimal,\ntitle={Minimal Images in Deep Neural Networks: Fragile Object Recognition in Natural Images},\nauthor={Sanjana Srivastava and Guy Ben-Yosef and Xavier Boix},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xNb2A9YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "302;212;215", "wc_reply_reviewers": "66;0;0", "wc_reply_authors": "703;181;285", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 243.0, 41.737273509418415 ], "wc_reply_reviewers_avg": [ 22.0, 31.11269837220809 ], "wc_reply_authors_avg": [ 389.6666666666667, 225.5915680063321 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9004031207048584319&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=S1xNb2A9YX", "pdf": "https://openreview.net/pdf?id=S1xNb2A9YX", "email": ";;", "author_num": 3 }, { "title": "A Statistical Approach to Assessing Neural Network Robustness", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/767", "id": "S1xcx3C5FX", "author_site": "Stefan Webb, Tom Rainforth, Yee Whye Teh, M. Pawan Kumar", "tldr": "We introduce a statistical approach to assessing neural network robustness that provides an informative notion of how robust a network is, rather than just the conventional binary assertion of whether or not of property is violated.", "abstract": "We present a new approach to assessing the robustness of neural networks based on estimating the proportion of inputs for which a property is violated. Specifically, we estimate the probability of the event that the property is violated under an input model. Our approach critically varies from the formal verification framework in that when the property can be violated, it provides an informative notion of how robust the network is, rather than just the conventional assertion that the network is not verifiable. Furthermore, it provides an ability to scale to larger networks than formal verification approaches. Though the framework still provides a formal guarantee of satisfiability whenever it successfully finds one or more violations, these advantages do come at the cost of only providing a statistical estimate of unsatisfiability whenever no violation is found. 
Key to the practical success of our approach is an adaptation of multi-level splitting, a Monte Carlo approach for estimating the probability of rare events, to our statistical robustness framework. We demonstrate that our approach is able to emulate formal verification procedures on benchmark problems, while scaling to larger networks and providing reliable additional information in the form of accurate estimates of the violation probability.", "keywords": "neural network verification;multi-level splitting;formal verification", "primary_area": "", "supplementary_material": "", "author": "Stefan Webb;Tom Rainforth;Yee Whye Teh;M. Pawan Kumar", "authorids": "info@stefanwebb.me;twgr@robots.ox.ac.uk;y.w.teh@stats.ox.ac.uk;pawan@robots.ox.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwebb2018statistical,\ntitle={Statistical Verification of Neural Networks},\nauthor={Stefan Webb and Tom Rainforth and Yee Whye Teh and M. Pawan Kumar},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xcx3C5FX},\n}", "github": "[![github](/images/github_icon.svg) oval-group/statistical-robustness](https://github.com/oval-group/statistical-robustness)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;4;3", "wc_review": "839;255;360", "wc_reply_reviewers": "673;0;0", "wc_reply_authors": "2135;833;486", "reply_reviewers": "2;0;0", "reply_authors": "4;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 484.6666666666667, 254.19196595399225 ], "wc_reply_reviewers_avg": [ 224.33333333333334, 317.2552424923643 ], "wc_reply_authors_avg": [ 1151.3333333333333, 709.836757446543 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7897732150648450452&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=S1xcx3C5FX", "pdf": "https://openreview.net/pdf?id=S1xcx3C5FX", "email": ";;;", "author_num": 4 }, { "id": "S1xiOjC9F7", "title": "Graph Matching Networks for Learning the Similarity of Graph Structured Objects", "track": "main", "status": "Reject", "tldr": "We tackle the problem of similarity learning for structured objects with applications in particular in computer security, and propose a new model graph matching networks that excels on this task.", "abstract": "This paper addresses the challenging problem of retrieval and matching of graph structured objects, and makes two key contributions. First, we demonstrate how Graph Neural Networks (GNN), which have emerged as an effective model for various supervised prediction problems defined on structured data, can be trained to produce embedding of graphs in vector spaces that enables efficient similarity reasoning. Second, we propose a novel Graph Matching Network model that, given a pair of graphs as input, computes a similarity score between them by jointly reasoning on the pair through a new cross-graph attention-based matching mechanism. 
We demonstrate the effectiveness of our models on different domains including the challenging problem of control-flow-graph based function similarity search that plays an important role in the detection of vulnerabilities in software systems. The experimental analysis demonstrates that our models are not only able to exploit structure in the context of similarity learning but they can also outperform domain-specific baseline systems that have been carefully hand-engineered for these problems.", "keywords": "Similarity learning;structured objects;graph matching networks", "primary_area": "", "supplementary_material": "", "author": "Yujia Li;Chenjie Gu;Thomas Dullien;Oriol Vinyals;Pushmeet Kohli", "authorids": "yujiali@google.com;gcj@google.com;thomasdullien@google.com;vinyals@google.com;pushmeet@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019graph,\ntitle={Graph Matching Networks for Learning the Similarity of Graph Structured Objects},\nauthor={Yujia Li and Chenjie Gu and Thomas Dullien and Oriol Vinyals and Pushmeet Kohli},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xiOjC9F7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=S1xiOjC9F7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=S1xiOjC9F7", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "wc_review": "363;534;254", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "456;751;870", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 383.6666666666667, 115.23984650381232 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 692.3333333333334, 174.03128709771954 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 777, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13607991845696425216&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12 }, { "id": "S1xjdoC9Fm", "title": "Offline Deep models calibration with bayesian neural networks", "track": "main", "status": "Reject", "tldr": "We apply bayesian neural networks to improve calibration", "abstract": "We apply Bayesian Neural Networks to improve calibration of state-of-the-art deep\nneural networks. We show that, even with the most basic amortized approximate\nposterior distribution, and fast fully connected neural network for the likelihood,\nthe Bayesian framework clearly outperforms other simple maximum likelihood\nbased solutions that have recently shown very good performance, as temperature\nscaling. As an example, we reduce the Expected Calibration\nError (ECE) from 0.52 to 0.24 on CIFAR-10 and from 4.28 to 2.456 on CIFAR-100\non two Wide ResNet with 96.13% and 80.39% accuracy respectively, which are\namong the best results published for this task. We demonstrate our robustness and\nperformance with experiments on a wide set of state-of-the-art computer vision\nmodels. Moreover, our approach acts off-line, and thus can be applied to any\nprobabilistic model regardless of the limitations that the model may present during\ntraining. 
This make it suitable to calibrate systems that make use of pre-trained\ndeep neural networks that are expensive to train for a specific task, or to directly\ntrain a calibrated deep convolutional model with Monte Carlo Dropout approximations, among others. However,\nour method is still complementary with any Bayesian Neural Network for further\nimprovement.", "keywords": "calibration;deep models;bayesian neural networks", "primary_area": "", "supplementary_material": "", "author": "Juan Maro\u00f1as;Roberto Paredes;Daniel Ramos", "authorids": "jmaronasm@gmail.com;rparedes@dsic.upv.es;daniel.ramos@uam.es", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmaro\u00f1as2019offline,\ntitle={Offline Deep models calibration with bayesian neural networks},\nauthor={Juan Maro\u00f1as and Roberto Paredes and Daniel Ramos},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xjdoC9Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1xjdoC9Fm", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "wc_review": "1092;278;332", "wc_reply_reviewers": "60;40;0", "wc_reply_authors": "278;360;293", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 567.3333333333334, 371.6497754009217 ], "wc_reply_reviewers_avg": [ 33.333333333333336, 24.944382578492945 ], "wc_reply_authors_avg": [ 310.3333333333333, 35.64952859280034 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RscB-N3LLasJ:scholar.google.com/&scioq=Offline+Deep+models+calibration+with+bayesian+neural+networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "S1xoy3CcYX", "title": "Adversarial Examples Are a Natural Consequence of Test Error in Noise", "track": "main", "status": "Reject", "tldr": "Small adversarial perturbations should be expected given observed error rates of models outside the natural data distribution.", "abstract": " Over the last few years, the phenomenon of adversarial examples --- maliciously constructed inputs that fool trained machine learning models --- has captured the attention of the research community, especially when the adversary is restricted to making small modifications of a correctly handled input. At the same time, less surprisingly, image classifiers lack human-level performance on randomly corrupted images, such as images with additive Gaussian noise. In this work, we show that these are two manifestations of the same underlying phenomenon. We establish this connection in several ways. First, we find that adversarial examples exist at the same distance scales we would expect from a linear model with the same performance on corrupted images. Next, we show that Gaussian data augmentation during training improves robustness to small adversarial perturbations and that adversarial training improves robustness to several types of image corruptions. 
Finally, we present a model-independent upper bound on the distance from a corrupted image to its nearest error given test performance and show that in practice we already come close to achieving the bound, so that improving robustness further for the corrupted image distribution requires significantly reducing test error. All of this suggests that improving adversarial robustness should go hand in hand with improving performance in the presence of more general and realistic image corruptions. This yields a computationally tractable evaluation metric for defenses to consider: test error in noisy image distributions.", "keywords": "Adversarial examples;generalization", "primary_area": "", "supplementary_material": "", "author": "Nicolas Ford;Justin Gilmer;Ekin D. Cubuk", "authorids": "nicf@google.com;gilmer@google.com;cubuk@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nford2019adversarial,\ntitle={Adversarial Examples Are a Natural Consequence of Test Error in Noise},\nauthor={Nicolas Ford and Justin Gilmer and Ekin D. Cubuk},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xoy3CcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=S1xoy3CcYX", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;4;3", "wc_review": "291;402;1121", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;185;915", "reply_reviewers": "0;0;0", "reply_authors": "0;1;2", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 604.6666666666666, 367.904274989508 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 366.6666666666667, 395.0175804807792 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 202, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17742461639569030742&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "A Unified Theory of Early Visual Representations from Retina to Cortex through Anatomically Constrained Deep CNNs", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/732", "id": "S1xq3oR5tQ", "author_site": "Jack Lindsey, Samuel Ocko, Surya Ganguli, Stephane Deny", "tldr": "We reproduced neural representations found in biological visual systems by simulating their neural resource constraints in a deep convolutional model.", "abstract": "The vertebrate visual system is hierarchically organized to process visual information in successive stages. Neural representations vary drastically across the first stages of visual processing: at the output of the retina, ganglion cell receptive fields (RFs) exhibit a clear antagonistic center-surround structure, whereas in the primary visual cortex (V1), typical RFs are sharply tuned to a precise orientation. There is currently no unified theory explaining these differences in representations across layers. 
Here, using a deep convolutional neural network trained on image recognition as a model of the visual system, we show that such differences in representation can emerge as a direct consequence of different neural resource constraints on the retinal and cortical networks, and for the first time we find a single model from which both geometries spontaneously emerge at the appropriate stages of visual processing. The key constraint is a reduced number of neurons at the retinal output, consistent with the anatomy of the optic nerve as a stringent bottleneck. Second, we find that, for simple downstream cortical networks, visual representations at the retinal output emerge as nonlinear and lossy feature detectors, whereas they emerge as linear and faithful encoders of the visual scene for more complex cortical networks. This result predicts that the retinas of small vertebrates (e.g. salamander, frog) should perform sophisticated nonlinear computations, extracting features directly relevant to behavior, whereas retinas of large animals such as primates should mostly encode the visual scene linearly and respond to a much broader range of stimuli. These predictions could reconcile the two seemingly incompatible views of the retina as either performing feature extraction or efficient coding of natural scenes, by suggesting that all vertebrates lie on a spectrum between these two objectives, depending on the degree of neural resources allocated to their visual system.", "keywords": "visual system;convolutional neural networks;efficient coding;retina", "primary_area": "", "supplementary_material": "", "author": "Jack Lindsey;Samuel A. Ocko;Surya Ganguli;Stephane Deny", "authorids": "lindsey6@stanford.edu;socko@stanford.edu;sganguli@stanford.edu;sdeny@stanford.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlindsey2018the,\ntitle={The effects of neural resource constraints on early visual representations },\nauthor={Jack Lindsey and Samuel A. 
Ocko and Surya Ganguli and Stephane Deny},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xq3oR5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "8;8;8", "confidence": "3;5;5", "wc_review": "424;216;302", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "880;77;1040", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 314.0, 85.33854150773065 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 665.6666666666666, 421.34414543089224 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2073469512347644047&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=S1xq3oR5tQ", "pdf": "https://openreview.net/pdf?id=S1xq3oR5tQ", "email": ";;;", "author_num": 4 }, { "title": "Improving Sequence-to-Sequence Learning via Optimal Transport", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/831", "id": "S1xtAjR5tX", "author_site": "Liqun Chen, Yizhe Zhang, Ruiyi Zhang, Chenyang Tao, Zhe Gan, Haichao Zhang, Bai Li, Dinghan Shen, Changyou Chen, Lawrence Carin", "tldr": "", "abstract": "Sequence-to-sequence models are commonly trained via maximum likelihood estimation (MLE). However, standard MLE training considers a word-level objective, predicting the next word given the previous ground-truth partial sentence. This procedure focuses on modeling local syntactic patterns, and may fail to capture long-range semantic structure. We present a novel solution to alleviate these issues. Our approach imposes global sequence-level guidance via new supervision based on optimal transport, enabling the overall characterization and preservation of semantic features. We further show that this method can be understood as a Wasserstein gradient flow trying to match our model to the ground truth sequence distribution. 
Extensive experiments are conducted to validate the utility of the proposed approach, showing consistent improvements over a wide variety of NLP tasks, including machine translation, abstractive text summarization, and image captioning.", "keywords": "NLP;optimal transport;sequence to sequence;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Liqun Chen;Yizhe Zhang;Ruiyi Zhang;Chenyang Tao;Zhe Gan;Haichao Zhang;Bai Li;Dinghan Shen;Changyou Chen;Lawrence Carin", "authorids": "liqun.chen@duke.edu;yizhe.zhang@microsoft.com;rz68@duke.edu;chenyang.tao@duke.edu;zhe.gan@microsoft.com;hczhang1@gmail.com;bai.li@duke.edu;dinghan.shen@duke.edu;cchangyou@gmail.com;lcarin@duke.edu", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\nchen2018improving,\ntitle={Improving Sequence-to-Sequence Learning via Optimal Transport},\nauthor={Liqun Chen and Yizhe Zhang and Ruiyi Zhang and Chenyang Tao and Zhe Gan and Haichao Zhang and Bai Li and Dinghan Shen and Changyou Chen and Lawrence Carin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xtAjR5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "wc_review": "234;1035;302", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "593;872;286", "reply_reviewers": "0;0;0", "reply_authors": "1;3;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 523.6666666666666, 362.6314321119386 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 583.6666666666666, 239.3245123722646 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 10, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9397580809910077889&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=S1xtAjR5tX", "pdf": "https://openreview.net/pdf?id=S1xtAjR5tX", "email": ";;;;;;;;;", "author_num": 10 }, { "id": "S1xzyhR9Y7", "title": "Improving Sentence Representations with Multi-view Frameworks", "track": "main", "status": "Reject", "tldr": "Multi-view learning improves unsupervised sentence representation learning", "abstract": "Multi-view learning can provide self-supervision when different views are available of the same data. Distributional hypothesis provides another form of useful self-supervision from adjacent sentences which are plentiful in large unlabelled corpora. Motivated by the asymmetry in the two hemispheres of the human brain as well as the observation that different learning architectures tend to emphasise different aspects of sentence meaning, we present two multi-view frameworks for learning sentence representations in an unsupervised fashion. One framework uses a generative objective and the other a discriminative one. In both frameworks, the final representation is an ensemble of two views, in which, one view encodes the input sentence with a Recurrent Neural Network (RNN), and the other view encodes it with a simple linear model. 
We show that, after learning, the vectors produced by our multi-view frameworks provide improved representations over their single-view learnt counterparts, and the combination of different views gives representational improvement over each view and demonstrates solid transferability on standard downstream tasks.", "keywords": "multi-view;learning;sentence;representation", "primary_area": "", "supplementary_material": "", "author": "Shuai Tang;Virginia R. de Sa", "authorids": "shuaitang93@ucsd.edu;desa@ucsd.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntang2019improving,\ntitle={Improving Sentence Representations with Multi-view Frameworks},\nauthor={Shuai Tang and Virginia R. de Sa},\nyear={2019},\nurl={https://openreview.net/forum?id=S1xzyhR9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1xzyhR9Y7", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;5;4", "wc_review": "284;1097;810", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "604;953;528", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 730.3333333333334, 336.6524749484086 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 695.0, 185.05314551951466 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15981913784929972868&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "S1z9ehAqYX", "title": "Shrinkage-based Bias-Variance Trade-off for Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep reinforcement learning has achieved remarkable successes in solving various challenging artificial intelligence tasks. A variety of different algorithms have been introduced and improved towards human-level performance. Although technical advances have been developed for each individual algorithms, there has been strong evidence showing that further substantial improvements can be achieved by properly combining multiple approaches with difference biases and variances. In this work, we propose to use the James-Stein (JS) shrinkage estimator to combine on-policy policy gradient estimators which have low bias but high variance, with low-variance high-bias gradient estimates such as those constructed based on model-based methods or temporally smoothed averaging of historical gradients. 
Empirical results show that our simple shrinkage approach is very effective in practice and substantially improve the sample efficiency of the state-of-the-art on-policy methods on various continuous control tasks.\n", "keywords": "bias-variance trade-off;James-stein estimator;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Yihao Feng;Hao Liu;Jian Peng;Qiang Liu", "authorids": "yihao@cs.utexas.edu;uestcliuhao@gmail.com;jianpeng@illinois.edu;lqiang@cs.utexas.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nfeng2019shrinkagebased,\ntitle={Shrinkage-based Bias-Variance Trade-off for Deep Reinforcement Learning},\nauthor={Yihao Feng and Hao Liu and Jian Peng and Qiang Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=S1z9ehAqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=S1z9ehAqYX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;2;3", "wc_review": "497;249;273", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "317;140;52", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 339.6666666666667, 111.68208848731693 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 169.66666666666666, 110.2008267764912 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zKObOlaKFIgJ:scholar.google.com/&scioq=Shrinkage-based+Bias-Variance+Trade-off+for+Deep+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "PATE-GAN: Generating Synthetic Data with Differential Privacy Guarantees", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/880", "id": "S1zk9iRqF7", "author_site": "James Jordon, Jinsung Yoon, Mihaela Schaar", "tldr": "", "abstract": "Machine learning has the potential to assist many communities in using the large datasets that are becoming more and more available. Unfortunately, much of that potential is not being realized because it would require sharing data in a way that compromises privacy. In this paper, we investigate a method for ensuring (differential) privacy of the generator of the Generative Adversarial Nets (GAN) framework. The resulting model can be used for generating synthetic data on which algorithms can be trained and validated, and on which competitions can be conducted, without compromising the privacy of the original dataset. Our method modifies the Private Aggregation of Teacher Ensembles (PATE) framework and applies it to GANs. Our modified framework (which we call PATE-GAN) allows us to tightly bound the influence of any individual sample on the model, resulting in tight differential privacy guarantees and thus an improved performance over models with the same guarantees. 
We also look at measuring the quality of synthetic data from a new angle; we assert that for the synthetic data to be useful for machine learning researchers, the relative performance of two algorithms (trained and tested) on the synthetic dataset should be the same as their relative performance (when trained and tested) on the original dataset. Our experiments, on various datasets, demonstrate that PATE-GAN consistently outperforms the state-of-the-art method with respect to this and other notions of synthetic data quality.", "keywords": "Synthetic data generation;Differential privacy;Generative adversarial networks;Private Aggregation of Teacher ensembles", "primary_area": "", "supplementary_material": "", "author": "James Jordon;Jinsung Yoon;Mihaela van der Schaar", "authorids": "james.jordon@wolfson.ox.ac.uk;jsyoon0823@gmail.com;mihaela.vanderschaar@eng.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyoon2018pategan,\ntitle={{PATE}-{GAN}: Generating Synthetic Data with Differential Privacy Guarantees},\nauthor={Jinsung Yoon and James Jordon and Mihaela van der Schaar},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1zk9iRqF7},\n}", "github": "[![github](/images/github_icon.svg) vanderschaarlab/mlforhealthlabpub](https://github.com/vanderschaarlab/mlforhealthlabpub/tree/main/alg/pategan)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;4", "wc_review": "316;191;254", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "435;188;321", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 253.66666666666666, 51.03158063613376 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 314.6666666666667, 100.9367238532251 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 912, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9154435771423490892&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1zk9iRqF7", "pdf": "https://openreview.net/pdf?id=S1zk9iRqF7", "email": ";;", "author_num": 3 }, { "id": "S1zlmnA5K7", "title": "Where Off-Policy Deep Reinforcement Learning Fails", "track": "main", "status": "Reject", "tldr": "We describe conditions where off-policy deep reinforcements algorithms fail and present a solution.", "abstract": "This work examines batch reinforcement learning--the task of maximally exploiting a given batch of off-policy data, without further data collection. We demonstrate that due to errors introduced by extrapolation, standard off-policy deep reinforcement learning algorithms, such as DQN and DDPG, are only capable of learning with data correlated to their current policy, making them ineffective for most off-policy applications. We introduce a novel class of off-policy algorithms, batch-constrained reinforcement learning, which restricts the action space to force the agent towards behaving on-policy with respect to a subset of the given data. 
We extend this notion to deep reinforcement learning, and to the best of our knowledge, present the first continuous control deep reinforcement learning algorithm which can learn effectively from uncorrelated off-policy data.", "keywords": "reinforcement learning;off-policy;imitation;batch reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Scott Fujimoto;David Meger;Doina Precup", "authorids": "scott.fujimoto@mail.mcgill.ca;david.meger@mcgill.ca;dprecup@cs.mcgill.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfujimoto2019where,\ntitle={Where Off-Policy Deep Reinforcement Learning Fails},\nauthor={Scott Fujimoto and David Meger and Doina Precup},\nyear={2019},\nurl={https://openreview.net/forum?id=S1zlmnA5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=S1zlmnA5K7", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;4;4", "wc_review": "337;463;566", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "502;240;580", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 455.3333333333333, 93.64590517238624 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 440.6666666666667, 145.4219913065268 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:YnuY39H7KMkJ:scholar.google.com/&scioq=Where+Off-Policy+Deep+Reinforcement+Learning+Fails&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Integer Networks for Data Compression with Latent-Variable Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1017", "id": "S1zz2i0cY7", "author_site": "Johannes Ball\u00e9, Nick Johnston, David Minnen", "tldr": "We train variational models with quantized networks for computational determinism. This enables using them for cross-platform data compression.", "abstract": "We consider the problem of using variational latent-variable models for data compression. For such models to produce a compressed binary sequence, which is the universal data representation in a digital world, the latent representation needs to be subjected to entropy coding. Range coding as an entropy coding technique is optimal, but it can fail catastrophically if the computation of the prior differs even slightly between the sending and the receiving side. Unfortunately, this is a common scenario when floating point math is used and the sender and receiver operate on different hardware or software platforms, as numerical round-off is often platform dependent. 
We propose using integer networks as a universal solution to this problem, and demonstrate that they enable reliable cross-platform encoding and decoding of images using variational models.", "keywords": "data compression;variational models;network quantization", "primary_area": "", "supplementary_material": "", "author": "Johannes Ball\u00e9;Nick Johnston;David Minnen", "authorids": "jballe@google.com;nickj@google.com;dminnen@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nball\u00e92018integer,\ntitle={Integer Networks for Data Compression with Latent-Variable Models},\nauthor={Johannes Ball\u00e9 and Nick Johnston and David Minnen},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=S1zz2i0cY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;3;3", "wc_review": "322;172;384", "wc_reply_reviewers": "0;816;113", "wc_reply_authors": "516;1490;263", "reply_reviewers": "0;2;1", "reply_authors": "1;4;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 292.6666666666667, 88.9993757780856 ], "wc_reply_reviewers_avg": [ 309.6666666666667, 360.99153575795776 ], "wc_reply_authors_avg": [ 756.3333333333334, 528.9627166025556 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 85, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8035159381180309667&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=S1zz2i0cY7", "pdf": "https://openreview.net/pdf?id=S1zz2i0cY7", "email": ";;", "author_num": 3 }, { "id": "SJ4Z72Rctm", "title": "Composing Entropic Policies using Divergence Correction", "track": "main", "status": "Reject", "tldr": "Two new methods for combining entropic policies: maximum entropy generalized policy improvement, and divergence correction.", "abstract": "Deep reinforcement learning (RL) algorithms have made great strides in recent years. An important remaining challenge is the ability to quickly transfer existing skills to novel tasks, and to combine existing skills with newly acquired ones. In domains where tasks are solved by composing skills this capacity holds the promise of dramatically reducing the data requirements of deep RL algorithms, and hence increasing their applicability. Recent work has studied ways of composing behaviors represented in the form of action-value functions. We analyze these methods to highlight their strengths and weaknesses, and point out situations where each of them is susceptible to poor performance. To perform this analysis we extend generalized policy improvement to the max-entropy framework and introduce a method for the practical implementation of successor features in continuous action spaces. Then we propose a novel approach which, in principle, recovers the optimal policy during transfer. This method works by explicitly learning the (discounted, future) divergence between policies. 
We study this approach in the tabular case and propose a scalable variant that is applicable in multi-dimensional continuous action spaces.\nWe compare our approach with existing ones on a range of non-trivial continuous control problems with compositional structure, and demonstrate qualitatively better performance despite not requiring simultaneous observation of all task rewards.", "keywords": "maximum entropy RL;policy composition;deep rl", "primary_area": "", "supplementary_material": "", "author": "Jonathan J Hunt;Andre Barreto;Timothy P Lillicrap;Nicolas Heess", "authorids": "jjhunt@google.com;andrebarreto@google.com;countzero@google.com;heess@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhunt2019composing,\ntitle={Composing Entropic Policies using Divergence Correction},\nauthor={Jonathan J Hunt and Andre Barreto and Timothy P Lillicrap and Nicolas Heess},\nyear={2019},\nurl={https://openreview.net/forum?id=SJ4Z72Rctm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJ4Z72Rctm", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;4;3", "wc_review": "551;261;174", "wc_reply_reviewers": "138;0;38", "wc_reply_authors": "2798;822;189", "reply_reviewers": "1;0;1", "reply_authors": "4;1;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 328.6666666666667, 161.17554267181964 ], "wc_reply_reviewers_avg": [ 58.666666666666664, 58.20271089524573 ], "wc_reply_authors_avg": [ 1269.6666666666667, 1111.1628543507422 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15842209432999245153&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "SJ4vTjRqtQ", "title": "Dynamic Planning Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We introduce Dynamic Planning Networks (DPN), a novel architecture for deep reinforcement learning, that combines model-based and model-free aspects for online planning. Our architecture learns to dynamically construct plans using a learned state-transition model by selecting and traversing between simulated states and actions to maximize valuable information before acting. In contrast to model-free methods, model-based planning lets the agent efficiently test action hypotheses without performing costly trial-and-error in the environment. DPN learns to efficiently form plans by expanding a single action-conditional state transition at a time instead of exhaustively evaluating each action, reducing the required number of state-transitions during planning by up to 96%. We observe various emergent planning patterns used to solve environments, including classical search methods such as breadth-first and depth-first search. Learning To Plan shows improved data efficiency, performance, and generalization to new and unseen domains in comparison to several baselines.", "keywords": "reinforcement learning;planning;deep learning", "primary_area": "", "supplementary_material": "", "author": "Norman L. 
Tasfi;Miriam Capretz", "authorids": "ntasfi@gmail.com;mcapretz@uwo.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntasfi2019dynamic,\ntitle={Dynamic Planning Networks},\nauthor={Norman L. Tasfi and Miriam Capretz},\nyear={2019},\nurl={https://openreview.net/forum?id=SJ4vTjRqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJ4vTjRqtQ", "pdf_size": 0, "rating": "4;6;6", "confidence": "5;5;2", "wc_review": "1107;223;708", "wc_reply_reviewers": "1060;77;0", "wc_reply_authors": "769;402;778", "reply_reviewers": "2;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "wc_review_avg": [ 679.3333333333334, 361.46030979286724 ], "wc_reply_reviewers_avg": [ 379.0, 482.564676148873 ], "wc_reply_authors_avg": [ 649.6666666666666, 175.16531873886706 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.49999999999999994, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13975169613364683515&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "SJG1wjRqFQ", "title": "Discrete Structural Planning for Generating Diverse Translations", "track": "main", "status": "Reject", "tldr": "Learning discrete structural representation to control sentence generation and obtain diverse outputs", "abstract": "Planning is important for humans when producing complex languages, which is a missing part in current language generation models. In this work, we add a planning phase in neural machine translation to control the global sentence structure ahead of translation. Our approach learns discrete structural representations to encode syntactic information of target sentences. During translation, we can either let beam search to choose the structural codes automatically or specify the codes manually. The word generation is then conditioned on the selected discrete codes. Experiments show that the translation performance remains intact by learning the codes to capture pure structural variations. Through structural planning, we are able to control the global sentence structure by manipulating the codes. By evaluating with a proposed structural diversity metric, we found that the sentences sampled using different codes have much higher diversity scores. In qualitative analysis, we demonstrate that the sampled paraphrase translations have drastically different structures. 
", "keywords": "machine translation;syntax;diversity;code learning", "primary_area": "", "supplementary_material": "", "author": "Raphael Shu;Hideki Nakayama", "authorids": "shu@nlab.ci.i.u-tokyo.ac.jp;nakayama@ci.i.u-tokyo.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nshu2019discrete,\ntitle={Discrete Structural Planning for Generating Diverse Translations},\nauthor={Raphael Shu and Hideki Nakayama},\nyear={2019},\nurl={https://openreview.net/forum?id=SJG1wjRqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJG1wjRqFQ", "pdf_size": 0, "rating": "2;4;5", "confidence": "5;5;3", "wc_review": "491;306;460", "wc_reply_reviewers": "0;0;170", "wc_reply_authors": "483;310;682", "reply_reviewers": "0;0;2", "reply_authors": "1;1;3", "rating_avg": [ 3.6666666666666665, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 419.0, 80.89911412782384 ], "wc_reply_reviewers_avg": [ 56.666666666666664, 80.13876853447539 ], "wc_reply_authors_avg": [ 491.6666666666667, 151.9919588516299 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LA6m0I97OrcJ:scholar.google.com/&scioq=Discrete+Structural+Planning+for+Generating+Diverse+Translations&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Value Propagation Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1077", "id": "SJG6G2RqtX", "author_site": "Nantas Nardelli, Gabriel Synnaeve, Zeming Lin, Pushmeet Kohli, Philip Torr, Nicolas Usunier", "tldr": "We present planners based on convnets that are sample-efficient and that generalize to larger instances of navigation and pathfinding problems.", "abstract": "We present Value Propagation (VProp), a set of parameter-efficient differentiable planning modules built on Value Iteration which can successfully be trained using reinforcement learning to solve unseen tasks, has the capability to generalize to larger map sizes, and can learn to navigate in dynamic environments. We show that the modules enable learning to plan when the environment also includes stochastic elements, providing a cost-efficient learning system to build low-level size-invariant planners for a variety of interactive navigation problems. We evaluate on static and dynamic configurations of MazeBase grid-worlds, with randomly generated environments of several different sizes, and on a StarCraft navigation scenario, with more complex dynamics, and pixels as input.", "keywords": "Reinforcement Learning;Value Iteration;Navigation;Convolutional Neural Networks;Learning to plan", "primary_area": "", "supplementary_material": "", "author": "Nantas Nardelli;Gabriel Synnaeve;Zeming Lin;Pushmeet Kohli;Philip H. S. 
Torr;Nicolas Usunier", "authorids": "nantas@robots.ox.ac.uk;gab@fb.com;zlin@fb.com;pushmeet@google.com;philip.torr@eng.ox.ac.uk;usunier@fb.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nnardelli2018value,\ntitle={Value Propagation Networks},\nauthor={Nantas Nardelli and Gabriel Synnaeve and Zeming Lin and Pushmeet Kohli and Philip H. S. Torr and Nicolas Usunier},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJG6G2RqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;3;3", "wc_review": "385;566;267", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "856;596;232", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 406.0, 122.96611999517049 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 561.3333333333334, 255.9235997107123 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9180230208406770561&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SJG6G2RqtX", "pdf": "https://openreview.net/pdf?id=SJG6G2RqtX", "email": ";;;;;", "author_num": 6 }, { "title": "Bayesian Policy Optimization for Model Uncertainty", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/823", "id": "SJGvns0qK7", "author_site": "Gilwoo Lee, Brian Hou, Aditya Mandalika, Jeongseok Lee, Sanjiban Choudhury, Siddhartha Srinivasa", "tldr": "We formulate model uncertainty in Reinforcement Learning as a continuous Bayes-Adaptive Markov Decision Process and present a method for practical and scalable Bayesian policy optimization.", "abstract": "Addressing uncertainty is critical for autonomous systems to robustly adapt to the real world. We formulate the problem of model uncertainty as a continuous Bayes-Adaptive Markov Decision Process (BAMDP), where an agent maintains a posterior distribution over latent model parameters given a history of observations and maximizes its expected long-term reward with respect to this belief distribution. Our algorithm, Bayesian Policy Optimization, builds on recent policy optimization algorithms to learn a universal policy that navigates the exploration-exploitation trade-off to maximize the Bayesian value function. To address challenges from discretizing the continuous latent parameter space, we propose a new policy network architecture that encodes the belief distribution independently from the observable state. Our method significantly outperforms algorithms that address model uncertainty without explicitly reasoning about belief distributions and is competitive with state-of-the-art Partially Observable Markov Decision Process solvers.", "keywords": "Bayes-Adaptive Markov Decision Process;Model Uncertainty;Bayes Policy Optimization", "primary_area": "", "supplementary_material": "", "author": "Gilwoo Lee;Brian Hou;Aditya Mandalika;Jeongseok Lee;Sanjiban Choudhury;Siddhartha S. 
Srinivasa", "authorids": "gilwoo@cs.uw.edu;bhou@cs.uw.edu;adityavk@cs.uw.edu;jslee02@cs.uw.edu;sanjibac@cs.uw.edu;siddh@cs.uw.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nlee2018bayesian,\ntitle={Bayesian Policy Optimization for Model Uncertainty},\nauthor={Gilwoo Lee and Brian Hou and Aditya Mandalika and Jeongseok Lee and Siddhartha S. Srinivasa},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJGvns0qK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7;7", "confidence": "4;3;3;4", "wc_review": "540;179;622;278", "wc_reply_reviewers": "0;0;127;0", "wc_reply_authors": "677;34;426;410", "reply_reviewers": "0;0;1;0", "reply_authors": "1;1;2;1", "rating_avg": [ 6.25, 0.82915619758885 ], "confidence_avg": [ 3.5, 0.5 ], "wc_review_avg": [ 404.75, 182.01562432934156 ], "wc_reply_reviewers_avg": [ 31.75, 54.99261314031185 ], "wc_reply_authors_avg": [ 386.75, 229.54234358828003 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.30151134457776363, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1420595428589112582&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=SJGvns0qK7", "pdf": "https://openreview.net/pdf?id=SJGvns0qK7", "email": ";;;;;", "author_num": 6 }, { "id": "SJGyFiRqK7", "title": "Decoupling Gating from Linearity", "track": "main", "status": "Reject", "tldr": "We propose Gated Linear Unit networks \u2014 a model that performs similarly to ReLU networks on real data while being much easier to analyze theoretically.", "abstract": "The gap between the empirical success of deep learning and the lack of strong theoretical guarantees calls for studying simpler models. By observing that a ReLU neuron is a product of a linear function with a gate (the latter determines whether the neuron is active or not), where both share a jointly trained weight vector, we propose to decouple the two. We introduce GaLU networks \u2014 networks in which each neuron is a product of a Linear Unit, defined by a weight vector which is being trained, with a Gate, defined by a different weight vector which is not being trained. Generally speaking, given a base model and a simpler version of it, the two parameters that determine the quality of the simpler version are whether its practical performance is close enough to the base model and whether it is easier to analyze it theoretically. We show that GaLU networks perform similarly to ReLU networks on standard datasets and we initiate a study of their theoretical properties, demonstrating that they are indeed easier to analyze. 
We believe that further research of GaLU networks may be fruitful for the development of a theory of deep learning.", "keywords": "Artificial Neural Networks;Neural Networks;ReLU;GaLU;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Yonathan Fiat;Eran Malach;Shai Shalev-Shwartz", "authorids": "jonathan.fiat@gmail.com;eran.malach@mail.huji.ac.il;shais@cs.huji.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nfiat2019decoupling,\ntitle={Decoupling Gating from Linearity},\nauthor={Yonathan Fiat and Eran Malach and Shai Shalev-Shwartz},\nyear={2019},\nurl={https://openreview.net/forum?id=SJGyFiRqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJGyFiRqK7", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;5;4", "wc_review": "128;162;303", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 197.66666666666666, 75.76425425107954 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13213669711430561987&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SJLhxnRqFQ", "title": "Adversarially Learned Mixture Model", "track": "main", "status": "Reject", "tldr": "The AMM is the first fully adversarially optimized method to model the conditional dependence between categorical and continuous latent variables.", "abstract": "The Adversarially Learned Mixture Model (AMM) is a generative model for unsupervised or semi-supervised data clustering. The AMM is the first adversarially optimized method to model the conditional dependence between inferred continuous and categorical latent variables. Experiments on the MNIST and SVHN datasets show that the AMM allows for semantic separation of complex data when little or no labeled data is available. The AMM achieves unsupervised clustering error rates of 3.32% and 20.4% on the MNIST and SVHN datasets, respectively. 
A semi-supervised extension of the AMM achieves a classification error rate of 5.60% on the SVHN dataset.", "keywords": "Unsupervised;Semi-supervised;Generative;Adversarial;Clustering", "primary_area": "", "supplementary_material": "", "author": "Andrew Jesson;C\u00e9cile Low-Kam;Tanya Nair;Florian Soudan;Florent Chandelier;Nicolas Chapados", "authorids": "andrew.jesson@imagia.com;cecile.low-kam@imagia.com;tanya.nair@imagia.com;fsoudan21@gmail.com;florent.chandelier@imagia.com;nicolas.chapados@imagia.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\njesson2019adversarially,\ntitle={Adversarially Learned Mixture Model},\nauthor={Andrew Jesson and C\u00e9cile Low-Kam and Tanya Nair and Florian Soudan and Florent Chandelier and Nicolas Chapados},\nyear={2019},\nurl={https://openreview.net/forum?id=SJLhxnRqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJLhxnRqFQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;2;1", "wc_review": "183;78;168", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 143.0, 46.36809247747852 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9449111825230683, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10098759058202036631&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SJMBM2RqKQ", "title": "Uncertainty-guided Lifelong Learning in Bayesian Networks", "track": "main", "status": "Reject", "tldr": "We formulate lifelong learning in the Bayesian-by-Backprop framework, exploiting the parameter uncertainty in two settings: for pruning network parameters and in importance weight based continual learning.", "abstract": "Sequentially learning of tasks arriving in a continuous stream is a complex problem and becomes more challenging when the model has a fixed capacity. Lifelong learning aims at learning new tasks without forgetting previously learnt ones as well as freeing up capacity for learning future tasks. We argue that identifying the most influential parameters in a representation learned for one task plays a critical role to decide on \\textit{what to remember} for continual learning. Motivated by the statistically-grounded uncertainty defined in Bayesian neural networks, we propose to formulate a Bayesian lifelong learning framework, \\texttt{BLLL}, that addresses two lifelong learning directions: 1) completely eliminating catastrophic forgetting using weight pruning, where a hard selection mask freezes the most certain parameters (\\texttt{BLLL-PRN}) and 2) reducing catastrophic forgetting by adaptively regularizing the learning rates using the parameter uncertainty (\\texttt{BLLL-REG}). While \\texttt{BLLL-PRN} is by definition a zero-forgetting guaranteed method, \\texttt{BLLL-REG}, despite exhibiting some small forgetting, is a task-agnostic lifelong learner, which does not require to know when a new task arrives. 
This feature makes \\texttt{BLLL-REG} a more convenient candidate for applications such as robotics or on-line learning in which such information is not available. We evaluate our Bayesian learning approaches extensively on diverse object classification datasets in short and long sequences of tasks and perform superior or marginally better than the existing approaches.", "keywords": "lifelong learning;continual learning;sequential learning", "primary_area": "", "supplementary_material": "", "author": "Sayna Ebrahimi;Mohamed Elhoseiny;Trevor Darrell;Marcus Rohrbach", "authorids": "sayna@eecs.berkeley.edu;elhoseiny@fb.com;trevor@eecs.berkeley.edu;maroffm@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nebrahimi2019uncertaintyguided,\ntitle={Uncertainty-guided Lifelong Learning in Bayesian Networks},\nauthor={Sayna Ebrahimi and Mohamed Elhoseiny and Trevor Darrell and Marcus Rohrbach},\nyear={2019},\nurl={https://openreview.net/forum?id=SJMBM2RqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJMBM2RqKQ", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "wc_review": "255;435;368", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "138;235;286", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 352.6666666666667, 74.28025010428784 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 219.66666666666666, 61.38584708401621 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4250089932365204083&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJMO2iCct7", "title": "A NOVEL VARIATIONAL FAMILY FOR HIDDEN NON-LINEAR MARKOV MODELS", "track": "main", "status": "Reject", "tldr": "We propose a new variational inference algorithm for time series and a novel variational family endowed with nonlinear dynamics.", "abstract": "Latent variable models have been widely applied for the analysis and visualization of large datasets. In the case of sequential data, closed-form inference is possible when the transition and observation functions are linear. However, approximate inference techniques are usually necessary when dealing with nonlinear evolution and observations. Here, we propose a novel variational inference framework for the explicit modeling of time series, Variational Inference for Nonlinear Dynamics (VIND), that is able to uncover nonlinear observation and latent dynamics from sequential data. The framework includes a structured approximate posterior, and an algorithm that relies on the fixed-point iteration method to find the best estimate for latent trajectories. 
We apply the method to several datasets and show that it is able to accurately infer the underlying dynamics of these systems, in some cases substantially outperforming state-of-the-art methods.", "keywords": "variational inference;time series;nonlinear dynamics;neuroscience", "primary_area": "", "supplementary_material": "", "author": "Daniel Hernandez Diaz;Antonio Khalil Moretti;Ziqiang Wei;Shreya Saxena;John Cunningham;Liam Paninski", "authorids": "dh2832@columbia.edu;amoretti@cs.columbia.edu;weiz@janelia.hhmi.org;ss5513@columbia.edu;jpcunni@gmail.com;liam.paninski@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ndiaz2019a,\ntitle={A {NOVEL} {VARIATIONAL} {FAMILY} {FOR} {HIDDEN} {NON}-{LINEAR} {MARKOV} {MODELS}},\nauthor={Daniel Hernandez Diaz and Antonio Khalil Moretti and Ziqiang Wei and Shreya Saxena and John Cunningham and Liam Paninski},\nyear={2019},\nurl={https://openreview.net/forum?id=SJMO2iCct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJMO2iCct7", "pdf_size": 0, "rating": "5;6;8", "confidence": "3;3;5", "wc_review": "668;249;635", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1088;544;1348", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 517.3333333333334, 190.21800358769642 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 993.3333333333334, 334.9878936452613 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hHyTu_h-93QJ:scholar.google.com/&scioq=A+NOVEL+VARIATIONAL+FAMILY+FOR+HIDDEN+NON-LINEAR+MARKOV+MODELS&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJMZRsC9Y7", "title": "A NON-LINEAR THEORY FOR SENTENCE EMBEDDING", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper revisits the Random Walk model for sentence embedding in the context of non-extensive statistics. We propose a non-extensive algebra to compute the discourse vector. We argue that by doing so we are taking into account high non-linearity in the semantic space. Furthermore, we show that by considering a non-extensive algebra, the compounding effect of the vector length is mitigated. Overall, we show that the proposed model leads to good sentence embedding. 
We evaluate the embedding method on textual similarity tasks.", "keywords": "sentence embedding;generative models", "primary_area": "", "supplementary_material": "", "author": "Hichem Mezaoui;Isar Nejadgholi", "authorids": "hichem@imrsv.ai;isar@imrsv.ai", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmezaoui2019a,\ntitle={A {NON}-{LINEAR} {THEORY} {FOR} {SENTENCE} {EMBEDDING}},\nauthor={Hichem Mezaoui and Isar Nejadgholi},\nyear={2019},\nurl={https://openreview.net/forum?id=SJMZRsC9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJMZRsC9Y7", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;3;3", "wc_review": "285;456;110", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 283.6666666666667, 141.25705488773139 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:htwbPDICW0MJ:scholar.google.com/&scioq=A+NON-LINEAR+THEORY+FOR+SENTENCE+EMBEDDING&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJMeTo09YQ", "title": "Guided Exploration in Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "introduces a guided action exploration mechanism that drastically speed up RL training", "abstract": "This paper proposes a new method to drastically speed up deep reinforcement learning (deep RL) training for problems that have the property of \\textit{state-action permissibility} (SAP). Two types of permissibility are defined under SAP. The first type says that after an action $a_t$ is performed in a state $s_t$ and the agent reaches the new state $s_{t+1}$, the agent can decide whether the action $a_t$ is \\textit{permissible} or \\textit{not permissible} in state $s_t$. The second type says that even without performing the action $a_t$ in state $s_t$, the agent can already decide whether $a_t$ is permissible or not in $s_t$. An action is not permissible in a state if the action can never lead to an optimal solution and thus should not be tried. We incorporate the proposed SAP property into two state-of-the-art deep RL algorithms to guide their state-action exploration. 
Results show that the SAP guidance can markedly speed up training.", "keywords": "deep reinforcement learning;guided exploration;RL training speed up", "primary_area": "", "supplementary_material": "", "author": "Sahisnu Mazumder;Bing Liu;Shuai Wang;Yingxuan Zhu;Xiaotian Yin;Lifeng Liu;Jian Li;Yongbing Huang", "authorids": "sahisnumazumder@gmail.com;liub@cs.uic.edu;gshuaishuai@gmail.com;yingxuan.zhu@huawei.com;xiaotian.yin@huawei.com;lifeng.liu1@huawei.com;jian.li1@huawei.com;huangyongbing@huawei.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nmazumder2019guided,\ntitle={Guided Exploration in Deep Reinforcement Learning},\nauthor={Sahisnu Mazumder and Bing Liu and Shuai Wang and Yingxuan Zhu and Xiaotian Yin and Lifeng Liu and Jian Li and Yongbing Huang},\nyear={2019},\nurl={https://openreview.net/forum?id=SJMeTo09YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJMeTo09YQ", "pdf_size": 0, "rating": "3;5;7", "confidence": "3;4;5", "wc_review": "428;388;317", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "878;810;543", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 377.6666666666667, 45.900859348043674 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 743.6666666666666, 144.5829250714697 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6751610648200819232&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SJMnG2C9YX", "title": "Complementary-label learning for arbitrary losses and models", "track": "main", "status": "Reject", "tldr": "From now on, you can train ResNet and DenseNet, even if no class label given for training is correct!", "abstract": "In contrast to the standard classification paradigm where the true (or possibly noisy) class is given to each training pattern, complementary-label learning only uses training patterns each equipped with a complementary label. This only specifies one of the classes that the pattern does not belong to. The seminal paper on complementary-label learning proposed an unbiased estimator of the classification risk that can be computed only from complementarily labeled data. How- ever, it required a restrictive condition on the loss functions, making it impossible to use popular losses such as the softmax cross-entropy loss. Recently, another formulation with the softmax cross-entropy loss was proposed with consistency guarantee. However, this formulation does not explicitly involve a risk estimator. Thus model/hyper-parameter selection is not possible by cross-validation\u2014 we may need additional ordinarily labeled data for validation purposes, which is not available in the current setup. In this paper, we give a novel general framework of complementary-label learning, and derive an unbiased risk estimator for arbitrary losses and models. 
We further improve the risk estimator by non-negative correction and demonstrate its superiority through experiments.", "keywords": "complementary labels;weak supervision", "primary_area": "", "supplementary_material": "", "author": "Takashi Ishida;Gang Niu;Aditya Krishna Menon;Masashi Sugiyama", "authorids": "ishida@ms.k.u-tokyo.ac.jp;gang.niu@riken.jp;aditya.menon@anu.edu.au;sugi@k.u-tokyo.ac.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nishida2019complementarylabel,\ntitle={Complementary-label learning for arbitrary losses and models},\nauthor={Takashi Ishida and Gang Niu and Aditya Krishna Menon and Masashi Sugiyama},\nyear={2019},\nurl={https://openreview.net/forum?id=SJMnG2C9YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJMnG2C9YX", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;4", "wc_review": "501;359;298", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "446;191;56", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 386.0, 85.04508608183465 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 231.0, 161.7096162879623 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4663196775584030091&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "SJNRHiAcYX", "title": "Boosting Trust Region Policy Optimization by Normalizing flows Policy", "track": "main", "status": "Reject", "tldr": "Normalizing flows policy to improve TRPO and ACKTR", "abstract": "We propose to improve trust region policy search with normalizing flows policy. We illustrate that when the trust region is constructed by KL divergence constraint, normalizing flows policy can generate samples far from the 'center' of the previous policy iterate, which potentially enables better exploration and helps avoid bad local optima. 
We show that normalizing flows policy significantly improves upon factorized Gaussian policy baseline, with both TRPO and ACKTR, especially on tasks with complex dynamics such as Humanoid.", "keywords": "Reinforcement Learning;Normalizing Flows", "primary_area": "", "supplementary_material": "", "author": "Yunhao Tang;Shipra Agrawal", "authorids": "yt2541@columbia.edu;sa3305@columbia.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntang2019boosting,\ntitle={Boosting Trust Region Policy Optimization by Normalizing flows Policy},\nauthor={Yunhao Tang and Shipra Agrawal},\nyear={2019},\nurl={https://openreview.net/forum?id=SJNRHiAcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJNRHiAcYX", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;4", "wc_review": "518;314;237", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "781;602;368", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 356.3333333333333, 118.55893986630541 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 583.6666666666666, 169.1041756498704 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11944935319810581935&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "SJNceh0cFX", "title": "A RECURRENT NEURAL CASCADE-BASED MODEL FOR CONTINUOUS-TIME DIFFUSION PROCESS", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many works have been proposed in the literature to capture the dynamics of diffusion in networks. While some of them define graphical markovian models to extract temporal relationships between node infections in networks, others consider diffusion episodes as sequences of infections via recurrent neural models. In this paper we propose a model at the crossroads of these two extremes, which embeds the history of diffusion in infected nodes as hidden continuous states. Depending on the trajectory followed by the content before reaching a given node, the distribution of influence probabilities may vary. However, content trajectories are usually hidden in the data, which induces challenging learning problems. We propose a topological recurrent neural model which exhibits good experimental performances for diffusion modelling and prediction. 
", "keywords": "Information Diffusion;Recurrent Neural Network;Black Box Inference", "primary_area": "", "supplementary_material": "", "author": "Sylvain Lamprier", "authorids": "sylvain.lamprier@lip6.fr", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nlamprier2019a,\ntitle={A {RECURRENT} {NEURAL} {CASCADE}-{BASED} {MODEL} {FOR} {CONTINUOUS}-{TIME} {DIFFUSION} {PROCESS}},\nauthor={Sylvain Lamprier},\nyear={2019},\nurl={https://openreview.net/forum?id=SJNceh0cFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJNceh0cFX", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;4;4", "wc_review": "267;365;235", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "834;493;561", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 289.0, 55.3052137385497 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 629.3333333333334, 147.35972614282673 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yPv6qE3XB1QJ:scholar.google.com/&scioq=A+RECURRENT+NEURAL+CASCADE-BASED+MODEL+FOR+CONTINUOUS-TIME+DIFFUSION+PROCESS&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Variational Bayesian Phylogenetic Inference", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1060", "id": "SJVmjjR9FX", "author_site": "Cheng Zhang, Frederick A Matsen", "tldr": "The first variational Bayes formulation of phylogenetic inference, a challenging inference problem over structures with intertwined discrete and continuous components", "abstract": "Bayesian phylogenetic inference is currently done via Markov chain Monte Carlo with simple mechanisms for proposing new states, which hinders exploration efficiency and often requires long runs to deliver accurate posterior estimates. In this paper we present an alternative approach: a variational framework for Bayesian phylogenetic analysis. We approximate the true posterior using an expressive graphical model for tree distributions, called a subsplit Bayesian network, together with appropriate branch length distributions. We train the variational approximation via stochastic gradient ascent and adopt multi-sample based gradient estimators for different latent variables separately to handle the composite latent space of phylogenetic models. We show that our structured variational approximations are flexible enough to provide comparable posterior estimation to MCMC, while requiring less computation due to a more efficient tree exploration mechanism enabled by variational inference. Moreover, the variational approximations can be readily used for further statistical analysis such as marginal likelihood estimation for model comparison via importance sampling. Experiments on both synthetic data and real data Bayesian phylogenetic inference problems demonstrate the effectiveness and efficiency of our methods.", "keywords": "Bayesian phylogenetic inference;Variational inference;Subsplit Bayesian networks", "primary_area": "", "supplementary_material": "", "author": "Cheng Zhang;Frederick A. 
Matsen IV", "authorids": "zc.rabbit@gmail.com;matsen@fredhutch.org", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nzhang2018variational,\ntitle={Variational Bayesian Phylogenetic Inference},\nauthor={Cheng Zhang and Frederick A. Matsen IV},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJVmjjR9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;3;1", "wc_review": "243;332;166", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "479;894;446", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 2.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 247.0, 67.82821438506743 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 606.3333333333334, 203.85670348447107 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 66, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16358551768743877476&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SJVmjjR9FX", "pdf": "https://openreview.net/pdf?id=SJVmjjR9FX", "email": ";", "author_num": 2 }, { "id": "SJe2so0qF7", "title": "Learning data-derived privacy preserving representations from information metrics", "track": "main", "status": "Reject", "tldr": "Learning privacy-preserving transformations from data. A collaborative approach", "abstract": "It is clear that users should own and control their data and privacy. Utility providers are also becoming more interested in guaranteeing data privacy. Therefore, users and providers can and should collaborate in privacy protecting challenges, and this paper addresses this new paradigm. We propose a framework where the user controls what characteristics of the data they want to share (utility) and what they want to keep private (secret), without necessarily asking the utility provider to change its existing machine learning algorithms. We first analyze the space of privacy-preserving representations and derive natural information-theoretic bounds on the utility-privacy trade-off when disclosing a sanitized version of the data X. We present explicit learning architectures to learn privacy-preserving representations that approach this bound in a data-driven fashion. We describe important use-case scenarios where the utility providers are willing to collaborate with the sanitization process. We study space-preserving transformations where the utility provider can use the same algorithm on original and sanitized data, a critical and novel attribute to help service providers accommodate varying privacy requirements with a single set of utility algorithms. 
We illustrate this framework through the implementation of three use cases; subject-within-subject, where we tackle the problem of having a face identity detector that works only on a consenting subset of users, an important application, for example, for mobile devices activated by face recognition; gender-and-subject, where we preserve facial verification while hiding the gender attribute for users who choose to do so; and emotion-and-gender, where we hide independent variables, as is the case of hiding gender while preserving emotion detection.", "keywords": "Machine learning;privacy;adversarial training;information theory;data-driven privacy", "primary_area": "", "supplementary_material": "", "author": "Martin Bertran;Natalia Martinez;Afroditi Papadaki;Qiang Qiu;Miguel Rodrigues;Guillermo Sapiro", "authorids": "martin.bertran@duke.edu;natalia.martinez@duke.edu;a.papadaki.17@ucl.ac.uk;qiuqiang@gmail.com;m.rodrigues@ucl.ac.uk;guillermo.sapiro@duke.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nbertran2019learning,\ntitle={Learning data-derived privacy preserving representations from information metrics},\nauthor={Martin Bertran and Natalia Martinez and Afroditi Papadaki and Qiang Qiu and Miguel Rodrigues and Guillermo Sapiro},\nyear={2019},\nurl={https://openreview.net/forum?id=SJe2so0qF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJe2so0qF7", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;4", "wc_review": "183;1222;519", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "577;1395;1179", "reply_reviewers": "0;0;0", "reply_authors": "1;3;2", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 641.3333333333334, 432.90055311686643 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1050.3333333333333, 346.11879784580066 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15337914464909235577&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "LEARNING FACTORIZED REPRESENTATIONS FOR OPEN-SET DOMAIN ADAPTATION", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1036", "id": "SJe3HiC5KX", "author_site": "Mahsa Baktashmotlagh, Masoud Faraki, Tom Drummond, Mathieu Salzmann", "tldr": "", "abstract": "Domain adaptation for visual recognition has undergone great progress in the past few years. Nevertheless, most existing methods work in the so-called closed-set scenario, assuming that the classes depicted by the target images are exactly the same as those of the source domain. In this paper, we tackle the more challenging, yet more realistic case of open-set domain adaptation, where new, unknown classes can be present in the target data. While, in the unsupervised scenario, one cannot expect to be able to identify each specific new class, we aim to automatically detect which samples belong to these new classes and discard them from the recognition process. 
To this end, we rely on the intuition that the source and target samples depicting the known classes can be generated by a shared subspace, whereas the target samples from unknown classes come from a different, private subspace. We therefore introduce a framework that factorizes the data into shared and private parts, while encouraging the shared representation to be discriminative. Our experiments on standard benchmarks evidence that our approach significantly outperforms the state-of-the-art in open-set domain adaptation.", "keywords": "Open Set Domain Adaptation", "primary_area": "", "supplementary_material": "", "author": "Mahsa Baktashmotlagh;Masoud Faraki;Tom Drummond;Mathieu Salzmann", "authorids": "m.baktashmotlagh@qut.edu.au;masoud.faraki@monash.edu;tom.drummond@monash.edu;mathieu.salzmann@epfl.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbaktashmotlagh2018learning,\ntitle={{LEARNING} {FACTORIZED} {REPRESENTATIONS} {FOR} {OPEN}-{SET} {DOMAIN} {ADAPTATION}},\nauthor={Mahsa Baktashmotlagh and Masoud Faraki and Tom Drummond and Mathieu Salzmann},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJe3HiC5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;3", "wc_review": "825;255;475", "wc_reply_reviewers": "36;0;402", "wc_reply_authors": "692;159;731", "reply_reviewers": "1;0;2", "reply_authors": "2;1;3", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 518.3333333333334, 234.71022323045258 ], "wc_reply_reviewers_avg": [ 146.0, 181.61497735594386 ], "wc_reply_authors_avg": [ 527.3333333333334, 260.9371997669597 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 74, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=991348150001731721&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=SJe3HiC5KX", "pdf": "https://openreview.net/pdf?id=SJe3HiC5KX", "email": ";;;", "author_num": 4 }, { "id": "SJe8DsR9tm", "title": "Dynamic Early Terminating of Multiply Accumulate Operations for Saving Computation Cost in Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning has been attracting enormous attention from academia as well as industry due to its great success in many artificial intelligence applications. As more applications are developed, the need for implementing a complex neural network model on an energy-limited edge device becomes more critical. To this end, this paper proposes a new optimization method to reduce the computation efforts of convolutional neural networks. The method takes advantage of the fact that some convolutional operations are actually wasteful since their outputs are pruned by the following activation or pooling layers. Basically, a convolutional filter conducts a series of multiply-accumulate (MAC) operations. We propose to set a checkpoint in the MAC process to determine whether a filter could terminate early based on the intermediate result. 
Furthermore, a fine-tuning process is conducted to recover the accuracy drop due to the applied checkpoints. The experimental results show that the proposed method can save approximately 50% MAC operations with less than 1% accuracy drop for CIFAR-10 example model and Network in Network on the CIFAR-10 and CIFAR-100 datasets. Additionally, compared with the state-of- the-art method, the proposed method is more effective on the CIFAR-10 dataset and is competitive on the CIFAR-100 dataset.", "keywords": "Convolutional neural network;Early terminating;Dynamic model optimization", "primary_area": "", "supplementary_material": "", "author": "Yu-Yi Su;Yung-Chih Chen;Xiang-Xiu Wu;Shih-Chieh Chang", "authorids": "wwball34@gmail.com;ycchen.phi@gmail.com;jaubau999@gmail.com;scchang@cs.nthu.edu.tw", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsu2019dynamic,\ntitle={Dynamic Early Terminating of Multiply Accumulate Operations for Saving Computation Cost in Convolutional Neural Networks},\nauthor={Yu-Yi Su and Yung-Chih Chen and Xiang-Xiu Wu and Shih-Chieh Chang},\nyear={2019},\nurl={https://openreview.net/forum?id=SJe8DsR9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJe8DsR9tm", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;3;5", "wc_review": "315;349;555", "wc_reply_reviewers": "103;0;0", "wc_reply_authors": "1117;631;1884", "reply_reviewers": "1;0;0", "reply_authors": "2;1;3", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 406.3333333333333, 106.03563342365413 ], "wc_reply_reviewers_avg": [ 34.333333333333336, 48.554665641476255 ], "wc_reply_authors_avg": [ 1210.6666666666667, 515.8050880796824 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:srJNIB5ZzuEJ:scholar.google.com/&scioq=Dynamic+Early+Terminating+of+Multiply+Accumulate+Operations+for+Saving+Computation+Cost+in+Convolutional+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "On the Universal Approximability and Complexity Bounds of Quantized ReLU Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/699", "id": "SJe9rh0cFX", "author_site": "Yukun Ding, Jinglan Liu, Jinjun Xiong, Yiyu Shi", "tldr": "This paper proves the universal approximability of quantized ReLU neural networks and puts forward the complexity bound given arbitrary error.", "abstract": "Compression is a key step to deploy large neural networks on resource-constrained platforms. As a popular compression technique, quantization constrains the number of distinct weight values and thus reducing the number of bits required to represent and store each weight. In this paper, we study the representation power of quantized neural networks. First, we prove the universal approximability of quantized ReLU networks on a wide class of functions. 
Then we provide upper bounds on the number of weights and the memory size for a given approximation error bound and the bit-width of weights for function-independent and function-dependent structures. Our results reveal that, to attain an approximation error bound of $\\epsilon$, the number of weights needed by a quantized network is no more than $\\mathcal{O}\\left(\\log^5(1/\\epsilon)\\right)$ times that of an unquantized network. This overhead is of much lower order than the lower bound of the number of weights needed for the error bound, supporting the empirical success of various quantization techniques. To the best of our knowledge, this is the first in-depth study on the complexity bounds of quantized neural networks.", "keywords": "Quantized Neural Networks;Universial Approximability;Complexity Bounds;Optimal Bit-width", "primary_area": "", "supplementary_material": "", "author": "Yukun Ding;Jinglan Liu;Jinjun Xiong;Yiyu Shi", "authorids": "yding5@nd.edu;jliu16@nd.edu;jinjun@us.ibm.com;yshi4@nd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nding2018on,\ntitle={On the Universal Approximability and Complexity Bounds of Quantized Re{LU} Neural Networks},\nauthor={Yukun Ding and Jinglan Liu and Jinjun Xiong and Yiyu Shi},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJe9rh0cFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;3;3", "wc_review": "211;240;117", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "187;593;278", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 189.33333333333334, 52.49973544906891 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 352.6666666666667, 173.95465564974748 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9956095606815471877&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SJe9rh0cFX", "pdf": "https://openreview.net/pdf?id=SJe9rh0cFX", "email": ";;;", "author_num": 4 }, { "id": "SJeFNoRcFQ", "title": "Traditional and Heavy Tailed Self Regularization in Neural Network Models", "track": "main", "status": "Reject", "tldr": "See the abstract. (For the revision, the paper is identical, except for a 59 page Supplementary Material, which can serve as a stand-along technical report version of the paper.)", "abstract": "Random Matrix Theory (RMT) is applied to analyze the weight matrices of Deep Neural Networks (DNNs), including both production quality, pre-trained models such as AlexNet and Inception, and smaller models trained from scratch, such as LeNet5 and a miniature-AlexNet. Empirical and theoretical results clearly indicate that the empirical spectral density (ESD) of DNN layer matrices displays signatures of traditionally-regularized statistical models, even in the absence of exogenously specifying traditional forms of regularization, such as Dropout or Weight Norm constraints. 
Building on recent results in RMT, most notably its extension to Universality classes of Heavy-Tailed matrices, we develop a theory to identify 5+1 Phases of Training, corresponding to increasing amounts of Implicit Self-Regularization. For smaller and/or older DNNs, this Implicit Self-Regularization is like traditional Tikhonov regularization, in that there is a \"size scale\" separating signal from noise. For state-of-the-art DNNs, however, we identify a novel form of Heavy-Tailed Self-Regularization, similar to the self-organization seen in the statistical physics of disordered systems. This implicit Self-Regularization can depend strongly on the many knobs of the training process. By exploiting the generalization gap phenomena, we demonstrate that we can cause a small model to exhibit all 5+1 phases of training simply by changing the batch size.", "keywords": "statistical mechanics;self-regularization;random matrix;glassy behavior;heavy-tailed", "primary_area": "", "supplementary_material": "", "author": "Charles H. Martin;Michael W. Mahoney", "authorids": "charles@calculationconsulting.com;mmahoney@stat.berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmartin2019traditional,\ntitle={Traditional and Heavy Tailed Self Regularization in Neural Network Models},\nauthor={Charles H. Martin and Michael W. Mahoney},\nyear={2019},\nurl={https://openreview.net/forum?id=SJeFNoRcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=SJeFNoRcFQ", "pdf_size": 0, "rating": "4;4;6", "confidence": "1;4;5", "wc_review": "173;221;154", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 1.699673171197595 ], "wc_review_avg": [ 182.66666666666666, 28.193773938387338 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.6933752452815362, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2495666934042672938&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SJeI5i0cYQ", "title": "EXPLORING DEEP LEARNING USING INFORMATION THEORY TOOLS AND PATCH ORDERING", "track": "main", "status": "Withdraw", "tldr": "Develop new techniques that rely on patch reordering to enable detailed analysis of data-set relationship to training and generalization performances.", "abstract": "We present a framework for automatically ordering image patches that enables in-depth analysis of dataset relationship to learnability of a classification task using convolutional neural network. An image patch is a group of pixels residing in a continuous area contained in the sample. Our preliminary experimental results show that an informed smart shuffling of patches at a sample level can expedite training by exposing important features at early stages of training. In addition, we conduct systematic experiments and provide evidence that CNN\u2019s generalization capabilities do not correlate with human recognizable features present in training samples. 
We utilized the framework not only to show that spatial locality of features within samples do not correlate with generalization, but also to expedite convergence while achieving similar generalization performance. Using multiple network architectures and datasets, we show that ordering image regions using mutual information measure between adjacent patches, enables CNNs to converge in a third of the total steps required to train the same network without patch ordering.", "keywords": "CNN;Deep Learning;Feature Extraction;Patch Ordering;Convergence;Image Classification", "primary_area": "", "supplementary_material": "", "author": "Henok Ghebrechristos;Gita Alaghband", "authorids": "henok.ghebrechristos@ucdenver.edu;gita.alaghband@ucdenver.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=SJeI5i0cYQ", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 2, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10318417928821064697&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "SJeT_oRcY7", "title": "Localized random projections challenge benchmarks for bio-plausible deep learning", "track": "main", "status": "Reject", "tldr": "Spiking networks using localized random projections and STDP challenge current MNIST benchmark models for bio-plausible deep learning", "abstract": "Similar to models of brain-like computation, artificial deep neural networks rely\non distributed coding, parallel processing and plastic synaptic weights. Training\ndeep neural networks with the error-backpropagation algorithm, however, is\nconsidered bio-implausible. An appealing alternative to training deep neural networks\nis to use one or a few hidden layers with fixed random weights or trained\nwith an unsupervised, local learning rule and train a single readout layer with a\nsupervised, local learning rule. We find that a network of leaky-integrate-andfire\nneurons with fixed random, localized receptive fields in the hidden layer and\nspike timing dependent plasticity to train the readout layer achieves 98.1% test\naccuracy on MNIST, which is close to the optimal result achievable with error-backpropagation\nin non-convolutional networks of rate neurons with one hidden\nlayer. To support the design choices of the spiking network, we systematically\ncompare the classification performance of rate networks with a single hidden\nlayer, where the weights of this layer are either random and fixed, trained with\nunsupervised Principal Component Analysis or Sparse Coding, or trained with\nthe backpropagation algorithm. 
This comparison revealed, first, that unsupervised\nlearning does not lead to better performance than fixed random projections for\nlarge hidden layers on digit classification (MNIST) and object recognition (CIFAR10);\nsecond, networks with random projections and localized receptive fields\nperform significantly better than networks with all-to-all connectivity and almost\nreach the performance of networks trained with the backpropagation algorithm.\nThe performance of these simple random projection networks is comparable to\nmost current models of bio-plausible deep learning and thus provides an interesting\nbenchmark for future approaches.", "keywords": "deep learning;bio-plausibility;random projections;spiking networks;unsupervised learning;MNIST;spike timing dependent plasticity", "primary_area": "", "supplementary_material": "", "author": "Bernd Illing;Wulfram Gerstner;Johanni Brea", "authorids": "bernd.illing@epfl.ch;wulfram.gerstner@epfl.ch;johanni.brea@epfl.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nilling2019localized,\ntitle={Localized random projections challenge benchmarks for bio-plausible deep learning},\nauthor={Bernd Illing and Wulfram Gerstner and Johanni Brea},\nyear={2019},\nurl={https://openreview.net/forum?id=SJeT_oRcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJeT_oRcY7", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;4;3", "wc_review": "993;310;516", "wc_reply_reviewers": "305;0;0", "wc_reply_authors": "643;427;320", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 606.3333333333334, 286.0563270095983 ], "wc_reply_reviewers_avg": [ 101.66666666666667, 143.77837884126464 ], "wc_reply_authors_avg": [ 463.3333333333333, 134.3436720587249 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3573756817297444160&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJeUAj05tQ", "title": "DADAM: A consensus-based distributed adaptive gradient method for online optimization", "track": "main", "status": "Reject", "tldr": "", "abstract": "Online and stochastic optimization methods such as SGD, ADAGRAD and ADAM are key algorithms in solving large-scale machine learning problems including deep learning. A number of schemes that are based on communications of nodes with a central server have been recently proposed in the literature to parallelize them. A bottleneck of such centralized algorithms lies on the high communication cost incurred by the central node. In this paper, we present a new consensus-based distributed adaptive moment estimation method (DADAM) for online optimization over a decentralized network that enables data parallelization, as well as decentralized computation. Such a framework note only can be extremely useful for learning agents with access to only local data in a communication constrained environment, but as shown in this work also outperform centralized adaptive algorithms such as ADAM for certain realistic classes of loss functions. 
We analyze the convergence properties of the proposed algorithm and provide a \\textit{dynamic regret} bound on the convergence rate of adaptive moment estimation methods in both stochastic and deterministic settings. Empirical results demonstrate that DADAM works well in practice and compares favorably to competing online optimization methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Parvin Nazari;Davoud Ataee Tarzanagh;George Michailidis", "authorids": "p_nazari@aut.ac.ir;tarzanagh@ufl.edu;gmichail@ufl.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnazari2019dadam,\ntitle={{DADAM}: A consensus-based distributed adaptive gradient method for online optimization},\nauthor={Parvin Nazari and Davoud Ataee Tarzanagh and George Michailidis},\nyear={2019},\nurl={https://openreview.net/forum?id=SJeUAj05tQ},\n}", "github": "[![github](/images/github_icon.svg) Tarzanagh/DADAM](https://github.com/Tarzanagh/DADAM)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJeUAj05tQ", "pdf_size": 0, "rating": "4;6;8", "confidence": "4;4;3", "wc_review": "700;221;162", "wc_reply_reviewers": "454;0;0", "wc_reply_authors": "1656;487;357", "reply_reviewers": "1;0;0", "reply_authors": "4;1;1", "rating_avg": [ 6.0, 1.632993161855452 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 361.0, 240.91630635278025 ], "wc_reply_reviewers_avg": [ 151.33333333333334, 214.01765243912837 ], "wc_reply_authors_avg": [ 833.3333333333334, 584.129171407223 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16927357580583565179&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "title": "Learning Localized Generative Models for 3D Point Clouds via Graph Convolution", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/721", "id": "SJeXSo09FQ", "author_site": "Diego Valsesia, Giulia Fracastoro, Enrico Magli", "tldr": "A GAN using graph convolution operations with dynamically computed graphs from hidden features", "abstract": "Point clouds are an important type of geometric data and have widespread use in computer graphics and vision. However, learning representations for point clouds is particularly challenging due to their nature as being an unordered collection of points irregularly distributed in 3D space. Graph convolution, a generalization of the convolution operation for data defined over graphs, has been recently shown to be very successful at extracting localized features from point clouds in supervised or semi-supervised tasks such as classification or segmentation. This paper studies the unsupervised problem of a generative model exploiting graph convolution. We focus on the generator of a GAN and define methods for graph convolution when the graph is not known in advance as it is the very output of the generator. The proposed architecture learns to generate localized features that approximate graph embeddings of the output geometry. 
We also study the problem of defining an upsampling layer in the graph-convolutional generator, such that it learns to exploit a self-similarity prior on the data distribution to sample more effectively.", "keywords": "GAN;graph convolution;point clouds", "primary_area": "", "supplementary_material": "", "author": "Diego Valsesia;Giulia Fracastoro;Enrico Magli", "authorids": "diego.valsesia@polito.it;giulia.fracastoro@polito.it;enrico.magli@polito.it", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nvalsesia2018learning,\ntitle={Learning Localized Generative Models for 3D Point Clouds via Graph Convolution},\nauthor={Diego Valsesia and Giulia Fracastoro and Enrico Magli},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJeXSo09FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;9", "confidence": "4;3;3", "wc_review": "332;241;290", "wc_reply_reviewers": "254;9;0", "wc_reply_authors": "1009;660;292", "reply_reviewers": "2;1;0", "reply_authors": "2;1;1", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 287.6666666666667, 37.1872140511882 ], "wc_reply_reviewers_avg": [ 87.66666666666667, 117.67280437250099 ], "wc_reply_authors_avg": [ 653.6666666666666, 292.7482801923105 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 213, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9880881475010180729&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SJeXSo09FQ", "pdf": "https://openreview.net/pdf?id=SJeXSo09FQ", "email": ";;", "author_num": 3 }, { "id": "SJea5oAqK7", "title": "PASS: Phased Attentive State Space Modeling of Disease Progression Trajectories", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Disease progression models are instrumental in predicting individual-level health\ntrajectories and understanding disease dynamics. Existing models are capable\nof providing either accurate predictions of patients\u2019 prognoses or clinically interpretable\nrepresentations of disease pathophysiology, but not both. In this paper,\nwe develop the phased attentive state space (PASS) model of disease progression,\na deep probabilistic model that captures complex representations for disease progression\nwhile maintaining clinical interpretability. Unlike Markovian state space\nmodels which assume memoryless dynamics, PASS uses an attention mechanism\nto induce \"memoryful\" state transitions, whereby repeatedly updated attention\nweights are used to focus on past state realizations that best predict future states.\nThis gives rise to complex, non-stationary state dynamics that remain interpretable\nthrough the generated attention weights, which designate the relationships between\nthe realized state variables for individual patients. PASS uses phased LSTM\nunits (with time gates controlled by parametrized oscillations) to generate the attention\nweights in continuous time, which enables handling irregularly-sampled\nand potentially missing medical observations. 
Experiments on data from a realworld\ncohort of patients show that PASS successfully balances the tradeoff between\naccuracy and interpretability: it demonstrates superior predictive accuracy\nand learns insightful individual-level representations of disease progression.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ahmed M. Alaa;Mihaela van der Schaar", "authorids": "a7med3laa@hotmail.com;mihaelaucla@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=SJea5oAqK7", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lKz-ZjBS7y4J:scholar.google.com/&scioq=PASS:+Phased+Attentive+State+Space+Modeling+of+Disease+Progression+Trajectories&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJekyhCctQ", "title": "Detecting Adversarial Examples Via Neural Fingerprinting", "track": "main", "status": "Reject", "tldr": "Novel technique for detecting adversarial examples -- robust across gradient-based and gradient-free attacks, AUC-ROC >95%", "abstract": "Deep neural networks are vulnerable to adversarial examples: input data that has been manipulated to cause dramatic model output errors. To defend against such attacks, we propose NeuralFingerprinting: a simple, yet effective method to detect adversarial examples that verifies whether model behavior is consistent with a set of fingerprints. These fingerprints are encoded into the model response during training and are inspired by the use of biometric and cryptographic signatures. In contrast to previous defenses, our method does not rely on knowledge of the adversary and can scale to large networks and input data. The benefits of our method are that 1) it is fast, 2) it is prohibitively expensive for an attacker to reverse-engineer which fingerprints were used, and 3) it does not assume knowledge of the adversary. In this work, we 1) theoretically analyze NeuralFingerprinting for linear models and 2) show that NeuralFingerprinting significantly improves on state-of-the-art detection mechanisms for deep neural networks, by detecting the strongest known adversarial attacks with 98-100% AUC-ROC scores on the MNIST, CIFAR-10 and MiniImagenet (20 classes) datasets. In particular, we consider several threat models, including the most conservative one in which the attacker has full knowledge of the defender's strategy. In all settings, the detection accuracy of NeuralFingerprinting generalizes well to unseen test-data and is robust over a wide range of hyperparameters.", "keywords": "Adversarial Attacks;Deep Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Sumanth Dathathri;Stephan Zheng;Yisong Yue;Richard M. 
Murray", "authorids": "sdathath@caltech.edu;st.t.zheng@gmail.com;yyue@caltech.edu;murray@cds.caltech.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndathathri2019detecting,\ntitle={Detecting Adversarial Examples Via Neural Fingerprinting},\nauthor={Sumanth Dathathri and Stephan Zheng and Yisong Yue and Richard M. Murray},\nyear={2019},\nurl={https://openreview.net/forum?id=SJekyhCctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJekyhCctQ", "pdf_size": 0, "rating": "5;6;9", "confidence": "4;3;4", "wc_review": "602;205;497", "wc_reply_reviewers": "618;0;0", "wc_reply_authors": "2938;421;317", "reply_reviewers": "5;0;0", "reply_authors": "9;2;1", "rating_avg": [ 6.666666666666667, 1.699673171197595 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 434.6666666666667, 167.96097430322584 ], "wc_reply_reviewers_avg": [ 206.0, 291.3279938488576 ], "wc_reply_authors_avg": [ 1225.3333333333333, 1211.7822503330465 ], "reply_reviewers_avg": [ 1.6666666666666667, 2.357022603955158 ], "reply_authors_avg": [ 4.0, 3.559026084010437 ], "replies_avg": [ 47, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.2773500981126145, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17796815094894187701&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "SJequsCqKQ", "title": "Cautious Deep Learning", "track": "main", "status": "Reject", "tldr": "New way to do classification using P(X|Y) instead P(Y|X) which results with cautious prediciton outputing \"I don't know\" for outliers. ", "abstract": "Most classifiers operate by selecting the maximum of an estimate of the conditional distribution $p(y|x)$ where $x$ stands for the features of the instance to be classified and $y$ denotes its label. This often results in a hubristic bias: overconfidence in the assignment of a definite label. Usually, the observations are concentrated on a small volume but the classifier provides definite predictions for the entire space. We propose constructing conformal prediction sets which contain a set of labels rather than a single label. These conformal prediction sets contain the true label with probability $1-\\alpha$. Our construction is based on $p(x|y)$ rather than $p(y|x)$ which results in a classifier that is very cautious: it outputs the null set --- meaning ``I don't know'' --- when the object does not resemble the training examples. An important property of our approach is that classes can be added or removed without having to retrain the classifier. We demonstrate the performance on the ImageNet ILSVRC dataset and the CelebA and IMDB-Wiki facial datasets using high dimensional features obtained from state of the art convolutional neural networks. 
", "keywords": "Deep Learning;Classification;Prediction;Cautious Methods", "primary_area": "", "supplementary_material": "", "author": "Yotam Hechtlinger;Barnabas Poczos;Larry Wasserman", "authorids": "yhechtli@andrew.cmu.edu;bapoczos@cs.cmu.edu;larry@cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhechtlinger2019cautious,\ntitle={Cautious Deep Learning},\nauthor={Yotam Hechtlinger and Barnabas Poczos and Larry Wasserman},\nyear={2019},\nurl={https://openreview.net/forum?id=SJequsCqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJequsCqKQ", "pdf_size": 0, "rating": "4;4;7", "confidence": "5;3;2", "wc_review": "264;477;238", "wc_reply_reviewers": "0;69;0", "wc_reply_authors": "199;180;104", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 326.3333333333333, 107.06488167876938 ], "wc_reply_reviewers_avg": [ 23.0, 32.526911934581186 ], "wc_reply_authors_avg": [ 161.0, 41.04469108991645 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.7559289460184546, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6504123690541577595&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SJerEhR5Km", "title": "Novel positional encodings to enable tree-structured transformers", "track": "main", "status": "Reject", "tldr": "We develop novel positional encodings for tree-structured data, enabling transformers to be applied to tree structured problems.", "abstract": "With interest in program synthesis and similarly \ufb02avored problems rapidly increasing, neural models optimized for tree-domain problems are of great value. In the sequence domain, transformers can learn relationships across arbitrary pairs of positions with less bias than recurrent models. Under the intuition that a similar property would be beneficial in the tree domain, we propose a method to extend transformers to tree-structured inputs and/or outputs. Our approach abstracts transformer's default sinusoidal positional encodings, allowing us to substitute in a novel custom positional encoding scheme that represents node positions within a tree. 
We evaluated our model in tree-to-tree program translation and sequence-to-tree semantic parsing settings, achieving superior performance over the vanilla transformer model on several tasks.\n", "keywords": "program translation;tree structures;transformer", "primary_area": "", "supplementary_material": "", "author": "Vighnesh Leonardo Shiv;Chris Quirk", "authorids": "vishiv@microsoft.com;chrisq@microsoft.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nshiv2019novel,\ntitle={Novel positional encodings to enable tree-structured transformers},\nauthor={Vighnesh Leonardo Shiv and Chris Quirk},\nyear={2019},\nurl={https://openreview.net/forum?id=SJerEhR5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJerEhR5Km", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;3;3", "wc_review": "407;163;177", "wc_reply_reviewers": "0;206;0", "wc_reply_authors": "29;239;76", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 249.0, 111.86897097348606 ], "wc_reply_reviewers_avg": [ 68.66666666666667, 97.10933128295251 ], "wc_reply_authors_avg": [ 114.66666666666667, 89.98641872835898 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=955114948422820883&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJf6BhAqK7", "title": "Variadic Learning by Bayesian Nonparametric Deep Embedding", "track": "main", "status": "Reject", "tldr": "We address any-shot, any-way learning with multi-modal prototypes by connecting bayesian nonparametrics and deep metric learning", "abstract": "Learning at small or large scales of data is addressed by two strong but divided frontiers: few-shot learning and standard supervised learning. Few-shot learning focuses on sample efficiency at small scale, while supervised learning focuses on accuracy at large scale. Ideally they could be reconciled for effective learning at any number of data points (shot) and number of classes (way). To span the full spectrum of shot and way, we frame the variadic learning regime of learning from any number of inputs. We approach variadic learning by meta-learning a novel multi-modal clustering model that connects bayesian nonparametrics and deep metric learning. Our bayesian nonparametric deep embedding (BANDE) method is optimized end-to-end with a single objective, and adaptively adjusts capacity to learn from variable amounts of supervision. We show that multi-modality is critical for learning complex classes such as Omniglot alphabets and carrying out unsupervised clustering. We explore variadic learning by measuring generalization across shot and way between meta-train and meta-test, show the first results for scaling from few-way, few-shot tasks to 1692-way Omniglot classification and 5k-shot CIFAR-10 classification, and find that nonparametric methods generalize better than parametric methods. 
On the standard few-shot learning benchmarks of Omniglot and mini-ImageNet, BANDE equals or improves on the state-of-the-art for semi-supervised classification.", "keywords": "meta-learning;metric learning;bayesian nonparametrics;few-shot learning;deep learning", "primary_area": "", "supplementary_material": "", "author": "Kelsey R Allen;Hanul Shin;Evan Shelhamer;Josh B. Tenenbaum", "authorids": "krallen@mit.edu;skyshin@mit.edu;shelhamer@cs.berkeley.edu;jbt@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nallen2019variadic,\ntitle={Variadic Learning by Bayesian Nonparametric Deep Embedding},\nauthor={Kelsey R Allen and Hanul Shin and Evan Shelhamer and Josh B. Tenenbaum},\nyear={2019},\nurl={https://openreview.net/forum?id=SJf6BhAqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJf6BhAqK7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;2;4", "wc_review": "449;142;1251", "wc_reply_reviewers": "319;0;492", "wc_reply_authors": "1547;682;1735", "reply_reviewers": "1;0;2", "reply_authors": "3;1;4", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 614.0, 467.53894668430206 ], "wc_reply_reviewers_avg": [ 270.3333333333333, 203.7847448221339 ], "wc_reply_authors_avg": [ 1321.3333333333333, 458.5457689503004 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3877552293860599493&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJfFTjA5KQ", "title": "Unification of Recurrent Neural Network Architectures and Quantum Inspired Stable Design", "track": "main", "status": "Reject", "tldr": "We provide theoretical proof of various recurrent neural network designs representable dynamics' nonlinearity and memory scale, and propose a new RNN ansatz inspired by quantum physics.", "abstract": "Various architectural advancements in the design of recurrent neural networks~(RNN) have been focusing on improving the empirical stability and representability by sacrificing the complexity of the architecture. However, more remains to be done to fully understand the fundamental trade-off between these conflicting requirements. Towards answering this question, we forsake the purely bottom-up approach of data-driven machine learning to understand, instead, the physical origin and dynamical properties of existing RNN architectures. This facilitates designing new RNNs with smaller complexity overhead and provable stability guarantee. First, we define a family of deep recurrent neural networks, $n$-$t$-ORNN, according to the order of nonlinearity $n$ and the range of temporal memory scale $t$ in their underlying dynamics embodied in the form of discretized ordinary differential equations. We show that most of the existing proposals of RNN architectures belong to different orders of $n$-$t$-ORNNs. We then propose a new RNN ansatz, namely the Quantum-inspired Universal computing Neural Network~(QUNN), to leverage the reversibility, stability, and universality of quantum computation for stable and universal RNN. 
QUNN provides a complexity reduction in the number of training parameters from being polynomial in both data and correlation time to only linear in correlation time. Compared to Long-Short-Term Memory (LSTM), QUNN of the same number of hidden layers facilitates higher nonlinearity and longer memory span with provable stability. Our work opens new directions in designing minimal RNNs based on additional knowledge about the dynamical nature of both the data and different training architectures.", "keywords": "theory and analysis of RNNs architectures;reversibe evolution;stability of deep neural network;learning representations of outputs or states;quantum inspired embedding", "primary_area": "", "supplementary_material": "", "author": "Murphy Yuezhen Niu;Lior Horesh;Michael O'Keeffe;Isaac Chuang", "authorids": "yzniu@mit.edu;lhoresh@us.ibm.com;michael.okeeffe@ll.mit.edu;ichuang@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nniu2019unification,\ntitle={Unification of Recurrent Neural Network Architectures and Quantum Inspired Stable Design },\nauthor={Murphy Yuezhen Niu and Lior Horesh and Michael O'Keeffe and Isaac Chuang},\nyear={2019},\nurl={https://openreview.net/forum?id=SJfFTjA5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer5;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=SJfFTjA5KQ", "pdf_size": 0, "rating": "4;4;5;5", "confidence": "3;2;2;2", "wc_review": "456;544;195;133", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 4.5, 0.5 ], "confidence_avg": [ 2.25, 0.4330127018922193 ], "wc_review_avg": [ 332.0, 172.25707532638535 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.5773502691896257, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:fmu35FfdPHIJ:scholar.google.com/&scioq=Unification+of+Recurrent+Neural+Network+Architectures+and+Quantum+Inspired+Stable+Design&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJfHg2A5tQ", "title": "BNN+: Improved Binary Network Training", "track": "main", "status": "Reject", "tldr": "The paper presents an improved training mechanism for obtaining binary networks with smaller accuracy drop that helps close the gap with it's full precision counterpart", "abstract": "Deep neural networks (DNN) are widely used in many applications. However, their deployment on edge devices has been difficult because they are resource hungry. Binary neural networks (BNN) help to alleviate the prohibitive resource requirements of DNN, where both activations and weights are limited to 1-bit. We propose an improved binary training method (BNN+), by introducing a regularization function that encourages training weights around binary values. In addition to this, to enhance model performance we add trainable scaling factors to our regularization functions. Furthermore, we use an improved approximation of the derivative of the sign activation function in the backward computation. These additions are based on linear operations that are easily implementable into the binary training framework. 
We show experimental results on CIFAR-10 obtaining an accuracy of 86.5%, on AlexNet and 91.3% with VGG network. On ImageNet, our method also outperforms the traditional BNN method and XNOR-net, using AlexNet by a margin of 4% and 2% top-1 accuracy respectively.", "keywords": "Binary Network;Binary Training;Model Compression;Quantization", "primary_area": "", "supplementary_material": "", "author": "Sajad Darabi;Mouloud Belbahri;Matthieu Courbariaux;Vahid Partovi Nia", "authorids": "sajad.darabi@cs.ucla.edu;belbahrim@dms.umontreal.ca;matthieu.courbariaux@gmail.com;vahid.partovinia@huawei.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndarabi2019bnn,\ntitle={{BNN}+: Improved Binary Network Training},\nauthor={Sajad Darabi and Mouloud Belbahri and Matthieu Courbariaux and Vahid Partovi Nia},\nyear={2019},\nurl={https://openreview.net/forum?id=SJfHg2A5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJfHg2A5tQ", "pdf_size": 0, "rating": "4;6;8", "confidence": "4;3;4", "wc_review": "1192;813;232", "wc_reply_reviewers": "785;0;0", "wc_reply_authors": "2726;1583;625", "reply_reviewers": "3;0;0", "reply_authors": "6;3;1", "rating_avg": [ 6.0, 1.632993161855452 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 745.6666666666666, 394.7998086231665 ], "wc_reply_reviewers_avg": [ 261.6666666666667, 370.0525488209598 ], "wc_reply_authors_avg": [ 1644.6666666666667, 858.8373277609419 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 3.3333333333333335, 2.0548046676563256 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 139, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5929644121060893014&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "ACCELERATING NONCONVEX LEARNING VIA REPLICA EXCHANGE LANGEVIN DIFFUSION", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/894", "id": "SJfPFjA9Fm", "author_site": "Yi Chen, Jinglin Chen, Jing Dong, Jian Peng, Zhaoran Wang", "tldr": "", "abstract": "Langevin diffusion is a powerful method for nonconvex optimization, which enables the escape from local minima by injecting noise into the gradient. In particular, the temperature parameter controlling the noise level gives rise to a tradeoff between ``global exploration'' and ``local exploitation'', which correspond to high and low temperatures. To attain the advantages of both regimes, we propose to use replica exchange, which swaps between two Langevin diffusions with different temperatures. We theoretically analyze the acceleration effect of replica exchange from two perspectives: (i) the convergence in $\\chi^2$-divergence, and (ii) the large deviation principle. Such an acceleration effect allows us to faster approach the global minima. Furthermore, by discretizing the replica exchange Langevin diffusion, we obtain a discrete-time algorithm. For such an algorithm, we quantify its discretization error in theory and demonstrate its acceleration effect in practice. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yi Chen;Jinglin Chen;Jing Dong;Jian Peng;Zhaoran Wang", "authorids": "yichen2016@u.northwestern.edu;jinglinc@illinois.edu;jd2736@columbia.edu;jianpeng@illinois.edu;zhaoranwang@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nchen2018accelerating,\ntitle={{ACCELERATING} {NONCONVEX} {LEARNING} {VIA} {REPLICA} {EXCHANGE} {LANGEVIN} {DIFFUSION}},\nauthor={Yi Chen and Jinglin Chen and Jing Dong and Jian Peng and Zhaoran Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJfPFjA9Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;4", "wc_review": "144;183;177", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "490;221;293", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 168.0, 17.146428199482248 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 334.6666666666667, 113.70234044302791 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10752592153480881669&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SJfPFjA9Fm", "pdf": "https://openreview.net/pdf?id=SJfPFjA9Fm", "email": ";;;;", "author_num": 5 }, { "id": "SJfWKsC5K7", "title": "Explaining Neural Networks Semantically and Quantitatively", "track": "main", "status": "Withdraw", "tldr": "This paper presents a method to explain the knowledge encoded in a convolutional neural network (CNN) quantitatively and semantically.", "abstract": "This paper presents a method to explain the knowledge encoded in a convolutional neural network (CNN) quantitatively and semantically. How to analyze the specific rationale of each prediction made by the CNN presents one of key issues of understanding neural networks, but it is also of significant practical values in certain applications. In this study, we propose to distill knowledge from the CNN into an explainable additive model, so that we can use the explainable model to provide a quantitative explanation for the CNN prediction. We analyze the typical bias-interpreting problem of the explainable model and develop prior losses to guide the learning of the explainable additive model. 
Experimental results have demonstrated the effectiveness of our method.", "keywords": "Network interpretability;deep learning;knowledge distillation;convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Hao Chen;Runjin Chen;Quanshi Zhang", "authorids": "bridgechen@hust.edu.cn;chenrunjin@sjtu.edu.cn;zqs1022@sjtu.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJfWKsC5K7", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;4", "wc_review": "290;838;454", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 527.3333333333334, 229.65094285797207 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4090138270637557944&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "Dynamically Unfolding Recurrent Restorer: A Moving Endpoint Control Method for Image Restoration", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/674", "id": "SJfZKiC5FX", "author_site": "Xiaoshuai Zhang, Yiping Lu, Jiaying Liu, Bin Dong", "tldr": "We propose a novel method to handle image degradations of different levels by learning a diffusion terminal time. Our model can generalize to unseen degradation level and different noise statistic.", "abstract": "In this paper, we propose a new control framework called the moving endpoint control to restore images corrupted by different degradation levels in one model. The proposed control problem contains a restoration dynamics which is modeled by an RNN. The moving endpoint, which is essentially the terminal time of the associated dynamics, is determined by a policy network. We call the proposed model the dynamically unfolding recurrent restorer (DURR). Numerical experiments show that DURR is able to achieve state-of-the-art performances on blind image denoising and JPEG image deblocking. 
Furthermore, DURR can well generalize to images with higher degradation levels that are not included in the training stage.", "keywords": "image restoration;differential equation", "primary_area": "", "supplementary_material": "", "author": "Xiaoshuai Zhang;Yiping Lu;Jiaying Liu;Bin Dong", "authorids": "jet@pku.edu.cn;luyiping9712@pku.edu.cn;liujiaying@pku.edu.cn;dongbin@math.pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzhang2018dynamically,\ntitle={Dynamically Unfolding Recurrent Restorer: A Moving Endpoint Control Method for Image Restoration},\nauthor={Xiaoshuai Zhang and Yiping Lu and Jiaying Liu and Bin Dong},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJfZKiC5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;3;4", "wc_review": "278;252;223", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "448;459;169", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 251.0, 22.464787260658994 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 358.6666666666667, 134.18974956712935 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5404835983364980694&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=SJfZKiC5FX", "pdf": "https://openreview.net/pdf?id=SJfZKiC5FX", "email": ";;;", "author_num": 4 }, { "id": "SJf_XhCqKm", "title": "Open Loop Hyperparameter Optimization and Determinantal Point Processes", "track": "main", "status": "Reject", "tldr": "We address fully parallel hyperparameter optimization with Determinantal Point Processes. ", "abstract": "Driven by the need for parallelizable hyperparameter optimization methods, this paper studies open loop search methods: sequences that are predetermined and can be generated before a single configuration is evaluated. Examples include grid search, uniform random search, low discrepancy sequences, and other sampling distributions.\nIn particular, we propose the use of k-determinantal point processes in hyperparameter optimization via random search. Compared to conventional uniform random search where hyperparameter settings are sampled independently, a k-DPP promotes diversity. We describe an approach that transforms hyperparameter search spaces for efficient use with a k-DPP. In addition, we introduce a novel Metropolis-Hastings algorithm which can sample from k-DPPs defined over any space from which uniform samples can be drawn, including spaces with a mixture of discrete and continuous dimensions or tree structure. 
Our experiments show significant benefits in realistic scenarios with a limited budget for training supervised learners, whether in serial or parallel.", "keywords": "hyperparameter optimization;black box optimization", "primary_area": "", "supplementary_material": "", "author": "Jesse Dodge;Kevin Jamieson;Noah Smith", "authorids": "jessed@cs.cmu.edu;jamieson@cs.washington.edu;nasmith@cs.washington.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndodge2019open,\ntitle={Open Loop Hyperparameter Optimization and Determinantal Point Processes},\nauthor={Jesse Dodge and Kevin Jamieson and Noah Smith},\nyear={2019},\nurl={https://openreview.net/forum?id=SJf_XhCqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJf_XhCqKm", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;5;4", "wc_review": "284;414;136", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "534;645;71", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 278.0, 113.57229709161767 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 416.6666666666667, 248.58845955156931 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9326564281667380212&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Bias-Reduced Uncertainty Estimation for Deep Neural Classifiers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/963", "id": "SJfb5jCqKm", "author_site": "Yonatan Geifman, Guy Uziel, Ran El-Yaniv", "tldr": "We use snapshots from the training process to improve any uncertainty estimation method of a DNN classifier.", "abstract": "We consider the problem of uncertainty estimation in the context of (non-Bayesian) deep neural classification. In this context, all known methods are based on extracting uncertainty signals from a trained network optimized to solve the classification problem at hand. We demonstrate that such techniques tend to introduce biased estimates for instances whose predictions are supposed to be highly confident. We argue that this deficiency is an artifact of the dynamics of training with SGD-like optimizers, and it has some properties similar to overfitting. Based on this observation, we develop an uncertainty estimation algorithm that selectively estimates the uncertainty of highly confident points, using earlier snapshots of the trained model, before their estimates are jittered (and way before they are ready for actual classification). 
We present extensive experiments indicating that the proposed algorithm provides uncertainty estimates that are consistently better than all known methods.", "keywords": "Uncertainty estimation;Deep learning", "primary_area": "", "supplementary_material": "", "author": "Yonatan Geifman;Guy Uziel;Ran El-Yaniv", "authorids": "yonatan.g@cs.technion.ac.il;uzielguy@gmail.com;rani@cs.technion.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ngeifman2018biasreduced,\ntitle={Bias-Reduced Uncertainty Estimation for Deep Neural Classifiers},\nauthor={Yonatan Geifman and Guy Uziel and Ran El-Yaniv},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJfb5jCqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "2;3;4", "wc_review": "238;703;106", "wc_reply_reviewers": "0;60;0", "wc_reply_authors": "227;317;187", "reply_reviewers": "0;1;0", "reply_authors": "2;2;2", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 349.0, 256.05077621440637 ], "wc_reply_reviewers_avg": [ 20.0, 28.284271247461902 ], "wc_reply_authors_avg": [ 243.66666666666666, 54.365021434333634 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 150, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12377365363481099199&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SJfb5jCqKm", "pdf": "https://openreview.net/pdf?id=SJfb5jCqKm", "email": ";;", "author_num": 3 }, { "id": "SJfcrn0qKX", "title": "Realistic Adversarial Examples in 3D Meshes", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Highly expressive models especially deep neural networks (DNNs) have been widely applied to various applications and achieved increasing success. However, recent studies show that such machine learning models appear to be vulnerable against adversarial examples. So far adversarial examples have been heavily explored for 2D images, while few work has tried to understand the vulnerabilities of 3D objects which exist in real world, where 3D objects are projected to 2D domains by photo taking for different learning (recognition) tasks. In this paper we consider adversarial behaviors in practical scenarios by manipulating the shape and texture of a given 3D mesh representation of an object. Our goal is to project the optimized \"adversarial meshes\" to 2D with photo-realistic rendering engine, and still able to mislead different machine learning models.\nExtensive experiments show that by generating unnoticeable 3D adversarial perturbation on shape or texture for a 3D mesh, the corresponding projected 2D instance can either lead classifiers to misclassify the victim object arbitrary malicious target, or hide any target object within the scene from state-of-the-art object detectors. We conduct human studies to show that our optimized adversarial 3D perturbation is highly unnoticeable for human vision systems. 
In addition to the subtle perturbation on a given 3D mesh, we also propose to synthesize a realistic 3D mesh to put in a scene mimicking similar rendering conditions and therefore attack existing objects within it. In-depth analysis for transferability among different 3D rendering engines and vulnerable regions of meshes are provided to help better understand adversarial behaviors in practice and motivate potential defenses. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chaowei Xiao;Dawei Yang;Bo Li;Jia Deng;Mingyan Liu", "authorids": "xiaocw@umich.edu;ydawei@umich.edu;lxbosky@gmail.com;jiadeng@cs.princeton.edu;mingyan@umich.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJfcrn0qKX", "pdf_size": 0, "rating": "3;5;5", "confidence": "3;3;3", "wc_review": "252;404;164", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 273.3333333333333, 99.13402824235256 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5950292500635871149&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJg013C5KX", "title": "Teaching to Teach by Structured Dark Knowledge", "track": "main", "status": "Reject", "tldr": "We newly proposed ``teaching to teach, to educate a better teacher to teach a better student by introducing structured dark knowledge.", "abstract": "To educate hyper deep learners, \\emph{Curriculum Learnings} (CLs) require either human heuristic participation or self-deciding the difficulties of training instances. These coaching manners are blind to the coherent structures among examples, categories, and tasks, which are pregnant with more knowledgeable curriculum-routed teachers. In this paper, we propose a general methodology \\emph{Teaching to Teach} (T2T). T2T is facilitated by \\emph{Structured Dark Knowledge} (SDK) that constitutes a communication protocol between structured knowledge prior and teaching strategies. On one hand, SDK adaptively extracts structured knowledge by selecting a training subset consistent with the previous teaching decisions. On the other hand, SDK teaches curriculum-agnostic teachers by transferring this knowledge to update their teaching policy. This virtuous cycle can be flexibly-deployed in most existing CL platforms and more importantly, very generic across various structured knowledge characteristics, e.g., diversity, complementarity, and causality. 
We evaluate T2T across different learners, teachers, and tasks, which significantly demonstrates that structured knowledge can be inherited by the teachers to further benefit learners' training.\n", "keywords": "teaching to teach;dark knowledge;curriculum learning;teaching", "primary_area": "", "supplementary_material": "", "author": "Ziliang Chen;Keze Wang;Liang Lin", "authorids": "c.ziliang@yahoo.com;kezewang@gmail.com;linliang@ieee.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchen2019teaching,\ntitle={Teaching to Teach by Structured Dark Knowledge},\nauthor={Ziliang Chen and Keze Wang and Liang Lin},\nyear={2019},\nurl={https://openreview.net/forum?id=SJg013C5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=SJg013C5KX", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;1;5", "wc_review": "317;138;380", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 1.699673171197595 ], "wc_review_avg": [ 278.3333333333333, 102.50962014475627 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4193139346887673, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:l18bJiL-prgJ:scholar.google.com/&scioq=Teaching+to+Teach+by+Structured+Dark+Knowledge&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJg6nj09F7", "title": "NEURAL MALWARE CONTROL WITH DEEP REINFORCEMENT LEARNING", "track": "main", "status": "Reject", "tldr": "A deep reinforcement learning-based system is proposed to control when to halt the emulation of an unknown file and to improve the detection rate of a deep malware classifier.", "abstract": "Antimalware products are a key component in detecting malware attacks, and their engines typically execute unknown programs in a sandbox prior to running them on the native operating system. Files cannot be scanned indefinitely so the engine employs heuristics to determine when to halt execution. Previous research has investigated analyzing the sequence of system calls generated during this emulation process to predict if an unknown file is malicious, but these models require the emulation to be stopped after executing a fixed number of events from the beginning of the file. Also, these classifiers are not accurate enough to halt emulation in the middle of the file on their own. In this paper, we propose a novel algorithm which overcomes this limitation and learns the best time to halt the file's execution based on deep reinforcement learning (DRL). Because the new DRL-based system continues to emulate the unknown file until it can make a confident decision to stop, it prevents attackers from avoiding detection by initiating malicious activity after a fixed number of system calls. Results show that the proposed malware execution control model automatically halts emulation for 91.3\\% of the files earlier than heuristics employed by the engine. 
Furthermore, classifying the files at that time improves the true positive rate by 61.5%, at a false positive rate of 1%, compared to a baseline classifier.", "keywords": "malware;execution;control;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Yu Wang;Jack W. Stokes;Mady Marinescu", "authorids": "yu.wang@yale.edu;jstokes@microsoft.com;mady@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2019neural,\ntitle={{NEURAL} {MALWARE} {CONTROL} {WITH} {DEEP} {REINFORCEMENT} {LEARNING}},\nauthor={Yu Wang and Jack W. Stokes and Mady Marinescu},\nyear={2019},\nurl={https://openreview.net/forum?id=SJg6nj09F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJg6nj09F7", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;2;2", "wc_review": "84;111;267", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "501;470;189", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 154.0, 80.65977932030313 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 386.6666666666667, 140.34323005482744 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17491838585596210337&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SJg7IsC5KQ", "title": "On the Convergence and Robustness of Batch Normalization", "track": "main", "status": "Reject", "tldr": "We mathematically analyze the effect of batch normalization on a simple model and obtain key new insights that applies to general supervised learning.", "abstract": "Despite its empirical success, the theoretical underpinnings of the stability, convergence and acceleration properties of batch normalization (BN) remain elusive. In this paper, we attack this problem from a modelling approach, where we perform thorough theoretical analysis on BN applied to simplified model: ordinary least squares (OLS). We discover that gradient descent on OLS with BN has interesting properties, including a scaling law, convergence for arbitrary learning rates for the weights, asymptotic acceleration effects, as well as insensitivity to choice of learning rates. We then demonstrate numerically that these findings are not specific to the OLS problem and hold qualitatively for more complex supervised learning problems. 
This points to a new direction towards uncovering the mathematical principles that underlies batch normalization.", "keywords": "Batch normalization;Convergence analysis;Gradient descent;Ordinary least squares;Deep neural network", "primary_area": "", "supplementary_material": "", "author": "Yongqiang Cai;Qianxiao Li;Zuowei Shen", "authorids": "matcyon@nus.edu.sg;liqix@ihpc.a-star.edu.sg;matzuows@nus.edu.sg", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncai2019on,\ntitle={On the Convergence and Robustness of Batch Normalization},\nauthor={Yongqiang Cai and Qianxiao Li and Zuowei Shen},\nyear={2019},\nurl={https://openreview.net/forum?id=SJg7IsC5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJg7IsC5KQ", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;3;3", "wc_review": "725;332;216", "wc_reply_reviewers": "439;0;0", "wc_reply_authors": "2279;551;116", "reply_reviewers": "2;0;0", "reply_authors": "5;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 424.3333333333333, 217.8138858954793 ], "wc_reply_reviewers_avg": [ 146.33333333333334, 206.9465846272629 ], "wc_reply_authors_avg": [ 982.0, 934.1530923783317 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14811991046914199949&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "CAMOU: Learning Physical Vehicle Camouflages to Adversarially Attack Detectors in the Wild", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/645", "id": "SJgEl3A5tm", "author_site": "Yang Zhang, Hassan Foroosh, Phiip David, Boqing Gong", "tldr": "We propose a method to learn physical vehicle camouflage to adversarially attack object detectors in the wild. We find our camouflage effective and transferable.", "abstract": "In this paper, we conduct an intriguing experimental study about the physical adversarial attack on object detectors in the wild. In particular, we learn a camouflage pattern to hide vehicles from being detected by state-of-the-art convolutional neural network based detectors. Our approach alternates between two threads. In the first, we train a neural approximation function to imitate how a simulator applies a camouflage to vehicles and how a vehicle detector performs given images of the camouflaged vehicles. In the second, we minimize the approximated detection score by searching for the optimal camouflage. 
Experiments show that the learned camouflage can not only hide a vehicle from the image-based detectors under many test cases but also generalizes to different environments, vehicles, and object detectors.", "keywords": "Adversarial Attack;Object Detection;Synthetic Simulation", "primary_area": "", "supplementary_material": "", "author": "Yang Zhang;Hassan Foroosh;Philip David;Boqing Gong", "authorids": "yangzhang4065@gmail.com;foroosh@cs.ucf.edu;philip.j.david4.civ@mail.mil;boqinggo@outlook.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzhang2018camou,\ntitle={{CAMOU}: Learning Physical Vehicle Camouflages to Adversarially Attack Detectors in the Wild},\nauthor={Yang Zhang and Hassan Foroosh and Philip David and Boqing Gong},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJgEl3A5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "4;7;8", "confidence": "3;3;4", "wc_review": "421;241;171", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "773;428;199", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 1.699673171197595 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 277.6666666666667, 105.30379332620876 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 466.6666666666667, 235.92418744635367 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.6933752452815364, "gs_citation": 141, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=497992744230955037&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "openreview": "https://openreview.net/forum?id=SJgEl3A5tm", "pdf": "https://openreview.net/pdf?id=SJgEl3A5tm", "email": ";;;", "author_num": 4 }, { "title": "Learning Latent Superstructures in Variational Autoencoders for Deep Multidimensional Clustering", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/838", "id": "SJgNwi09Km", "author_site": "Xiaopeng Li, Zhourong Chen, Leonard Poon, Nevin Zhang", "tldr": "We investigate a variant of variational autoencoders where there is a superstructure of discrete latent variables on top of the latent features.", "abstract": "We investigate a variant of variational autoencoders where there is a superstructure of discrete latent variables on top of the latent features. In general, our superstructure is a tree structure of multiple super latent variables and it is automatically learned from data. When there is only one latent variable in the superstructure, our model reduces to one that assumes the latent features to be generated from a Gaussian mixture model. We call our model the latent tree variational autoencoder (LTVAE). Whereas previous deep learning methods for clustering produce only one partition of data, LTVAE produces multiple partitions of data, each being given by one super latent variable. 
This is desirable because high dimensional data usually have many different natural facets and can be meaningfully partitioned in multiple ways.", "keywords": "latent tree model;variational autoencoder;deep learning;latent variable model;bayesian network;structure learning;stepwise em;message passing;graphical model;multidimensional clustering;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Xiaopeng Li;Zhourong Chen;Leonard K. M. Poon;Nevin L. Zhang", "authorids": "xlibo@cse.ust.hk;zchenbb@cse.ust.hk;kmpoon@eduhk.hk;lzhang@cse.ust.hk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nli2018learning,\ntitle={Learning Latent Superstructures in Variational Autoencoders for Deep Multidimensional Clustering},\nauthor={Xiaopeng Li and Zhourong Chen and Leonard K. M. Poon and Nevin L. Zhang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJgNwi09Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;4", "wc_review": "613;691;1029", "wc_reply_reviewers": "133;195;486", "wc_reply_authors": "844;508;1082", "reply_reviewers": "1;1;2", "reply_authors": "1;1;2", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 777.6666666666666, 180.54977768533038 ], "wc_reply_reviewers_avg": [ 271.3333333333333, 153.8881267530265 ], "wc_reply_authors_avg": [ 811.3333333333334, 235.47021514880015 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 75, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5927867859070170464&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SJgNwi09Km", "pdf": "https://openreview.net/pdf?id=SJgNwi09Km", "email": ";;;", "author_num": 4 }, { "id": "SJgTps0qtQ", "title": "Exploiting Environmental Variation to Improve Policy Robustness in Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "By formulating the learning curriculum as a bandit problem, we present a principled approach to motivating policy robustness in continuous controls tasks.", "abstract": "Conventional reinforcement learning rarely considers how the physical variations in the environment (eg. mass, drag, etc.) affect the policy learned by the agent. In this paper, we explore how changes in the environment affect policy generalization. We observe experimentally that, for each task we considered, there exists an optimal environment setting that results in the most robust policy that generalizes well to future environments. We propose a novel method to exploit this observation to develop robust actor policies, by automatically developing a sampling curriculum over environment settings to use in training. 
Ours is a model-free approach and experiments demonstrate that the performance of our method is on par with the best policies found by an exhaustive grid search, while bearing a significantly lower computational cost.", "keywords": "Reinforcement Learning;Policy Robustness;Policy generalization;Automated Curriculum", "primary_area": "", "supplementary_material": "", "author": "Siddharth Mysore;Robert Platt;Kate Saenko", "authorids": "sidmys@bu.edu;rplatt@ccs.neu.edu;saenko@bu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmysore2019exploiting,\ntitle={Exploiting Environmental Variation to Improve Policy Robustness in Reinforcement Learning},\nauthor={Siddharth Mysore and Robert Platt and Kate Saenko},\nyear={2019},\nurl={https://openreview.net/forum?id=SJgTps0qtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJgTps0qtQ", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;3;4", "wc_review": "572;119;435", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "627;84;5", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 375.3333333333333, 189.6880480039677 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 238.66666666666666, 276.4806603644377 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9s5MVzLQnpsJ:scholar.google.com/&scioq=Exploiting+Environmental+Variation+to+Improve+Policy+Robustness+in+Reinforcement+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Learning Programmatically Structured Representations with Perceptor Gradients", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1107", "id": "SJggZnRcFQ", "author_site": "Svetlin Penkov, Subramanian Ramamoorthy", "tldr": "", "abstract": "We present the perceptor gradients algorithm -- a novel approach to learning symbolic representations based on the idea of decomposing an agent's policy into i) a perceptor network extracting symbols from raw observation data and ii) a task encoding program which maps the input symbols to output actions. We show that the proposed algorithm is able to learn representations that can be directly fed into a Linear-Quadratic Regulator (LQR) or a general purpose A* planner. 
Our experimental results confirm that the perceptor gradients algorithm is able to efficiently learn transferable symbolic representations as well as generate new observations according to a semantically meaningful specification.\n", "keywords": "representation learning;structured representations;symbols;programs", "primary_area": "", "supplementary_material": "", "author": "Svetlin Penkov;Subramanian Ramamoorthy", "authorids": "sv.penkov@gmail.com;s.ramamoorthy@ed.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\npenkov2018learning,\ntitle={Learning Programmatically Structured Representations with Perceptor Gradients},\nauthor={Svetlin Penkov and Subramanian Ramamoorthy},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJggZnRcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;1;5", "wc_review": "394;117;1119", "wc_reply_reviewers": "465;0;371", "wc_reply_authors": "390;0;1031", "reply_reviewers": "1;0;1", "reply_authors": "2;0;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 1.632993161855452 ], "wc_review_avg": [ 543.3333333333334, 422.47393081966874 ], "wc_reply_reviewers_avg": [ 278.6666666666667, 200.7491524819525 ], "wc_reply_authors_avg": [ 473.6666666666667, 425.04143588857573 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10506348952741337181&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=SJggZnRcFQ", "pdf": "https://openreview.net/pdf?id=SJggZnRcFQ", "email": ";", "author_num": 2 }, { "id": "SJgiNo0cKX", "title": "Multiple Encoder-Decoders Net for Lane Detection", "track": "main", "status": "Reject", "tldr": "", "abstract": "For semantic image segmentation and lane detection, nets with a single spatial pyramid structure or encoder-decoder structure are usually exploited. Convolutional neural networks (CNNs) show great results on both high-level and low-level feature representations; however, this capability has not been fully realized for the lane detection task. In particular, it is still a challenge for model-based lane detection to combine the multi-scale context with a pixel-level accuracy because of the weak visual appearance and strong prior information. In this paper, we propose a novel network for lane detection; the three main contributions are as follows. First, we employ multiple encoder-decoder modules in an end-to-end manner and show promising results for lane detection. Second, we analyze different configurations of multiple encoder-decoders nets.
Third, we make our attempts to rethink the evaluation methods of lane detection for the limitation of the popular methods based on IoU.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuetong Du;Xiaodong Gu;Junqin Liu;Liwen He", "authorids": "1239832590@qq.com;gu3xuan@qq.com;65581134@qq.com;helw@njupt.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndu2019multiple,\ntitle={Multiple Encoder-Decoders Net for Lane Detection},\nauthor={Yuetong Du and Xiaodong Gu and Junqin Liu and Liwen He},\nyear={2019},\nurl={https://openreview.net/forum?id=SJgiNo0cKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJgiNo0cKX", "pdf_size": 0, "rating": "2;2;4", "confidence": "5;4;4", "wc_review": "553;339;1244", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 712.0, 386.19252538943147 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=799121491031774970&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SJgs1n05YQ", "title": "Learning and Planning with a Semantic Model", "track": "main", "status": "Reject", "tldr": "We propose a hybrid model-based & model-free approach using semantic information to improve DRL generalization in man-made environments.", "abstract": "Building deep reinforcement learning agents that can generalize and adapt to unseen environments remains a fundamental challenge for AI. This paper describes progresses on this challenge in the context of man-made environments, which are visually diverse but contain intrinsic semantic regularities. We propose a hybrid model-based and model-free approach, LEArning and Planning with Semantics (LEAPS), consisting of a multi-target sub-policy that acts on visual inputs, and a Bayesian model over semantic structures. When placed in an unseen environment, the agent plans with the semantic model to make high-level decisions, proposes the next sub-target for the sub-policy to execute, and updates the semantic model based on new observations. We perform experiments in visual navigation tasks using House3D, a 3D environment that contains diverse human-designed indoor scenes with real-world objects. 
LEAPS outperforms strong baselines that do not explicitly plan using the semantic content.", "keywords": "deep reinforcement learning;generalization;semantic structure;model-based", "primary_area": "", "supplementary_material": "", "author": "Yi Wu;Yuxin Wu;Aviv Tamar;Stuart Russell;Georgia Gkioxari;Yuandong Tian", "authorids": "jxwuyi@gmail.com;yuxinwu@fb.com;avivt@berkeley.edu;russell@cs.berkeley.edu;gkioxari@fb.com;yuandong@fb.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nwu2019learning,\ntitle={Learning and Planning with a Semantic Model},\nauthor={Yi Wu and Yuxin Wu and Aviv Tamar and Stuart Russell and Georgia Gkioxari and Yuandong Tian},\nyear={2019},\nurl={https://openreview.net/forum?id=SJgs1n05YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJgs1n05YQ", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;3", "wc_review": "267;637;314", "wc_reply_reviewers": "0;80;0", "wc_reply_authors": "391;899;112", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 406.0, 164.46478853136517 ], "wc_reply_reviewers_avg": [ 26.666666666666668, 37.71236166328253 ], "wc_reply_authors_avg": [ 467.3333333333333, 325.79373160875207 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16468612319262774154&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Variational Autoencoders with Jointly Optimized Latent Dependency Structure", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/953", "id": "SJgsCjCqt7", "author_site": "Jiawei He, Yu Gong, Joe Marino, Greg Mori, Andreas Lehrmann", "tldr": "We propose a method for learning latent dependency structure in variational autoencoders.", "abstract": "We propose a method for learning the dependency structure between latent variables in deep latent variable models. Our general modeling and inference framework combines the complementary strengths of deep generative models and probabilistic graphical models. In particular, we express the latent variable space of a variational autoencoder (VAE) in terms of a Bayesian network with a learned, flexible dependency structure. The network parameters, variational parameters as well as the latent topology are optimized simultaneously with a single objective. Inference is formulated via a sampling procedure that produces expectations over latent variable structures and incorporates top-down and bottom-up reasoning over latent variable values. We validate our framework in extensive experiments on MNIST, Omniglot, and CIFAR-10. 
Comparisons to state-of-the-art structured variational autoencoder baselines show improvements in terms of the expressiveness of the learned model.", "keywords": "deep generative models;structure learning", "primary_area": "", "supplementary_material": "", "author": "Jiawei He;Yu Gong;Joseph Marino;Greg Mori;Andreas Lehrmann", "authorids": "jha203@sfu.ca;yu_gong@sfu.ca;jmarino@caltech.edu;mori@cs.sfu.ca;andreas.lehrmann@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nhe2018variational,\ntitle={Variational Autoencoders with Jointly Optimized Latent Dependency Structure},\nauthor={Jiawei He and Yu Gong and Joseph Marino and Greg Mori and Andreas Lehrmann},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJgsCjCqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;5", "wc_review": "539;572;1639", "wc_reply_reviewers": "123;100;527", "wc_reply_authors": "396;301;945", "reply_reviewers": "1;1;2", "reply_authors": "1;1;2", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 916.6666666666666, 510.9444414241359 ], "wc_reply_reviewers_avg": [ 250.0, 196.093515106101 ], "wc_reply_authors_avg": [ 547.3333333333334, 283.85481421944485 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16010026349597999737&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SJgsCjCqt7", "pdf": "https://openreview.net/pdf?id=SJgsCjCqt7", "email": ";;;;", "author_num": 5 }, { "title": "The Unusual Effectiveness of Averaging in GAN Training", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1095", "id": "SJgw_sRqFQ", "author_site": "Yasin YAZICI, Chuan-Sheng Foo, Stefan Winkler, Kim-Hui Yap, Georgios Piliouras, Vijay Chandrasekhar", "tldr": "", "abstract": "We examine two different techniques for parameter averaging in GAN training. Moving Average (MA) computes the time-average of parameters, whereas Exponential Moving Average (EMA) computes an exponentially discounted sum. Whilst MA is known to lead to convergence in bilinear settings, we provide the -- to our knowledge -- first theoretical arguments in support of EMA. We show that EMA converges to limit cycles around the equilibrium with vanishing amplitude as the discount parameter approaches one for simple bilinear games and also enhances the stability of general GAN training. We establish experimentally that both techniques are strikingly effective in the non-convex-concave GAN setting as well. Both improve inception and FID scores on different architectures and for different GAN objectives. We provide comprehensive experimental results across a range of datasets -- mixture of Gaussians, CIFAR-10, STL-10, CelebA and ImageNet -- to demonstrate its effectiveness. 
We achieve state-of-the-art results on CIFAR-10 and produce clean CelebA face images.\\footnote{~The code is available at \\url{https://github.com/yasinyazici/EMA_GAN}}", "keywords": "Generative Adversarial Networks (GANs);Moving Average;Exponential Moving Average;Convergence;Limit Cycles", "primary_area": "", "supplementary_material": "", "author": "Yasin Yaz{\\i}c{\\i};Chuan-Sheng Foo;Stefan Winkler;Kim-Hui Yap;Georgios Piliouras;Vijay Chandrasekhar", "authorids": "yasin001@e.ntu.edu.sg;foocs@i2r.a-star.edu.sg;stefan.winkler@adsc-create.edu.sg;ekhyap@ntu.edu.sg;georgios@sutd.edu.sg;vijay@i2r.a-star.edu.sg", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nyaz{\\i}c{\\i}2018the,\ntitle={The Unusual Effectiveness of Averaging in {GAN} Training},\nauthor={Yasin Yaz{\\i}c{\\i} and Chuan-Sheng Foo and Stefan Winkler and Kim-Hui Yap and Georgios Piliouras and Vijay Chandrasekhar},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJgw_sRqFQ},\n}", "github": "[![github](/images/github_icon.svg) yasinyazici/EMA_GAN](https://github.com/yasinyazici/EMA_GAN)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;6", "confidence": "2;4;4", "wc_review": "104;339;559", "wc_reply_reviewers": "0;6;124", "wc_reply_authors": "303;308;219", "reply_reviewers": "0;1;1", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 334.0, 185.78661595138297 ], "wc_reply_reviewers_avg": [ 43.333333333333336, 57.09251750935104 ], "wc_reply_authors_avg": [ 276.6666666666667, 40.827550610940264 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4777630991460576023&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=SJgw_sRqFQ", "pdf": "https://openreview.net/pdf?id=SJgw_sRqFQ", "email": ";;;;;", "author_num": 6 }, { "id": "SJgzJh0qtQ", "title": "A SINGLE SHOT PCA-DRIVEN ANALYSIS OF NETWORK STRUCTURE TO REMOVE REDUNDANCY", "track": "main", "status": "Withdraw", "tldr": "We present a single shot analysis of a trained neural network to remove redundancy and identify optimal network structure", "abstract": "Deep learning models have outperformed traditional methods in many fields such\nas natural language processing and computer vision. However, despite their\ntremendous success, the methods of designing optimal Convolutional Neural Networks\n(CNNs) are still based on heuristics or grid search. The resulting networks\nobtained using these techniques are often overparametrized with huge computational\nand memory requirements. This paper focuses on a structured, explainable\napproach towards optimal model design that maximizes accuracy while keeping\ncomputational costs tractable. 
We propose a single-shot analysis of a trained CNN\nthat uses Principal Component Analysis (PCA) to determine the number of filters\nthat are doing significant transformations per layer, without the need for retraining.\nIt can be interpreted as identifying the dimensionality of the hypothesis space\nunder consideration. The proposed technique also helps estimate an optimal number\nof layers by looking at the expansion of dimensions as the model gets deeper.\nThis analysis can be used to design an optimal structure of a given network on\na dataset, or help to adapt a predesigned network on a new dataset. We demonstrate\nthese techniques by optimizing VGG and AlexNet networks on CIFAR-10,\nCIFAR-100 and ImageNet datasets.", "keywords": "deep learning;model compression;pruning;PCA", "primary_area": "", "supplementary_material": "", "author": "Isha Garg;Priyadarshini Panda;Kaushik Roy", "authorids": "gargi@purdue.edu;pandap@purdue.edu;kaushik@purdue.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJgzJh0qtQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;5", "wc_review": "309;256;485", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 350.0, 97.88087998514656 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:j2f1uXmLx6kJ:scholar.google.com/&scioq=A+SINGLE+SHOT+PCA-DRIVEN+ANALYSIS+OF+NETWORK+STRUCTURE+TO+REMOVE+REDUNDANCY&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SJl11nCctX", "title": "TFGAN: Improving Conditioning for Text-to-Video Synthesis", "track": "main", "status": "Withdraw", "tldr": "An effective text-conditioning GAN framework for generating videos from text", "abstract": "Developing conditional generative models for text-to-video synthesis is an extremely challenging yet an important topic of research in machine learning. In this work, we address this problem by introducing Text-Filter conditioning Generative Adversarial Network (TFGAN), a GAN model with novel conditioning scheme that aids improving the text-video associations. With a combination of this conditioning scheme and a deep GAN architecture, TFGAN generates photo-realistic videos from text on very challenging real-world video datasets. In addition, we construct a benchmark synthetic dataset of moving shapes to systematically evaluate our conditioning scheme. 
Extensive experiments demonstrate that TFGAN significantly outperforms the existing approaches, and can also generate videos of novel categories not seen during training.\n", "keywords": "Conditional GAN;Video Generation;Text-to-Video Synthesis;Conditional Generative Models;Deep Generative Models", "primary_area": "", "supplementary_material": "", "author": "Yogesh Balaji;Martin Renqiang Min;Bing Bai;Rama Chellappa;Hans Peter Graf", "authorids": "yogesh@cs.umd.edu;renqiang@nec-labs.com;bbai@nec-labs.com;rama@umiacs.umd.edu;hpg@nec-labs.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJl11nCctX", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;4;3", "wc_review": "319;440;140", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 299.6666666666667, 123.23509601119677 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4201330029429670233&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Beyond Pixel Norm-Balls: Parametric Adversaries using an Analytically Differentiable Renderer", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1132", "id": "SJl2niR9KQ", "author_site": "Hsueh-Ti Derek Liu, Michael Tao, Chun-Liang Li, Derek Nowrouzezahrai, Alec Jacobson", "tldr": "Enabled by a novel differentiable renderer, we propose a new metric that has real-world implications for evaluating adversarial machine learning algorithms, resolving the lack of realism of the existing metric based on pixel norms.", "abstract": "Many machine learning image classifiers are vulnerable to adversarial attacks, inputs with perturbations designed to intentionally trigger misclassification. Current adversarial methods directly alter pixel colors and evaluate against pixel norm-balls: pixel perturbations smaller than a specified magnitude, according to a measurement norm. This evaluation, however, has limited practical utility since perturbations in the pixel space do not correspond to underlying real-world phenomena of image formation that lead to them and has no security motivation attached. Pixels in natural images are measurements of light that has interacted with the geometry of a physical scene. As such, we propose a novel evaluation measure, parametric norm-balls, by directly perturbing physical parameters that underly image formation. One enabling contribution we present is a physically-based differentiable renderer that allows us to propagate pixel gradients to the parametric space of lighting and geometry. 
Our approach enables physically-based adversarial attacks, and our differentiable renderer leverages models from the interactive rendering literature to balance the performance and accuracy trade-offs necessary for a memory-efficient and scalable adversarial data augmentation workflow.", "keywords": "adversarial examples;norm-balls;differentiable renderer", "primary_area": "", "supplementary_material": "", "author": "Hsueh-Ti Derek Liu;Michael Tao;Chun-Liang Li;Derek Nowrouzezahrai;Alec Jacobson", "authorids": "hsuehtil@cs.toronto.edu;mtao@dgp.toronto.edu;chunlial@cs.cmu.edu;derek@cim.mcgill.ca;jacobson@cs.toronto.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nliu2018beyond,\ntitle={Beyond Pixel Norm-Balls: Parametric Adversaries using an Analytically Differentiable Renderer},\nauthor={Hsueh-Ti Derek Liu and Michael Tao and Chun-Liang Li and Derek Nowrouzezahrai and Alec Jacobson},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJl2niR9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "wc_review": "429;364;175", "wc_reply_reviewers": "20;0;0", "wc_reply_authors": "64;24;130", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 322.6666666666667, 107.73527225977985 ], "wc_reply_reviewers_avg": [ 6.666666666666667, 9.428090415820632 ], "wc_reply_authors_avg": [ 72.66666666666667, 43.70608907489003 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12447741163485778888&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=SJl2niR9KQ", "pdf": "https://openreview.net/pdf?id=SJl2niR9KQ", "email": ";;;;", "author_num": 5 }, { "id": "SJl2ps0qKQ", "title": "Learning to Decompose Compound Questions with Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We propose a learning-to-decompose agent that helps simple-question answerers to answer compound question over knowledge graph.", "abstract": "As for knowledge-based question answering, a fundamental problem is to relax the assumption of answerable questions from simple questions to compound questions. Traditional approaches firstly detect topic entity mentioned in questions, then traverse the knowledge graph to find relations as a multi-hop path to answers, while we propose a novel approach to leverage simple-question answerers to answer compound questions. Our model consists of two parts: (i) a novel learning-to-decompose agent that learns a policy to decompose a compound question into simple questions and (ii) three independent simple-question answerers that classify the corresponding relations for each simple question. Experiments demonstrate that our model learns complex rules of compositionality as stochastic policy, which benefits simple neural networks to achieve state-of-the-art results on WebQuestions and MetaQA. 
We analyze the interpretable decomposition process as well as generated partitions.", "keywords": "Compound Question Decomposition;Reinforcement Learning;Knowledge-Based Question Answering;Learning-to-decompose", "primary_area": "", "supplementary_material": "", "author": "Haihong Yang;Han Wang;Shuang Guo;Wei Zhang;Huajun Chen", "authorids": "capriceyhh@zju.edu.cn;wanghanwh@zju.edu.cn;guoshuang@zju.edu.cn;lantau.zw@alibaba-inc.com;huajunsir@zju.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyang2019learning,\ntitle={Learning to Decompose Compound Questions with Reinforcement Learning},\nauthor={Haihong Yang and Han Wang and Shuang Guo and Wei Zhang and Huajun Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=SJl2ps0qKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJl2ps0qKQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;5", "wc_review": "305;229;470", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "399;422;807", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 334.6666666666667, 100.59931521746171 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 542.6666666666666, 187.14759475403957 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12018177016184973802&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJl7DsR5YQ", "title": "ReNeg and Backseat Driver: Learning from demonstration with continuous human feedback", "track": "main", "status": "Reject", "tldr": "We introduce a novel framework for learning from demonstration that uses continuous human feedback; we evaluate this framework on continuous control for autonomous vehicles.", "abstract": "Reinforcement learning (RL) is a powerful framework for solving problems by exploring and learning from mistakes. However, in the context of autonomous vehicle (AV) control, requiring an agent to make mistakes, or even allowing mistakes, can be quite dangerous and costly in the real world. For this reason, AV RL is generally only viable in simulation. Because these simulations have imperfect representations, particularly with respect to graphics, physics, and human interaction, we find motivation for a framework similar to RL, suitable to the real world. To this end, we formulate a learning framework that learns from restricted exploration by having a human demonstrator do the exploration. Existing work on learning from demonstration typically either assumes the collected data is performed by an optimal expert, or requires potentially dangerous exploration to find the optimal policy. We propose an alternative framework that learns continuous control from only safe behavior. One of our key insights is that the problem becomes tractable if the feedback score that rates the demonstration applies to the atomic action, as opposed to the entire sequence of actions. 
We use human experts to collect driving data as well as to label the driving data through a framework we call ``Backseat Driver'', giving us state-action pairs matched with scalar values representing the score for the action. We call the more general learning framework ReNeg, since it learns a regression from states to actions given negative as well as positive examples. We empirically validate several models in the ReNeg framework, testing on lane-following with limited data. We find that the best solution in this context outperforms behavioral cloning and has strong connections to stochastic policy gradient approaches.", "keywords": "learning from demonstration;imitation learning;behavioral cloning;reinforcement learning;off-policy;continuous control;autonomous vehicles;deep learning;machine learning;policy gradient", "primary_area": "", "supplementary_material": "", "author": "Zoe Papakipos;Jacob Beck;Michael Littman", "authorids": "zoe_papakipos@alumni.brown.edu;jacob_beck@alumni.brown.edu;mlittman@cs.brown.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npapakipos2019reneg,\ntitle={ReNeg and Backseat Driver: Learning from demonstration with continuous human feedback},\nauthor={Zoe Papakipos and Jacob Beck and Michael Littman},\nyear={2019},\nurl={https://openreview.net/forum?id=SJl7DsR5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJl7DsR5YQ", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;4", "wc_review": "541;227;650", "wc_reply_reviewers": "60;0;323", "wc_reply_authors": "400;306;952", "reply_reviewers": "1;0;2", "reply_authors": "1;2;4", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 472.6666666666667, 179.3215609518895 ], "wc_reply_reviewers_avg": [ 127.66666666666667, 140.27671066700827 ], "wc_reply_authors_avg": [ 552.6666666666666, 284.967054626008 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15426830433002923027&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "SJl8J30qFX", "title": "Learning Global Additive Explanations for Neural Nets Using Model Distillation", "track": "main", "status": "Reject", "tldr": "We propose to leverage model distillation to learn global additive explanations in the form of feature shapes (that are more expressive than feature attributions) for models such as neural nets trained on tabular data.", "abstract": "Interpretability has largely focused on local explanations, i.e. explaining why a model made a particular prediction for a sample. These explanations are appealing due to their simplicity and local fidelity. However, they do not provide information about the general behavior of the model. We propose to leverage model distillation to learn global additive explanations that describe the relationship between input features and model predictions. These global explanations take the form of feature shapes, which are more expressive than feature attributions.
Through careful experimentation, we show qualitatively and quantitatively that global additive explanations are able to describe model behavior and yield insights about models such as neural nets. A visualization of our approach applied to a neural net as it is trained is available at https://youtu.be/ErQYwNqzEdc", "keywords": "global interpretability;additive explanations;model distillation;neural nets;tabular data", "primary_area": "", "supplementary_material": "", "author": "Sarah Tan;Rich Caruana;Giles Hooker;Paul Koch;Albert Gordo", "authorids": "ht395@cornell.edu;rcaruana@microsoft.com;gjh27@cornell.edu;paulkoch@microsoft.com;albert.gordo.s@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntan2019learning,\ntitle={Learning Global Additive Explanations for Neural Nets Using Model Distillation},\nauthor={Sarah Tan and Rich Caruana and Giles Hooker and Paul Koch and Albert Gordo},\nyear={2019},\nurl={https://openreview.net/forum?id=SJl8J30qFX},\n}", "github": "[![github](/images/github_icon.svg) shftan/distilled_additive_explanations](https://github.com/shftan/distilled_additive_explanations)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJl8J30qFX", "pdf_size": 0, "rating": "4;6;6", "confidence": "5;4;5", "wc_review": "293;231;264", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "34;737;34", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 262.6666666666667, 25.32894698868383 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 268.3333333333333, 331.3973781160953 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 123, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5774691633586158931&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "SJl8gnAqtX", "title": "Prob2Vec: Mathematical Semantic Embedding for Problem Retrieval in Adaptive Tutoring", "track": "main", "status": "Reject", "tldr": "We propose the Prob2Vec method for problem embedding used in a personalized e-learning tool in addition to a data level classification method, called negative pre-training, for cases where the training data set is imbalanced.", "abstract": "We propose a new application of embedding techniques to problem retrieval in adaptive tutoring. The objective is to retrieve problems similar in mathematical concepts. There are two challenges: First, like sentences, problems helpful to tutoring are never exactly the same in terms of the underlying concepts. Instead, good problems mix concepts in innovative ways, while still displaying continuity in their relationships. Second, it is difficult for humans to determine a similarity score consistent across a large enough training set. We propose a hierarchical problem embedding algorithm, called Prob2Vec, that consists of an abstraction and an embedding step. Prob2Vec achieves 96.88\\% accuracy on a problem similarity test, in contrast to 75\\% from directly applying state-of-the-art sentence embedding methods. 
It is surprising that Prob2Vec is able to distinguish very fine-grained differences among problems, an ability humans need time and effort to acquire. In addition, the sub-problem of concept labeling with imbalanced training data set is interesting in its own right. It is a multi-label problem suffering from dimensionality explosion, which we propose ways to ameliorate. We propose the novel negative pre-training algorithm that dramatically reduces false negative and positive ratios for classification, using an imbalanced training data set.", "keywords": "personalized learning;e-learning;text embedding;Skip-gram;imbalanced data set;data level classification methods", "primary_area": "", "supplementary_material": "", "author": "Du Su;Ali Yekkehkhany;Yi Lu;Wenmiao Lu", "authorids": "dusu3@illinois.edu;yekkehk2@illinois.edu;yilu4@illinois.edu;wenmiao.lu@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsu2019probvec,\ntitle={Prob2Vec: Mathematical Semantic Embedding for Problem Retrieval in Adaptive Tutoring},\nauthor={Du Su and Ali Yekkehkhany and Yi Lu and Wenmiao Lu},\nyear={2019},\nurl={https://openreview.net/forum?id=SJl8gnAqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJl8gnAqtX", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;4;3", "wc_review": "245;296;281", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "504;449;591", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 274.0, 21.400934559032695 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 514.6666666666666, 58.459862774005515 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14013719798192749721&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "SJl98sR5tX", "title": "Interactive Agent Modeling by Learning to Probe", "track": "main", "status": "Reject", "tldr": "We propose an interactive agent modeling framework by learning a probing policy to diversify task settings and to incite new behaviors of a target agent for a better modeling of the target agent.", "abstract": "The ability of modeling the other agents, such as understanding their intentions and skills, is essential to an agent's interactions with other agents. Conventional agent modeling relies on passive observation from demonstrations. In this work, we propose an interactive agent modeling scheme enabled by encouraging an agent to learn to probe. In particular, the probing agent (i.e. a learner) learns to interact with the environment and with a target agent (i.e., a demonstrator) to maximize the change in the observed behaviors of that agent. Through probing, rich behaviors can be observed and are used for enhancing the agent modeling to learn a more accurate mind model of the target agent. Our framework consists of two learning processes: i) imitation learning for an approximated agent model and ii) pure curiosity-driven reinforcement learning for an efficient probing policy to discover new behaviors that otherwise can not be observed. 
We have validated our approach in four different tasks. The experimental results suggest that the agent model learned by our approach i) generalizes better in novel scenarios than the ones learned by passive observation, random probing, and other curiosity-driven approaches do, and ii) can be used for enhancing performance in multiple applications including distilling optimal planning to a policy net, collaboration, and competition. A video demo is available at https://www.dropbox.com/s/8mz6rd3349tso67/Probing_Demo.mov?dl=0", "keywords": "Agent Modeling;Theory of Mind;Deep Reinforcement Learning;Multi-agent Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Tianmin Shu;Caiming Xiong;Ying Nian Wu;Song-Chun Zhu", "authorids": "tianmin.shu@ucla.edu;cxiong@salesforce.com;ywu@stat.ucla.edu;sczhu@stat.ucla.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshu2019interactive,\ntitle={Interactive Agent Modeling by Learning to Probe},\nauthor={Tianmin Shu and Caiming Xiong and Ying Nian Wu and Song-Chun Zhu},\nyear={2019},\nurl={https://openreview.net/forum?id=SJl98sR5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SJl98sR5tX", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;3;4", "wc_review": "323;751;271;569", "wc_reply_reviewers": "153;105;0;111", "wc_reply_authors": "241;598;190;425", "reply_reviewers": "1;1;0;1", "reply_authors": "1;2;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "wc_review_avg": [ 478.5, 193.44443646690902 ], "wc_reply_reviewers_avg": [ 92.25, 56.37985012395829 ], "wc_reply_authors_avg": [ 363.5, 161.15287772795122 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2644422767362890353&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "SJlYcoCcKX", "title": "KNOWLEDGE DISTILL VIA LEARNING NEURON MANIFOLD", "track": "main", "status": "Withdraw", "tldr": "A new knowledge distill method for transfer learning", "abstract": "Although deep neural networks show their extraordinary power in various tasks, they are not feasible for deploying such large models on embedded systems due to high computational cost and storage space limitation. The recent work knowledge distillation (KD) aims at transferring model knowledge from a well-trained teacher model to a small and fast student model which can significantly help extending the usage of large deep neural networks on portable platform. In this paper, we show that, by properly defining the neuron manifold of deep neuron network (DNN), we can significantly improve the performance of student DNN networks through approximating neuron manifold of powerful teacher network. To make this, we propose several novel methods for learning neuron manifold from DNN model. Empowered with neuron manifold knowledge, our experiments show the great improvement across a variety of DNN architectures and training data. 
Compared with other KD methods, our Neuron Manifold Transfer (NMT) has best transfer ability of the learned features.", "keywords": "Deep Learning;Machine Learning;Knowledge Distill;Model Compression", "primary_area": "", "supplementary_material": "", "author": "Zeyi Tao;Qi Xia;Qun Li", "authorids": "ztao@email.wm.edu;qxia01@email.wm.edu;liqun@cs.wm.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJlYcoCcKX", "pdf_size": 0, "rating": "1;3;5", "confidence": "5;4;3", "wc_review": "26;364;54", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 148.0, 153.16222336681676 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MPCk3vqWVOQJ:scholar.google.com/&scioq=KNOWLEDGE+DISTILL+VIA+LEARNING+NEURON+MANIFOLD&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJldZ2RqFX", "title": "D-GAN: Divergent generative adversarial network for positive unlabeled learning and counter-examples generation", "track": "main", "status": "Reject", "tldr": "A new two-stage positive unlabeled learning approach with GAN", "abstract": "Positive Unlabeled (PU) learning consists in learning to distinguish samples of our class of interest, the positive class, from the counter-examples, the negative class, by using positive labeled and unlabeled samples during the training. Recent approaches exploit the GANs abilities to address the PU learning problem by generating relevant counter-examples. In this paper, we propose a new GAN-based PU learning approach named Divergent-GAN (D-GAN). The key idea is to incorporate a standard Positive Unlabeled learning risk inside the GAN discriminator loss function. In this way, the discriminator can ask the generator to converge towards the unlabeled samples distribution while diverging from the positive samples distribution. This enables the generator convergence towards the unlabeled counter-examples distribution without using prior knowledge, while keeping the standard adversarial GAN architecture. In addition, we discuss normalization techniques in the context of the proposed framework. Experimental results show that the proposed approach overcomes previous GAN-based PU learning methods issues, and it globally outperforms two-stage state of the art PU learning performances in terms of stability and prediction on both simple and complex image datasets.", "keywords": "Representation learning. Generative Adversarial Network (GAN). Positive Unlabeled learning. Image classification", "primary_area": "", "supplementary_material": "", "author": "Florent CHIARONI. Mohamed-Cherif RAHAL. Nicolas HUEBER. Fr\u00e9d\u00e9ric DUFAUX.;Florent CHIARONI. Mohamed-Cherif RAHAL. Nicolas HUEBER. Fr\u00e9d\u00e9ric DUFAUX.;Florent CHIARONI. Mohamed-Cherif RAHAL. Nicolas HUEBER. Fr\u00e9d\u00e9ric DUFAUX.;Florent CHIARONI. Mohamed-Cherif RAHAL. Nicolas HUEBER. 
Fr\u00e9d\u00e9ric DUFAUX.", "authorids": "florent.chiaroni@vedecom.fr;mohamed.rahal@vedecom.fr;nicolas.hueber@isl.eu;frederic.dufaux@l2s.centralesupelec.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndufaux.2019dgan,\ntitle={D-{GAN}: Divergent generative adversarial network for positive unlabeled learning and counter-examples generation},\nauthor={Florent CHIARONI. Mohamed-Cherif RAHAL. Nicolas HUEBER. Fr\u00e9d\u00e9ric DUFAUX.},\nyear={2019},\nurl={https://openreview.net/forum?id=SJldZ2RqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJldZ2RqFX", "pdf_size": 0, "rating": "3;3;5", "confidence": "5;4;1", "wc_review": "690;396;126", "wc_reply_reviewers": "331;0;0", "wc_reply_authors": "2964;1026;1297", "reply_reviewers": "2;0;0", "reply_authors": "5;2;2", "rating_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 1.699673171197595 ], "wc_review_avg": [ 404.0, 230.32151440974852 ], "wc_reply_reviewers_avg": [ 110.33333333333333, 156.0348963818315 ], "wc_reply_authors_avg": [ 1762.3333333333333, 856.8789620217989 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9707253433941506, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:l67fGZT3Q5kJ:scholar.google.com/&scioq=D-GAN:+Divergent+generative+adversarial+network+for+positive+unlabeled+learning+and+counter-examples+generation&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJlgOjAqYQ", "title": "A quantifiable testing of global translational invariance in Convolutional and Capsule Networks", "track": "main", "status": "Reject", "tldr": "Testing of global translational invariance in Convolutional and Capsule Networks", "abstract": " We design simple and quantifiable testing of global translation-invariance in deep learning models trained on the MNIST dataset. Experiments on convolutional and capsules neural networks show that both models have poor performance in dealing with global translation-invariance; however, the performance improved by using data augmentation. 
Although the capsule network is better on the MNIST testing dataset, the convolutional neural network generally has better performance on the translation-invariance.", "keywords": "Translational invariance;CNN;Capsule Network", "primary_area": "", "supplementary_material": "", "author": "Weikai Qi", "authorids": "wikaiqi@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nqi2019a,\ntitle={A quantifiable testing of global translational invariance in Convolutional and Capsule Networks},\nauthor={Weikai Qi},\nyear={2019},\nurl={https://openreview.net/forum?id=SJlgOjAqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJlgOjAqYQ", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;5;4", "wc_review": "297;79;281", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 219.0, 99.21021452787342 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=533665766584254389&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJlh2jR9FX", "title": "Learning with Reflective Likelihoods", "track": "main", "status": "Reject", "tldr": "Training deep probabilistic models with maximum likelihood often leads to \"input forgetting\". We identify a potential cause and propose a new learning criterion to alleviate the issue.", "abstract": "Models parameterized by deep neural networks have achieved state-of-the-art results in many domains. These models are usually trained using the maximum likelihood principle with a finite set of observations. However, training deep probabilistic models with maximum likelihood can lead to the issue we refer to as input forgetting. In deep generative latent-variable models, input forgetting corresponds to posterior collapse---a phenomenon in which the latent variables are driven independent from the observations. However input forgetting can happen even in the absence of latent variables. We attribute input forgetting in deep probabilistic models to the finite sample dilemma of maximum likelihood. We formalize this problem and propose a learning criterion---termed reflective likelihood---that explicitly prevents input forgetting. We empirically observe that the proposed criterion significantly outperforms the maximum likelihood objective when used in classification under a skewed class distribution. Furthermore, the reflective likelihood objective prevents posterior collapse when used to train stochastic auto-encoders with amortized inference. For example in a neural topic modeling experiment, the reflective likelihood objective leads to better quantitative and qualitative results than the variational auto-encoder and the importance-weighted auto-encoder.", "keywords": "new learning criterion;penalized maximum likelihood;posterior inference in deep generative models;input forgetting issue;latent variable collapse issue", "primary_area": "", "supplementary_material": "", "author": "Adji B. 
Dieng;Kyunghyun Cho;David M. Blei;Yann LeCun", "authorids": "abd2141@columbia.edu;kyunghyun.cho@nyu.edu;david.blei@columbia.edu;yann@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ndieng2019learning,\ntitle={Learning with Reflective Likelihoods},\nauthor={Adji B. Dieng and Kyunghyun Cho and David M. Blei and Yann LeCun},\nyear={2019},\nurl={https://openreview.net/forum?id=SJlh2jR9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJlh2jR9FX", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;4", "wc_review": "216;161;641", "wc_reply_reviewers": "140;0;161", "wc_reply_authors": "813;357;1044", "reply_reviewers": "3;0;1", "reply_authors": "3;2;2", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 339.3333333333333, 214.48905695991942 ], "wc_reply_reviewers_avg": [ 100.33333333333333, 71.46249987852993 ], "wc_reply_authors_avg": [ 738.0, 285.4365078261714 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=842918868722182593&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SJlp8sA5Y7", "title": "An Efficient Network for Predicting Time-Varying Distributions", "track": "main", "status": "Withdraw", "tldr": "We propose an efficient recurrent network model for forward prediction on time-varying distributions.", "abstract": "While deep neural networks have achieved groundbreaking prediction results in many tasks, there is a class of data where existing architectures are not optimal -- sequences of probability distributions. Performing forward prediction on sequences of distributions has many important applications. However, there are two main challenges in designing a network model for this task. First, neural networks are unable to encode distributions compactly as each node encodes just a real value. A recent work of Distribution Regression Network (DRN) solved this problem with a novel network that encodes an entire distribution in a single node, resulting in improved accuracies while using much fewer parameters than neural networks. However, despite its compact distribution representation, DRN does not address the second challenge, which is the need to model time dependencies in a sequence of distributions. In this paper, we propose our Recurrent Distribution Regression Network (RDRN) which adopts a recurrent architecture for DRN. The combination of compact distribution representation and shared weights architecture across time steps makes RDRN suitable for modeling the time dependencies in a distribution sequence. 
Compared to neural networks and DRN, RDRN achieves the best prediction performance while keeping the network compact.", "keywords": "Distribution regression;Distribution sequence;Forward prediction", "primary_area": "", "supplementary_material": "", "author": "Connie Kou;Hwee Kuan Lee;Teck Khim Ng;Jorge Sanz", "authorids": "koukl@comp.nus.edu.sg;leehk@bii.a-star.edu.sg;ngtk@comp.nus.edu.sg;jorges@nus.edu.sg", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJlp8sA5Y7", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;4", "wc_review": "168;296;208", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 224.0, 53.466500415369126 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xb7f9xtPb34J:scholar.google.com/&scioq=An+Efficient+Network+for+Predicting+Time-Varying+Distributions&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJlpM3RqKQ", "title": "Expanding the Reach of Federated Learning by Reducing Client Resource Requirements", "track": "main", "status": "Reject", "tldr": "", "abstract": "Communication on heterogeneous edge networks is a fundamental bottleneck in Federated Learning (FL), restricting both model capacity and user participation. To address this issue, we introduce two novel strategies to reduce communication costs: (1) the use of lossy compression on the global model sent server-to-client; and (2) Federated Dropout, which allows users to efficiently train locally on smaller subsets of the global model and also provides a reduction in both client-to-server communication and local computation. We empirically show that these strategies, combined with existing compression approaches for client-to-server communication, collectively provide up to a 9.6x reduction in server-to-client communication, a 1.5x reduction in local computation, and a 24x reduction in upload communication, all without degrading the quality of the final model. 
We thus comprehensively reduce FL's impact on client device resources, allowing higher capacity models to be trained, and a more diverse set of users to be reached.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sebastian Caldas;Jakub Kone\u010dn\u00fd;Brendan McMahan;Ameet Talwalkar", "authorids": "scaldas@cmu.edu;konkey@google.com;mcmahan@google.com;talwalkar@cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ncaldas2019expanding,\ntitle={Expanding the Reach of Federated Learning by Reducing Client Resource Requirements},\nauthor={Sebastian Caldas and Jakub Kone\u010dn\u00fd and Brendan McMahan and Ameet Talwalkar},\nyear={2019},\nurl={https://openreview.net/forum?id=SJlpM3RqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJlpM3RqKQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;3", "wc_review": "404;386;193", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "828;1026;388", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 327.6666666666667, 95.50683512479908 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 747.3333333333334, 266.63499811956837 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 563, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=38188124998474979&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SJlt6oA9Fm", "title": "Selective Convolutional Units: Improving CNNs via Channel Selectivity", "track": "main", "status": "Reject", "tldr": "We propose a new module that improves any ResNet-like architectures by enforcing \"channel selective\" behavior to convolutional layers", "abstract": "Bottleneck structures with identity (e.g., residual) connection are now emerging popular paradigms for designing deep convolutional neural networks (CNN), for processing large-scale features efficiently. In this paper, we focus on the information-preserving nature of identity connection and utilize this to enable a convolutional layer to have a new functionality of channel-selectivity, i.e., re-distributing its computations to important channels. In particular, we propose Selective Convolutional Unit (SCU), a widely-applicable architectural unit that improves parameter efficiency of various modern CNNs with bottlenecks. During training, SCU gradually learns the channel-selectivity on-the-fly via the alternative usage of (a) pruning unimportant channels, and (b) rewiring the pruned parameters to important channels. The rewired parameters emphasize the target channel in a way that selectively enlarges the convolutional kernels corresponding to it. 
Our experimental results demonstrate that the SCU-based models without any postprocessing generally achieve both model compression and accuracy improvement compared to the baselines, consistently for all tested architectures.", "keywords": "convolutional neural networks;channel-selectivity;channel re-wiring;bottleneck architectures;deep learning", "primary_area": "", "supplementary_material": "", "author": "Jongheon Jeong;Jinwoo Shin", "authorids": "jongheonj@kaist.ac.kr;jinwoos@kaist.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\njeong2019selective,\ntitle={Selective Convolutional Units: Improving {CNN}s via Channel Selectivity},\nauthor={Jongheon Jeong and Jinwoo Shin},\nyear={2019},\nurl={https://openreview.net/forum?id=SJlt6oA9Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=SJlt6oA9Fm", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;2", "wc_review": "235;297;135", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "566;182;431", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 222.33333333333334, 66.73995971097243 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 393.0, 159.0534501354812 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:0lAcHlbzZFMJ:scholar.google.com/&scioq=Selective+Convolutional+Units:+Improving+CNNs+via+Channel+Selectivity&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SJx5kn0cK7", "title": "HAPPIER: Hierarchical Polyphonic Music Generative RNN", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generating polyphonic music with coherent global structure is a major challenge for automatic composition algorithms. The primary difficulty arises due to the inefficiency of models to recognize underlying patterns beneath music notes across different levels of time scales and remain long-term consistency while composing. Hierarchical architectures can capture and represent learned patterns in different temporal scales and maintain consistency over long time spans, and this corresponds to the hierarchical structure in music. Motivated by this, focusing on leveraging the idea of hierarchical models and improve them to fit the sequence modeling problem, our paper proposes HAPPIER: a novel HierArchical PolyPhonic musIc gEnerative RNN. In HAPPIER, A higher `measure level' learns correlations across measures and patterns for chord progressions, and a lower `note level' learns a conditional distribution over the notes to generate within a measure. The two hierarchies operate at different clock rates: the higher one operates on a longer timescale and updates every measure, while the lower one operates on a shorter timescale and updates every unit duration. The two levels communicate with each other, and thus the entire architecture is trained jointly end-to-end by back-propagation. 
HAPPIER, profiting from the strength of the hierarchical structure, generates polyphonic music with long-term dependencies compared to state-of-the-art methods.", "keywords": "hierarchical model;RNN;generative model;automatic composing", "primary_area": "", "supplementary_material": "", "author": "Tianyang Zhao;Xiaoxuan Ma;Honglin Ma;Yizhou Wang", "authorids": "zhaotianyang@pku.edu.cn;maxiaoxuan@pku.edu.cn;mahonglin_pku@outlook.com;yizhou.wang@pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhao2019happier,\ntitle={{HAPPIER}: Hierarchical Polyphonic Music Generative {RNN}},\nauthor={Tianyang Zhao and Xiaoxuan Ma and Honglin Ma and Yizhou Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=SJx5kn0cK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJx5kn0cK7", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;5;4", "wc_review": "228;510;574", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 437.3333333333333, 150.3093107192262 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4860945383439166281&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Diversity is All You Need: Learning Skills without a Reward Function", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/720", "id": "SJx63jRqFm", "author_site": "Benjamin Eysenbach, Abhishek Gupta, Julian Ibarz, Sergey Levine", "tldr": "We propose an algorithm for learning useful skills without a reward function, and show how these skills can be used to solve downstream tasks.", "abstract": "Intelligent creatures can explore their environments and learn useful skills without supervision.\nIn this paper, we propose ``Diversity is All You Need'' (DIAYN), a method for learning useful skills without a reward function. Our proposed method learns skills by maximizing an information theoretic objective using a maximum entropy policy. On a variety of simulated robotic tasks, we show that this simple objective results in the unsupervised emergence of diverse skills, such as walking and jumping. In a number of reinforcement learning benchmark environments, our method is able to learn a skill that solves the benchmark task despite never receiving the true task reward. We show how pretrained skills can provide a good parameter initialization for downstream tasks, and can be composed hierarchically to solve complex, sparse reward tasks.
Our results suggest that unsupervised discovery of skills can serve as an effective pretraining mechanism for overcoming challenges of exploration and data efficiency in reinforcement learning.", "keywords": "reinforcement learning;unsupervised learning;skill discovery", "primary_area": "", "supplementary_material": "", "author": "Benjamin Eysenbach;Abhishek Gupta;Julian Ibarz;Sergey Levine", "authorids": "beysenba@cs.cmu.edu;abhigupta@berkeley.edu;julianibarz@google.com;svlevine@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\neysenbach2018diversity,\ntitle={Diversity is All You Need: Learning Skills without a Reward Function},\nauthor={Benjamin Eysenbach and Abhishek Gupta and Julian Ibarz and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJx63jRqFm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=SJx63jRqFm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;3;4", "wc_review": "835;387;492", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "271;289;34", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 571.3333333333334, 191.30487593251655 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 198.0, 116.19810669714029 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1341, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12324439663284457782&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SJx63jRqFm", "pdf": "https://openreview.net/pdf?id=SJx63jRqFm", "email": ";;;", "author_num": 4 }, { "id": "SJx94o0qYX", "title": "Precision Highway for Ultra Low-precision Quantization", "track": "main", "status": "Reject", "tldr": "precision highway; a generalized concept of high-precision information flow for sub 4-bit quantization ", "abstract": "Quantization of a neural network has an inherent problem called accumulated quantization error, which is the key obstacle towards ultra-low precision, e.g., 2- or 3-bit precision. To resolve this problem, we propose precision highway, which forms an end-to-end high-precision information flow while performing the ultra-low-precision computation. First, we describe how the precision highway reduce the accumulated quantization error in both convolutional and recurrent neural networks. We also provide the quantitative analysis of the benefit of precision highway and evaluate the overhead on the state-of-the-art hardware accelerator. In the experiments, our proposed method outperforms the best existing quantization methods while offering 3-bit weight/activation quantization with no accuracy loss and 2-bit quantization with a 2.45 % top-1 accuracy loss in ResNet-50. 
We also report that the proposed method significantly outperforms the existing method in the 2-bit quantization of an LSTM for language modeling.", "keywords": "neural network;quantization;optimization;low-precision;convolutional network;recurrent network", "primary_area": "", "supplementary_material": "", "author": "Eunhyeok Park;Dongyoung Kim;Sungjoo Yoo;Peter Vajda", "authorids": "canusglow@gmail.com;dongyoungkim42@gmail.com;sungjoo.yoo@gmail.com;vajdap@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\npark2019precision,\ntitle={Precision Highway for Ultra Low-precision Quantization},\nauthor={Eunhyeok Park and Dongyoung Kim and Sungjoo Yoo and Peter Vajda},\nyear={2019},\nurl={https://openreview.net/forum?id=SJx94o0qYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJx94o0qYX", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;5;4", "wc_review": "217;187;407", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "684;431;336", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 270.3333333333333, 97.41092797468305 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 483.6666666666667, 146.87031316399134 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12227925765342433228&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SJxCsj0qYX", "title": "Stackelberg GAN: Towards Provable Minimax Equilibrium via Multi-Generator Architectures", "track": "main", "status": "Reject", "tldr": "We study the problem of alleviating the instability issue in the GAN training procedure via new architecture design, with theoretical guarantees.", "abstract": "We study the problem of alleviating the instability issue in the GAN training procedure via new architecture design. The discrepancy between the minimax and maximin objective values could serve as a proxy for the difficulties that the alternating gradient descent encounters in the optimization of GANs. In this work, we give new results on the benefits of multi-generator architecture of GANs. We show that the minimax gap shrinks to \\epsilon as the number of generators increases with rate O(1/\\epsilon). This improves over the best-known result of O(1/\\epsilon^2). At the core of our techniques is a novel application of Shapley-Folkman lemma to the generic minimax problem, where in the literature the technique was only known to work when the objective function is restricted to the Lagrangian function of a constraint optimization problem. Our proposed Stackelberg GAN performs well experimentally in both synthetic and real-world datasets, improving Frechet Inception Distance by 14.61% over the previous multi-generator GANs on the benchmark datasets.", "keywords": "generative adversarial nets;minimax duality gap;equilibrium", "primary_area": "", "supplementary_material": "", "author": "Hongyang Zhang;Susu Xu;Jiantao Jiao;Pengtao Xie;Ruslan Salakhutdinov;Eric P. 
Xing", "authorids": "hongyanz@cs.cmu.edu;susux@andrew.cmu.edu;jiantao@eecs.berkeley.edu;pengtao.xie@petuum.com;rsalakhu@cs.cmu.edu;epxing@cs.cmu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nzhang2019stackelberg,\ntitle={Stackelberg {GAN}: Towards Provable Minimax Equilibrium via Multi-Generator Architectures},\nauthor={Hongyang Zhang and Susu Xu and Jiantao Jiao and Pengtao Xie and Ruslan Salakhutdinov and Eric P. Xing},\nyear={2019},\nurl={https://openreview.net/forum?id=SJxCsj0qYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SJxCsj0qYX", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;3;3", "wc_review": "648;497;135", "wc_reply_reviewers": "0;203;75", "wc_reply_authors": "754;746;396", "reply_reviewers": "0;1;1", "reply_authors": "1;1;2", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 426.6666666666667, 215.25540385525488 ], "wc_reply_reviewers_avg": [ 92.66666666666667, 83.81063310954178 ], "wc_reply_authors_avg": [ 632.0, 166.9091569287517 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7461640725316871474&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SJxFN3RcFX", "title": "Functional Bayesian Neural Networks for Model Uncertainty Quantification", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper, we extend the Bayesian neural network to functional Bayesian neural network with functional Monte Carlo methods that use the samples of functionals instead of samples of networks' parameters for inference to overcome the curse of dimensionality for uncertainty quantification. Based on the previous work on Riemannian Langevin dynamics, we propose the stochastic gradient functional Riemannian dynamics for training functional Bayesian neural network. We show the effectiveness and efficiency of our proposed approach with various experiments. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Nanyang Ye;Zhanxing Zhu", "authorids": "yn272@cam.ac.uk;zhanxing.zhu@pku.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nye2019functional,\ntitle={Functional Bayesian Neural Networks for Model Uncertainty Quantification},\nauthor={Nanyang Ye and Zhanxing Zhu},\nyear={2019},\nurl={https://openreview.net/forum?id=SJxFN3RcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJxFN3RcFX", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;4;2", "wc_review": "574;425;104", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 367.6666666666667, 196.11277934449407 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13305489898970366805&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SJxJtiRqt7", "title": "Generating Images from Sounds Using Multimodal Features and GANs", "track": "main", "status": "Reject", "tldr": "We propose a method of converting from the sound domain into the image domain based on multimodal features and stacked GANs.", "abstract": "Although generative adversarial networks (GANs) have enabled us to convert images from one domain to another similar one, converting between different sensory modalities, such as images and sounds, has been difficult. This study aims to propose a network that reconstructs images from sounds. First, video data with both images and sounds are labeled with pre-trained classifiers. Second, image and sound features are extracted from the data using pre-trained classifiers. Third, multimodal layers are introduced to extract features that are common to both the images and sounds. These layers are trained to extract similar features regardless of the input modality, such as images only, sounds only, and both images and sounds. Once the multimodal layers have been trained, features are extracted from input sounds and converted into image features using a feature-to-feature GAN. Finally, the generated image features are used to reconstruct images. Experimental results show that this method can successfully convert from the sound domain into the image domain. When we applied a pre-trained classifier to both the generated and original images, 31.9% of the examples had at least one of their top 10 labels in common, suggesting reasonably good image generation. 
Our results suggest that common representations can be learned for different modalities, and that proposed method can be applied not only to sound-to-image conversion but also to other conversions, such as from images to sounds.", "keywords": "deep learning;machine learning;multimodal;generative adversarial networks", "primary_area": "", "supplementary_material": "", "author": "Jeonghyun Lyu;Takashi Shinozaki;Kaoru Amano", "authorids": "app@live.jp;tshino@nict.go.jp;kaoruamano@nict.go.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlyu2019generating,\ntitle={Generating Images from Sounds Using Multimodal Features and {GAN}s},\nauthor={Jeonghyun Lyu and Takashi Shinozaki and Kaoru Amano},\nyear={2019},\nurl={https://openreview.net/forum?id=SJxJtiRqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SJxJtiRqt7", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;4", "wc_review": "276;618;720", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 538.0, 189.88417522268674 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10909877345071256112&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Supervised Policy Update for Deep Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/744", "id": "SJxTroR9F7", "author_site": "Quan Vuong, Yiming Zhang, Keith Ross", "tldr": "first posing and solving the sample efficiency optimization problem in the non-parameterized policy space, and then solving a supervised regression problem to find a parameterized policy that is near the optimal non-parameterized policy.", "abstract": "We propose a new sample-efficient methodology, called Supervised Policy Update (SPU), for deep reinforcement learning. Starting with data generated by the current policy, SPU formulates and solves a constrained optimization problem in the non-parameterized proximal policy space. Using supervised regression, it then converts the optimal non-parameterized policy to a parameterized policy, from which it draws new samples. The methodology is general in that it applies to both discrete and continuous action spaces, and can handle a wide variety of proximity constraints for the non-parameterized optimization problem. We show how the Natural Policy Gradient and Trust Region Policy Optimization (NPG/TRPO) problems, and the Proximal Policy Optimization (PPO) problem can be addressed by this methodology. The SPU implementation is much simpler than TRPO. In terms of sample efficiency, our extensive experiments show SPU outperforms TRPO in Mujoco simulated robotic tasks and outperforms PPO in Atari video game tasks.", "keywords": "Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Quan Vuong;Yiming Zhang;Keith W. 
Ross", "authorids": "quan.hovuong@gmail.com;yiming.zhang@nyu.edu;keithwross@nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nvuong2018supervised,\ntitle={{SUPERVISED} {POLICY} {UPDATE}},\nauthor={Quan Vuong and Yiming Zhang and Keith W. Ross},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJxTroR9F7},\n}", "github": "[![github](/images/github_icon.svg) quanvuong/Supervised_Policy_Update](https://github.com/quanvuong/Supervised_Policy_Update)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;9", "confidence": "3;4;2", "wc_review": "275;793;59", "wc_reply_reviewers": "0;623;0", "wc_reply_authors": "679;2307;27", "reply_reviewers": "0;6;0", "reply_authors": "1;12;1", "rating_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 375.6666666666667, 307.992784908276 ], "wc_reply_reviewers_avg": [ 207.66666666666666, 293.68501645281276 ], "wc_reply_authors_avg": [ 1004.3333333333334, 958.8122281702964 ], "reply_reviewers_avg": [ 2.0, 2.8284271247461903 ], "reply_authors_avg": [ 4.666666666666667, 5.185449728701348 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9669638111330201224&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SJxTroR9F7", "pdf": "https://openreview.net/pdf?id=SJxTroR9F7", "email": ";;", "author_num": 3 }, { "id": "SJxbps09K7", "title": "Empirical observations on the instability of aligning word vector spaces with GANs", "track": "main", "status": "Withdraw", "tldr": "An empirical investigation of GAN-based alignment of word vector spaces, focusing on cases, where linear transformations provably exist, but training is unstable.", "abstract": "Unsupervised bilingual dictionary induction (UBDI) is useful for unsupervised machine translation and for cross-lingual transfer of models into low-resource languages. One approach to UBDI is to align word vector spaces in different languages using Generative adversarial networks (GANs) with linear generators, achieving state-of-the-art performance for several language pairs. For some pairs, however, GAN-based induction is unstable or completely fails to align the vector spaces. We focus on cases where linear transformations provably exist, but the performance of GAN-based UBDI depends heavily on the model initialization. We show that the instability depends on the shape and density of the vector sets, but not on noise; it is the result of local optima, but neither over-parameterization nor changing the batch size or the learning rate consistently reduces instability. Nevertheless, we can stabilize GAN-based UBDI through best-of-N model selection, based on an unsupervised stopping criterion. 
", "keywords": "natural language processing;bilingual dictionary induction;unsupervised learning;generative adversarial networks", "primary_area": "", "supplementary_material": "", "author": "Mareike Hartmann;Yova Kementchedjhieva;Anders S\u00f8gaard", "authorids": "hartmann@di.ku.dk;yova@di.ku.dk;soegaard@di.ku.dk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJxbps09K7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "wc_review": "600;928;233", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 587.0, 283.881430647844 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5193835862324858036&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "SJxfxnA9K7", "title": "Structured Prediction using cGANs with Fusion Discriminator", "track": "main", "status": "Reject", "tldr": "We propose a novel way to incorporate conditional image information into the discriminator of GANs using feature fusion that can be used for structured prediction tasks.", "abstract": "We propose a novel method for incorporating conditional information into a generative adversarial network (GAN) for structured prediction tasks. This method is based on fusing features from the generated and conditional information in feature space and allows the discriminator to better capture higher-order statistics from the data. This method also increases the strength of the signals passed through the network where the real or generated data and the conditional data agree. The proposed method is conceptually simpler than the joint convolutional neural network - conditional Markov random field (CNN-CRF) models and enforces higher-order consistency without being limited to a very specific class of high-order potentials. Experimental results demonstrate that this method leads to improvement on a variety of different structured prediction tasks including image synthesis, semantic segmentation, and depth estimation.", "keywords": "Generative Adversarial Networks;GANs;conditional GANs;Discriminator;Fusion", "primary_area": "", "supplementary_material": "", "author": "Faisal Mahmood;Wenhao Xu;Nicholas J. Durr;Jeremiah W. Johnson;Alan Yuille", "authorids": "faisalm@jhu.edu;wxu47@jhu.edu;ndurr@jhu.edu;jeremiah.johnson@unh.edu;alan.l.yuille@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmahmood2019structured,\ntitle={Structured Prediction using c{GAN}s with Fusion Discriminator},\nauthor={Faisal Mahmood and Wenhao Xu and Nicholas J. Durr and Jeremiah W. 
Johnson and Alan Yuille},\nyear={2019},\nurl={https://openreview.net/forum?id=SJxfxnA9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJxfxnA9K7", "pdf_size": 0, "rating": "3;3;5", "confidence": "4;4;3", "wc_review": "436;166;153", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;14", "reply_reviewers": "0;0;0", "reply_authors": "0;0;1", "rating_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 251.66666666666666, 130.451353214735 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 4.666666666666667, 6.599663291074443 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4025895705812426261&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SJxiHnCqKQ", "title": "MCTSBug: Generating Adversarial Text Sequences via Monte Carlo Tree Search and Homoglyph Attack", "track": "main", "status": "Withdraw", "tldr": "Use Monte Carlo Tree Search and Homoglyphs to generate indistinguishable adversarial samples on text data", "abstract": "Crafting adversarial examples on discrete inputs like text sequences is fundamentally different from generating such examples for continuous inputs like images. This paper tries to answer the question: under a black-box setting, can we create adversarial examples automatically to effectively fool deep learning classifiers on texts by making imperceptible changes? Our answer is a firm yes. Previous efforts mostly relied on gradient evidence, and they are less effective either because automatically finding the nearest-neighbor word (w.r.t. meaning) is difficult or because they rely heavily on hand-crafted linguistic rules. We, instead, use Monte Carlo tree search (MCTS) for finding the most important few words to perturb, and perform a homoglyph attack by replacing one character in each selected word with a symbol of identical shape. Our novel algorithm, which we call MCTSBug, is black-box and extremely effective at the same time. Our experimental results indicate that MCTSBug can fool deep learning classifiers at success rates of 95% on seven large-scale benchmark datasets, by perturbing only a few characters. Surprisingly, MCTSBug, without relying on gradient information at all, is more effective than the gradient-based white-box baseline. Thanks to the nature of the homoglyph attack, the generated adversarial perturbations are almost imperceptible to human eyes.
", "keywords": "Adversarial sample;Text;Black-box;MCTS;Homoglyph", "primary_area": "", "supplementary_material": "", "author": "Ji Gao;Jack Lanchantin;Yanjun Qi", "authorids": "jg6yd@virginia.edu;jjl5sw@virginia.edu;yanjun@virginia.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJxiHnCqKQ", "pdf_size": 0, "rating": "3;4", "confidence": "4;3", "wc_review": "383;566", "wc_reply_reviewers": "0;0", "wc_reply_authors": "27;0", "reply_reviewers": "0;0", "reply_authors": "1;0", "rating_avg": [ 3.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "wc_review_avg": [ 474.5, 91.5 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 13.5, 13.5 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.5, 0.5 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R0RmEJzloAkJ:scholar.google.com/&scioq=MCTSBug:+Generating+Adversarial+Text+Sequences+via+Monte+Carlo+Tree+Search+and+Homoglyph+Attack&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Learning sparse relational transition models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/934", "id": "SJxsV2R5FQ", "author_site": "Victoria Xia, Zi Wang, Kelsey Allen, Tom Silver, Leslie Kaelbling", "tldr": "A new approach that learns a representation for describing transition models in complex uncertaindomains using relational rules. ", "abstract": "We present a representation for describing transition models in complex uncertain domains using relational rules. For any action, a rule selects a set of relevant objects and computes a distribution over properties of just those objects in the resulting state given their properties in the previous state. An iterative greedy algorithm is used to construct a set of deictic references that determine which objects are relevant in any given state. Feed-forward neural networks are used to learn the transition distribution on the relevant objects' properties. 
This strategy is demonstrated to be both more versatile and more sample efficient than learning a monolithic transition model in a simulated domain in which a robot pushes stacks of objects on a cluttered table.", "keywords": "Deictic reference;relational model;rule-based transition model", "primary_area": "", "supplementary_material": "", "author": "Victoria Xia;Zi Wang;Kelsey Allen;Tom Silver;Leslie Pack Kaelbling", "authorids": "victoria.f.xia281@gmail.com;ziw@mit.edu;krallen@mit.edu;tslvr@mit.edu;lpk@csail.mit.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nxia2018learning,\ntitle={Learning sparse relational transition models},\nauthor={Victoria Xia and Zi Wang and Leslie Pack Kaelbling},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJxsV2R5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;2;3", "wc_review": "236;258;265", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 253.0, 12.355835328567093 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7997110422667369125&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SJxsV2R5FQ", "pdf": "https://openreview.net/pdf?id=SJxsV2R5FQ", "email": ";;;;", "author_num": 5 }, { "title": "Learning to Schedule Communication in Multi-agent Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/931", "id": "SJxu5iR9KQ", "author_site": "Daewoo Kim, Sangwoo Moon, David Earl Hostallero, Wan Ju Kang, Taeyoung Lee, Kyunghwan Son, Yung Yi", "tldr": "", "abstract": "Many real-world reinforcement learning tasks require multiple agents to make sequential decisions under the agents\u2019 interaction, where well-coordinated actions among the agents are crucial to achieve the target goal better at these tasks. One way to accelerate the coordination effect is to enable multiple agents to communicate with each other in a distributed manner and behave as a group. In this paper, we study a practical scenario when (i) the communication bandwidth is limited and (ii) the agents share the communication medium so that only a restricted number of agents are able to simultaneously use the medium, as in the state-of-the-art wireless networking standards. This calls for a certain form of communication scheduling. In that regard, we propose a multi-agent deep reinforcement learning framework, called SchedNet, in which agents learn how to schedule themselves, how to encode the messages, and how to select actions based on received messages. SchedNet is capable of deciding which agents should be entitled to broadcasting their (encoded) messages, by learning the importance of each agent\u2019s partially observed information. 
We evaluate SchedNet against multiple baselines under two different applications, namely, cooperative communication and navigation, and predator-prey. Our experiments show a non-negligible performance gap between SchedNet and other mechanisms such as the ones without communication and with vanilla scheduling methods, e.g., round robin, ranging from 32% to 43%.", "keywords": "Multi agent reinforcement learning;deep reinforcement learning;Communication", "primary_area": "", "supplementary_material": "", "author": "Daewoo Kim;Sangwoo Moon;David Hostallero;Wan Ju Kang;Taeyoung Lee;Kyunghwan Son;Yung Yi", "authorids": "kdw2139@gmail.com;swmoon00@gmail.com;ddhostallero@kaist.ac.kr;soarhigh0714@gmail.com;tylee0325@gmail.com;khson@lanada.kaist.ac.kr;yiyung@kaist.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nkim2018learning,\ntitle={Learning to Schedule Communication in Multi-agent Reinforcement Learning},\nauthor={Daewoo Kim and Sangwoo Moon and David Hostallero and Wan Ju Kang and Taeyoung Lee and Kyunghwan Son and Yung Yi},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJxu5iR9KQ},\n}", "github": "[![github](/images/github_icon.svg) rhoowd/sched_net](https://github.com/rhoowd/sched_net)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "2;3;5", "wc_review": "202;198;101", "wc_reply_reviewers": "42;0;87", "wc_reply_authors": "600;277;787", "reply_reviewers": "1;0;1", "reply_authors": "1;1;2", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 167.0, 46.69760878960149 ], "wc_reply_reviewers_avg": [ 43.0, 35.52463933666322 ], "wc_reply_authors_avg": [ 554.6666666666666, 210.65981001499918 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 273, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2430706253185717368&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SJxu5iR9KQ", "pdf": "https://openreview.net/pdf?id=SJxu5iR9KQ", "email": ";;;;;;", "author_num": 7 }, { "id": "SJxzPsAqFQ", "title": "Multi-turn Dialogue Response Generation in an Adversarial Learning Framework", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose an adversarial learning approach to the generation of multi-turn dialogue responses. Our proposed framework, hredGAN, is based on conditional generative adversarial networks (GANs). The GAN's generator is a modified hierarchical recurrent encoder-decoder network (HRED) and the discriminator is a word-level bidirectional RNN that shares context and word embedding with the generator. During inference, noise samples conditioned on the dialogue history are used to perturb the generator's latent space to generate several possible responses. The final response is the one ranked best by the discriminator. 
The hredGAN shows major advantages over existing methods: (1) it generalizes better than networks trained using only the log-likelihood criterion, and (2) it generates longer, more informative and more diverse responses with high utterance and topic relevance even with limited training data. This superiority is demonstrated on the Movie triples and Ubuntu dialogue datasets with both the automatic and human evaluations.", "keywords": "dialogue models;adversarial networks;dialogue generation", "primary_area": "", "supplementary_material": "", "author": "Oluwatobi O. Olabiyi;Alan Salimov;Anish Khazane;Erik T. Mueller", "authorids": "oluwatobi.olabiyi@capitalone.com;alan.salimov@capitalone.com;anish.khazan@capitalone.com;erik.mueller@capitalone.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nolabiyi2019multiturn,\ntitle={Multi-turn Dialogue Response Generation in an Adversarial Learning Framework},\nauthor={Oluwatobi O. Olabiyi and Alan Salimov and Anish Khazane and Erik T. Mueller},\nyear={2019},\nurl={https://openreview.net/forum?id=SJxzPsAqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer5;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJxzPsAqFQ", "pdf_size": 0, "rating": "4;4;5;6", "confidence": "4;4;5;4", "wc_review": "581;392;401;172", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "611;385;458;143", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "rating_avg": [ 4.75, 0.82915619758885 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "wc_review_avg": [ 386.5, 144.983619764441 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 399.25, 168.93545365020333 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.17407765595569782, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9787685526336224093&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Hierarchical RL Using an Ensemble of Proprioceptive Periodic Policies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1042", "id": "SJz1x20cFQ", "author_site": "Kenneth Marino, Abhinav Gupta, Rob Fergus, Arthur Szlam", "tldr": "", "abstract": "In this paper we introduce a simple, robust approach to hierarchically training an agent in the setting of sparse reward tasks.\nThe agent is split into a low-level and a high-level policy. The low-level policy only accesses internal, proprioceptive dimensions of the state observation. The low-level policies are trained with a simple reward that encourages changing the values of the non-proprioceptive dimensions. Furthermore, it is induced to be periodic with the use a ``phase function.'' The high-level policy is trained using a sparse, task-dependent reward, and operates by choosing which of the low-level policies to run at any given time. Using this approach, we solve difficult maze and navigation tasks with sparse rewards using the Mujoco Ant and Humanoid agents and show improvement over recent hierarchical methods. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kenneth Marino;Abhinav Gupta;Rob Fergus;Arthur Szlam", "authorids": "kdmarino@cs.cmu.edu;abhinavg@cs.cmu.edu;fergus@cs.nyu.edu;aszlam@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmarino2018hierarchical,\ntitle={Hierarchical {RL} Using an Ensemble of Proprioceptive Periodic Policies},\nauthor={Kenneth Marino and Abhinav Gupta and Rob Fergus and Arthur Szlam},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJz1x20cFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;5", "wc_review": "366;339;583", "wc_reply_reviewers": "0;0;78", "wc_reply_authors": "287;258;961", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 429.3333333333333, 109.2164008847674 ], "wc_reply_reviewers_avg": [ 26.0, 36.76955262170047 ], "wc_reply_authors_avg": [ 502.0, 324.77787280950446 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7300179530988775512&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SJz1x20cFQ", "pdf": "https://openreview.net/pdf?id=SJz1x20cFQ", "email": ";;;", "author_num": 4 }, { "id": "SJz6MnC5YQ", "title": "DEEP GRAPH TRANSLATION", "track": "main", "status": "Reject", "tldr": "", "abstract": "The tremendous success of deep generative models on generating continuous data\nlike image and audio has been achieved; however, few deep graph generative models\nhave been proposed to generate discrete data such as graphs. The recently proposed\napproaches are typically unconditioned generative models which have no\ncontrol over modes of the graphs being generated. Differently, in this paper, we\nare interested in a new problem named Deep Graph Translation: given an input\ngraph, the goal is to infer a target graph by learning their underlying translation\nmapping. Graph translation could be highly desirable in many applications such\nas disaster management and rare event forecasting, where the rare and abnormal\ngraph patterns (e.g., traffic congestions and terrorism events) will be inferred prior\nto their occurrence even without historical data on the abnormal patterns for this\nspecific graph (e.g., a road network or human contact network). To this end, we\npropose a novel Graph-Translation-Generative Adversarial Networks (GT-GAN)\nwhich translates one mode of the input graphs to its target mode. GT-GAN consists\nof a graph translator where we propose new graph convolution and deconvolution\nlayers to learn the global and local translation mapping. A new conditional\ngraph discriminator has also been proposed to classify target graphs by conditioning\non input graphs. 
Extensive experiments on multiple synthetic and real-world\ndatasets demonstrate the effectiveness and scalability of the proposed GT-GAN.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xiaojie Guo;Lingfei Wu;Liang Zhao", "authorids": "xguo7@gmu.edu;lwu@email.wm.edu;lzhao9@gmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nguo2019deep,\ntitle={{DEEP} {GRAPH} {TRANSLATION}},\nauthor={Xiaojie Guo and Lingfei Wu and Liang Zhao},\nyear={2019},\nurl={https://openreview.net/forum?id=SJz6MnC5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJz6MnC5YQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;2;4", "wc_review": "542;293;1200", "wc_reply_reviewers": "48;0;36", "wc_reply_authors": "1134;1611;3249", "reply_reviewers": "1;0;1", "reply_authors": "3;3;5", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 678.3333333333334, 382.6245621435659 ], "wc_reply_reviewers_avg": [ 28.0, 20.396078054371138 ], "wc_reply_authors_avg": [ 1998.0, 905.7714943626786 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12513540941978749079&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Multi-class classification without multi-class labels", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/737", "id": "SJzR2iRcK7", "author_site": "Yen-Chang Hsu, Zhaoyang Lv, Joel Schlosser, Phillip Odom, Zsolt Kira", "tldr": "", "abstract": "This work presents a new strategy for multi-class classification that requires no class-specific labels, but instead leverages pairwise similarity between examples, which is a weaker form of annotation. The proposed method, meta classification learning, optimizes a binary classifier for pairwise similarity prediction and through this process learns a multi-class classifier as a submodule. We formulate this approach, present a probabilistic graphical model for it, and derive a surprisingly simple loss function that can be used to learn neural network-based models. We then demonstrate that this same framework generalizes to the supervised, unsupervised cross-task, and semi-supervised settings. 
Our method is evaluated against state of the art in all three learning paradigms and shows a superior or comparable accuracy, providing evidence that learning multi-class classification without multi-class labels is a viable learning option.", "keywords": "classification;unsupervised learning;semi-supervised learning;problem reduction;weak supervision;cross-task;learning;deep learning;neural network", "primary_area": "", "supplementary_material": "", "author": "Yen-Chang Hsu;Zhaoyang Lv;Joel Schlosser;Phillip Odom;Zsolt Kira", "authorids": "yenchang.hsu@gatech.edu;zhaoyang.lv@gatech.edu;joel.schlosser@gtri.gatech.edu;phillip.odom@gtri.gatech.edu;zkira@gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nhsu2018multiclass,\ntitle={Multi-class classification without multi-class labels},\nauthor={Yen-Chang Hsu and Zhaoyang Lv and Joel Schlosser and Phillip Odom and Zsolt Kira},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJzR2iRcK7},\n}", "github": "[![github](/images/github_icon.svg) GT-RIPL/L2C](https://github.com/GT-RIPL/L2C)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "wc_review": "112;231;506", "wc_reply_reviewers": "0;0;39", "wc_reply_authors": "457;507;772", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 283.0, 164.99898989589806 ], "wc_reply_reviewers_avg": [ 13.0, 18.384776310850235 ], "wc_reply_authors_avg": [ 578.6666666666666, 138.22284744410223 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 215, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15660059153270341215&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SJzR2iRcK7", "pdf": "https://openreview.net/pdf?id=SJzR2iRcK7", "email": ";;;;", "author_num": 5 }, { "title": "What do you learn from context? Probing for sentence structure in contextualized word representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1009", "id": "SJzSgnRcKX", "author_site": "Ian Tenney, Patrick Xia, Berlin Chen, Alex Wang, Adam Poliak, Tom McCoy, Najoung Kim, Benjamin Van Durme, Sam Bowman, Dipanjan Das, Ellie Pavlick", "tldr": "We probe for sentence structure in ELMo and related contextual embedding models. We find existing models efficiently encode syntax and show evidence of long-range dependencies, but only offer small improvements on semantic tasks.", "abstract": "Contextualized representation models such as ELMo (Peters et al., 2018a) and BERT (Devlin et al., 2018) have recently achieved state-of-the-art results on a diverse array of downstream NLP tasks. Building on recent token-level probing work, we introduce a novel edge probing task design and construct a broad suite of sub-sentence tasks derived from the traditional structured NLP pipeline. 
We probe word-level contextual representations from four recent models and investigate how they encode sentence structure across a range of syntactic, semantic, local, and long-range phenomena. We find that existing models trained on language modeling and translation produce strong representations for syntactic phenomena, but only offer comparably small improvements on semantic tasks over a non-contextual baseline.", "keywords": "natural language processing;word embeddings;transfer learning;interpretability", "primary_area": "", "supplementary_material": "", "author": "Ian Tenney;Patrick Xia;Berlin Chen;Alex Wang;Adam Poliak;R Thomas McCoy;Najoung Kim;Benjamin Van Durme;Samuel R. Bowman;Dipanjan Das;Ellie Pavlick", "authorids": "iftenney@google.com;paxia@cs.jhu.edu;bchen6@swarthmore.edu;alexwang@nyu.edu;azpoliak@cs.jhu.edu;tom.mccoy@jhu.edu;n.kim@jhu.edu;vandurme@cs.jhu.edu;bowman@nyu.edu;dipanjand@google.com;ellie_pavlick@brown.edu", "gender": ";;;;;;;;;;", "homepage": ";;;;;;;;;;", "dblp": ";;;;;;;;;;", "google_scholar": ";;;;;;;;;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": ";;;;;;;;;;", "aff": ";;;;;;;;;;", "aff_domain": ";;;;;;;;;;", "position": ";;;;;;;;;;", "bibtex": "@inproceedings{\ntenney2018what,\ntitle={What do you learn from context? Probing for sentence structure in contextualized word representations},\nauthor={Ian Tenney and Patrick Xia and Berlin Chen and Alex Wang and Adam Poliak and R Thomas McCoy and Najoung Kim and Benjamin Van Durme and Sam Bowman and Dipanjan Das and Ellie Pavlick},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJzSgnRcKX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SJzSgnRcKX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;4", "wc_review": "134;169;184", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "111;139;53", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 162.33333333333334, 20.949675149960893 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 101.0, 35.81433604950212 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 11, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1017, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=446886033048011777&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=SJzSgnRcKX", "pdf": "https://openreview.net/pdf?id=SJzSgnRcKX", "email": ";;;;;;;;;;", "author_num": 11 }, { "id": "SJzYdsAqY7", "title": "Spatial-Winograd Pruning Enabling Sparse Winograd Convolution", "track": "main", "status": "Reject", "tldr": "To accelerate the computation of convolutional neural networks, we propose a new two-step pruning technique which achieves a higher Winograd-domain weight sparsity without changing the network structure.", "abstract": "Deep convolutional neural networks (CNNs) are deployed in various applications but demand immense computational requirements. Pruning techniques and Winograd convolution are two typical methods to reduce the CNN computation. However, they cannot be directly combined because Winograd transformation fills in the sparsity resulting from pruning. Li et al. 
(2017) propose sparse Winograd convolution in which weights are directly pruned in the Winograd domain, but this technique is not very practical because Winograd-domain retraining requires low learning rates and hence significantly longer training time. Besides, Liu et al. (2018) move the ReLU function into the Winograd domain, which can help increase the weight sparsity but requires changes in the network structure. To achieve a high Winograd-domain weight sparsity without changing network structures, we propose a new pruning method, spatial-Winograd pruning. As the first step, spatial-domain weights are pruned in a structured way, which efficiently transfers the spatial-domain sparsity into the Winograd domain and avoids Winograd-domain retraining. For the next step, we also perform pruning and retraining directly in the Winograd domain but propose to use an importance factor matrix to adjust weight importance and weight gradients. This adjustment makes it possible to effectively retrain the pruned Winograd-domain network without changing the network structure. For the three models on the datasets of CIFAR-10, CIFAR-100, and ImageNet, our proposed method can achieve the Winograd-domain sparsities of 63%, 50%, and 74%, respectively.", "keywords": "deep learning;convolutional neural network;pruning;Winograd convolution", "primary_area": "", "supplementary_material": "", "author": "Jiecao Yu;Jongsoo Park;Maxim Naumov", "authorids": "jiecaoyu@umich.edu;jongsoo@fb.com;mnaumov@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyu2019spatialwinograd,\ntitle={Spatial-Winograd Pruning Enabling Sparse Winograd Convolution},\nauthor={Jiecao Yu and Jongsoo Park and Maxim Naumov},\nyear={2019},\nurl={https://openreview.net/forum?id=SJzYdsAqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SJzYdsAqY7", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;3;3", "wc_review": "634;95;228", "wc_reply_reviewers": "290;0;81", "wc_reply_authors": "1566;496;618", "reply_reviewers": "1;0;1", "reply_authors": "3;1;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 319.0, 229.2611320452437 ], "wc_reply_reviewers_avg": [ 123.66666666666667, 122.17564769171018 ], "wc_reply_authors_avg": [ 893.3333333333334, 478.247727531338 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15739465085304518465&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Spectral Inference Networks: Unifying Deep and Spectral Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/962", "id": "SJzqpj09YQ", "author_site": "David Pfau, Stig Petersen, Ashish Agarwal, David Barrett, Kimberly Stachenfeld", "tldr": "We show how to learn spectral decompositions of linear operators with deep learning, and use it for unsupervised learning without a generative model.", "abstract": "We present Spectral Inference Networks, a framework for learning eigenfunctions of linear operators by stochastic optimization. 
Spectral Inference Networks generalize Slow Feature Analysis to generic symmetric operators, and are closely related to Variational Monte Carlo methods from computational physics. As such, they can be a powerful tool for unsupervised representation learning from video or graph-structured data. We cast training Spectral Inference Networks as a bilevel optimization problem, which allows for online learning of multiple eigenfunctions. We show results of training Spectral Inference Networks on problems in quantum mechanics and feature learning for videos on synthetic datasets. Our results demonstrate that Spectral Inference Networks accurately recover eigenfunctions of linear operators and can discover interpretable representations from video in a fully unsupervised manner.", "keywords": "spectral learning;unsupervised learning;manifold learning;dimensionality reduction", "primary_area": "", "supplementary_material": "", "author": "David Pfau;Stig Petersen;Ashish Agarwal;David G. T. Barrett;Kimberly L. Stachenfeld", "authorids": "pfau@google.com;svp@google.com;agarwal@google.com;barrettdavid@google.com;stachenfeld@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\npfau2018spectral,\ntitle={Spectral Inference Networks: Unifying Deep and Spectral Learning},\nauthor={David Pfau and Stig Petersen and Ashish Agarwal and David G. T. Barrett and Kimberly L. Stachenfeld},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SJzqpj09YQ},\n}", "github": "[![github](/images/github_icon.svg) deepmind/spectral_inference_networks](https://github.com/deepmind/spectral_inference_networks) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SJzqpj09YQ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;3;3", "wc_review": "250;180;323", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "472;724;725", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 251.0, 58.38378770400792 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 640.3333333333334, 119.03034160340052 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16660579419089969631&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SJzqpj09YQ", "pdf": "https://openreview.net/pdf?id=SJzqpj09YQ", "email": ";;;;", "author_num": 5 }, { "id": "SJzuHiA9tQ", "title": "Generative Adversarial Network Training is a Continual Learning Problem", "track": "main", "status": "Reject", "tldr": "Generative Adversarial Network Training is a Continual Learning Problem.", "abstract": "Generative Adversarial Networks (GANs) have proven to be a powerful framework for learning to draw samples from complex distributions. However, GANs are also notoriously difficult to train, with mode collapse and oscillations a common problem. 
We hypothesize that this is at least in part due to the evolution of the generator distribution and the catastrophic forgetting tendency of neural networks, which leads to the discriminator losing the ability to remember synthesized samples from previous instantiations of the generator. Recognizing this, our contributions are twofold. First, we show that GAN training makes for a more interesting and realistic benchmark for continual learning methods evaluation than some of the more canonical datasets. Second, we propose leveraging continual learning techniques to augment the discriminator, preserving its ability to recognize previous generator samples. We show that the resulting methods add only a light amount of computation, involve minimal changes to the model, and result in better overall performance on the examined image and text generation tasks.", "keywords": "Generative Adversarial Networks;Continual Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Kevin J Liang;Chunyuan Li;Guoyin Wang;Lawrence Carin", "authorids": "kevin.liang@duke.edu;chunyuan.li@duke.edu;guoyin.wang@duke.edu;lcarin@duke.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nliang2019generative,\ntitle={Generative Adversarial Network Training is a Continual Learning Problem},\nauthor={Kevin J Liang and Chunyuan Li and Guoyin Wang and Lawrence Carin},\nyear={2019},\nurl={https://openreview.net/forum?id=SJzuHiA9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SJzuHiA9tQ", "pdf_size": 0, "rating": "3;5;7", "confidence": "5;4;4", "wc_review": "433;395;631", "wc_reply_reviewers": "230;0;227", "wc_reply_authors": "552;443;974", "reply_reviewers": "1;0;2", "reply_authors": "1;1;4", "rating_avg": [ 5.0, 1.632993161855452 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 486.3333333333333, 103.46443296557948 ], "wc_reply_reviewers_avg": [ 152.33333333333334, 107.7228955339682 ], "wc_reply_authors_avg": [ 656.3333333333334, 228.98956793317512 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4525626415891074742&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SJzvDjAcK7", "title": "Intriguing Properties of Learned Representations", "track": "main", "status": "Reject", "tldr": "Imposing a low rank structure on learned representations in deep networks yields a lot of interesting benefits.", "abstract": "A key feature of neural networks, particularly deep convolutional neural networks, is their ability to learn useful representations from data. The very last layer of a neural network is then simply a linear model trained on these learned representations. Despite their numerous applications in other tasks such as classification, retrieval, clustering etc., a.k.a. 
transfer learning, not much work has been published that investigates the structure of these representations or indeed whether structure can be imposed on them during the training process.\n\nIn this paper, we study the effective dimensionality of the learned representations by models that have proved highly successful for image classification. We focus on ResNet-18, ResNet-50 and VGG-19 and observe that when trained on CIFAR10 or CIFAR100, the learned representations exhibit a fairly low rank structure. We propose a modification to the training procedure, which further encourages low rank structure on learned activations. Empirically, we show that this has implications for robustness to adversarial examples and compression.", "keywords": "deep learning;low rank representations;adversarial robustness", "primary_area": "", "supplementary_material": "", "author": "Amartya Sanyal;Varun Kanade;Philip H. Torr", "authorids": "amartya.sanyal@cs.ox.ac.uk;varunk@cs.ox.ac.uk;philip.torr@eng.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsanyal2019intriguing,\ntitle={Intriguing Properties of Learned Representations},\nauthor={Amartya Sanyal and Varun Kanade and Philip H. Torr},\nyear={2019},\nurl={https://openreview.net/forum?id=SJzvDjAcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SJzvDjAcK7", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;2;2", "wc_review": "366;499;352", "wc_reply_reviewers": "497;0;0", "wc_reply_authors": "854;625;0", "reply_reviewers": "2;0;0", "reply_authors": "2;1;0", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 405.6666666666667, 66.24365797736581 ], "wc_reply_reviewers_avg": [ 165.66666666666666, 234.28804683314277 ], "wc_reply_authors_avg": [ 493.0, 360.92196755900943 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Iy8D1y-J5nkJ:scholar.google.com/&scioq=Intriguing+Properties+of+Learned+Representations&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SJzwb2RcK7", "title": "Adversarial Decomposition of Text Representation", "track": "main", "status": "Withdraw", "tldr": "A method which learns separate representations for the meaning and the form of a sentence", "abstract": "In this paper, we present a method for adversarial decomposition of text representation. This method can be used to decompose a representation of an input sentence into several independent vectors, where each vector is responsible for a specific aspect of the input sentence. We evaluate the proposed method on two case studies: the conversion between different social registers and diachronic language change. We show that the proposed method is capable of fine-grained con- trolled change of these aspects of the input sentence. For example, our model is capable of learning a continuous (rather than categorical) representation of the style of the sentence, in line with the reality of language use. 
The model uses adversarial-motivational training and includes a special motivational loss, which acts opposite to the discriminator and encourages a better decomposition. Finally, we evaluate the obtained meaning embeddings on a downstream task of para- phrase detection and show that they are significantly better than embeddings of a regular autoencoder.", "keywords": "learning representation;decomposition;adversarial training;style transfer", "primary_area": "", "supplementary_material": "", "author": "Alexey Romanov;Anna Rumshisky;Anna Rogers;David Donahue", "authorids": "jgc128@outlook.com;arum@cs.uml.edu;arogers@cs.uml.edu;david_donahue@student.uml.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJzwb2RcK7", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;3;3", "wc_review": "340;385;517", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "208;312;531", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 414.0, 75.11324783285569 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 350.3333333333333, 134.6212794802103 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9034206936363471451&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "SJzwvoCqF7", "title": "On Tighter Generalization Bounds for Deep Neural Networks: CNNs, ResNets, and Beyond", "track": "main", "status": "Reject", "tldr": "", "abstract": "We propose a generalization error bound for a general family of deep neural networks based on the depth and width of the networks, as well as the spectral norm of weight matrices. Through introducing a novel characterization of the Lipschitz properties of neural network family, we achieve a tighter generalization error bound. We further obtain a result that is free of linear dependence on norms for bounded losses. Besides the general deep neural networks, our results can be applied to derive new bounds for several popular architectures, including convolutional neural networks (CNNs), residual networks (ResNets), and hyperspherical networks (SphereNets). 
When achieving same generalization errors with previous arts, our bounds allow for the choice of much larger parameter spaces of weight matrices, inducing potentially stronger expressive ability for neural networks.", "keywords": "deep learning;generalization error bound;convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Xingguo Li;Junwei Lu;Zhaoran Wang;Jarvis Haupt;Tuo Zhao", "authorids": "xingguol@princeton.edu;junweilu@hsph.harvard.edu;zhaoranwang@gmail.com;jdhaupt@umn.edu;tourzhao@gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019on,\ntitle={On Tighter Generalization Bounds for Deep Neural Networks: {CNN}s, ResNets, and Beyond},\nauthor={Xingguo Li and Junwei Lu and Zhaoran Wang and Jarvis Haupt and Tuo Zhao},\nyear={2019},\nurl={https://openreview.net/forum?id=SJzwvoCqF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SJzwvoCqF7", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;4;4", "wc_review": "284;288;277", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "598;403;499", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 283.0, 4.546060565661952 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 500.0, 79.61155694998057 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 77, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18446039737882674667&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3 }, { "title": "PeerNets: Exploiting Peer Wisdom Against Adversarial Attacks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/853", "id": "Sk4jFoA9K7", "author_site": "Jan Svoboda, Jonathan Masci, Federico Monti, Michael Bronstein, Leonidas Guibas", "tldr": "", "abstract": "Deep learning systems have become ubiquitous in many aspects of our lives. Unfortunately, it has been shown that such systems are vulnerable to adversarial attacks, making them prone to potential unlawful uses. \nDesigning deep neural networks that are robust to adversarial attacks is a fundamental step in making such systems safer and deployable in a broader variety of applications (e.g. autonomous driving), but more importantly is a necessary step to design novel and more advanced architectures built on new computational paradigms rather than marginally building on the existing ones.\nIn this paper we introduce PeerNets, a novel family of convolutional networks alternating classical Euclidean convolutions with graph convolutions to harness information from a graph of peer samples. 
This results in a form of non-local forward propagation in the model, where latent features are conditioned on the global structure induced by the graph, that is up to 3 times more robust to a variety of white- and black-box adversarial attacks compared to conventional architectures with almost no drop in accuracy.", "keywords": "peernet;peernets;graph;geometric deep learning;adversarial;perturbation;defense;peer regularization", "primary_area": "", "supplementary_material": "", "author": "Jan Svoboda;Jonathan Masci;Federico Monti;Michael Bronstein;Leonidas Guibas", "authorids": "jan.svoboda@usi.ch;jonathan@nnaisense.com;federico.monti@usi.ch;michael.bronstein@usi.ch;guibas@cs.stanford.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nsvoboda2018peernets,\ntitle={PeerNets: Exploiting Peer Wisdom Against Adversarial Attacks},\nauthor={Jan Svoboda and Jonathan Masci and Federico Monti and Michael Bronstein and Leonidas Guibas},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Sk4jFoA9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7", "confidence": "5;4", "wc_review": "207;257", "wc_reply_reviewers": "0;58", "wc_reply_authors": "603;441", "reply_reviewers": "0;1", "reply_authors": "1;2", "rating_avg": [ 6.5, 0.5 ], "confidence_avg": [ 4.5, 0.5 ], "wc_review_avg": [ 232.0, 25.0 ], "wc_reply_reviewers_avg": [ 29.0, 29.0 ], "wc_reply_authors_avg": [ 522.0, 81.0 ], "reply_reviewers_avg": [ 0.5, 0.5 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=Sk4jFoA9K7", "pdf": "https://openreview.net/pdf?id=Sk4jFoA9K7", "email": ";;;;", "author_num": 5 }, { "title": "Attentive Neural Processes", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1006", "id": "SkE6PjC9KX", "author_site": "Hyunjik Kim, Andriy Mnih, Jonathan Schwarz, Marta Garnelo, S. M. Ali Eslami, Dan Rosenbaum, Oriol Vinyals, Yee Whye Teh", "tldr": "A model for regression that learns conditional distributions of a stochastic process, by incorporating attention into Neural Processes.", "abstract": "Neural Processes (NPs) (Garnelo et al., 2018) approach regression by learning to map a context set of observed input-output pairs to a distribution over regression functions. Each function models the distribution of the output given an input, conditioned on the context. NPs have the benefit of fitting observed data efficiently with linear complexity in the number of context input-output pairs, and can learn a wide family of conditional distributions; they learn predictive distributions conditioned on context sets of arbitrary size. Nonetheless, we show that NPs suffer a fundamental drawback of underfitting, giving inaccurate predictions at the inputs of the observed data they condition on. We address this issue by incorporating attention into NPs, allowing each input location to attend to the relevant context points for the prediction. We show that this greatly improves the accuracy of predictions, results in noticeably faster training, and expands the range of functions that can be modelled. 
", "keywords": "Neural Processes;Conditional Neural Processes;Stochastic Processes;Regression;Attention", "primary_area": "", "supplementary_material": "", "author": "Hyunjik Kim;Andriy Mnih;Jonathan Schwarz;Marta Garnelo;Ali Eslami;Dan Rosenbaum;Oriol Vinyals;Yee Whye Teh", "authorids": "hyunjikk@google.com;amnih@google.com;schwarzjn@google.com;garnelo@google.com;aeslami@google.com;danro@google.com;vinyals@google.com;ywteh@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nkim2018attentive,\ntitle={Attentive Neural Processes},\nauthor={Hyunjik Kim and Andriy Mnih and Jonathan Schwarz and Marta Garnelo and Ali Eslami and Dan Rosenbaum and Oriol Vinyals and Yee Whye Teh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkE6PjC9KX},\n}", "github": "[![github](/images/github_icon.svg) deepmind/neural-processes](https://github.com/deepmind/neural-processes) + [![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=SkE6PjC9KX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "wc_review": "324;409;234", "wc_reply_reviewers": "0;0;57", "wc_reply_authors": "194;717;388", "reply_reviewers": "0;0;2", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 322.3333333333333, 71.45317036742006 ], "wc_reply_reviewers_avg": [ 19.0, 26.870057685088806 ], "wc_reply_authors_avg": [ 433.0, 215.8718755805551 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 519, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6519833436864425356&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SkE6PjC9KX", "pdf": "https://openreview.net/pdf?id=SkE6PjC9KX", "email": ";;;;;;;", "author_num": 8 }, { "title": "Representation Degeneration Problem in Training Natural Language Generation Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/759", "id": "SkEYojRqtm", "author_site": "Jun Gao, Di He, Xu Tan, Tao Qin, Liwei Wang, Tie-Yan Liu", "tldr": "", "abstract": "We study an interesting problem in training neural network-based models for natural language generation tasks, which we call the \\emph{representation degeneration problem}. We observe that when training a model for natural language generation tasks through likelihood maximization with the weight tying trick, especially with big training datasets, most of the learnt word embeddings tend to degenerate and be distributed into a narrow cone, which largely limits the representation power of word embeddings. We analyze the conditions and causes of this problem and propose a novel regularization method to address it. 
Experiments on language modeling and machine translation show that our method can largely mitigate the representation degeneration problem and achieve better performance than baseline algorithms.", "keywords": "Natural Language Processing;Representation Learning", "primary_area": "", "supplementary_material": "", "author": "Jun Gao;Di He;Xu Tan;Tao Qin;Liwei Wang;Tieyan Liu", "authorids": "jungao@cs.toronto.edu;dihe@microsoft.com;xu.tan@microsoft.com;taoqin@microsoft.com;wanglw@cis.pku.edu.cn;tyliu@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ngao2018representation,\ntitle={Representation Degeneration Problem in Training Natural Language Generation Models},\nauthor={Jun Gao and Di He and Xu Tan and Tao Qin and Liwei Wang and Tieyan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkEYojRqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;3", "wc_review": "364;225;90", "wc_reply_reviewers": "11;0;0", "wc_reply_authors": "500;324;289", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 226.33333333333334, 111.86400473590938 ], "wc_reply_reviewers_avg": [ 3.6666666666666665, 5.185449728701348 ], "wc_reply_authors_avg": [ 371.0, 92.32912144424783 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 311, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3878205322217052013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SkEYojRqtm", "pdf": "https://openreview.net/pdf?id=SkEYojRqtm", "email": ";;;;;", "author_num": 6 }, { "title": "Hierarchical interpretations for neural network predictions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/681", "id": "SkEqro0ctQ", "author_site": "Chandan Singh, William Murdoch, Bin Yu", "tldr": "We introduce and validate hierarchical local interpretations, the first technique to automatically search for and display important interactions for individual predictions made by LSTMs and CNNs.", "abstract": "Deep neural networks (DNNs) have achieved impressive predictive performance due to their ability to learn complex, non-linear relationships between variables. However, the inability to effectively visualize these relationships has led to DNNs being characterized as black boxes and consequently limited their applications. To ameliorate this problem, we introduce the use of hierarchical interpretations to explain DNN predictions through our proposed method: agglomerative contextual decomposition (ACD). Given a prediction from a trained DNN, ACD produces a hierarchical clustering of the input features, along with the contribution of each cluster to the final prediction. This hierarchy is optimized to identify clusters of features that the DNN learned are predictive. 
We introduce ACD using examples from Stanford Sentiment Treebank and ImageNet, in order to diagnose incorrect predictions, identify dataset bias, and extract polarizing phrases of varying lengths. Through human experiments, we demonstrate that ACD enables users both to identify the more accurate of two DNNs and to better trust a DNN's outputs. We also find that ACD's hierarchy is largely robust to adversarial perturbations, implying that it captures fundamental aspects of the input and ignores spurious noise.", "keywords": "interpretability;natural language processing;computer vision", "primary_area": "", "supplementary_material": "", "author": "Chandan Singh;W. James Murdoch;Bin Yu", "authorids": "chandan_singh@berkeley.edu;jmurdoch@berkeley.edu;binyu@berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsingh2018hierarchical,\ntitle={Hierarchical interpretations for neural network predictions},\nauthor={Chandan Singh and W. James Murdoch and Bin Yu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkEqro0ctQ},\n}", "github": "[![github](/images/github_icon.svg) csinva/hierarchical-dnn-interpretations](https://github.com/csinva/hierarchical-dnn-interpretations)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;3", "wc_review": "390;130;124", "wc_reply_reviewers": "128;19;0", "wc_reply_authors": "860;567;255", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 214.66666666666666, 124.00358417759098 ], "wc_reply_reviewers_avg": [ 49.0, 56.39739946723312 ], "wc_reply_authors_avg": [ 560.6666666666666, 247.03081229316766 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 186, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14523630218994203463&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SkEqro0ctQ", "pdf": "https://openreview.net/pdf?id=SkEqro0ctQ", "email": ";;", "author_num": 3 }, { "id": "SkGH2oRcYX", "title": "DEEP ADVERSARIAL FORWARD MODEL", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning world dynamics has recently been investigated as a way to make reinforcement\nlearning (RL) algorithms to be more sample efficient and interpretable.\nIn this paper, we propose to capture an environment dynamics with a novel forward\nmodel that leverages recent works on adversarial learning and visual control. Such\na model estimates future observations conditioned on the current ones and other\ninput variables such as actions taken by an RL-agent. We focus on image generation\nwhich is a particularly challenging topic but our method can be adapted to\nother modalities. More precisely, our forward model is trained to produce realistic\nobservations of the future while a discriminator model is trained to distinguish\nbetween real images and the model\u2019s prediction of the future. 
This approach overcomes\nthe need to define an explicit loss function for the forward model which is currently\nused for solving such a class of problem. As a consequence, our learning protocol\ndoes not have to rely on an explicit distance such as Euclidean distance which\ntends to produce unsatisfactory predictions. To illustrate our method, empirical\nqualitative and quantitative results are presented on a real driving scenario, along\nwith qualitative results on Atari game Frostbite.", "keywords": "forward model;adversarial learning", "primary_area": "", "supplementary_material": "", "author": "Morgan Funtowicz;Tomi Silander;Arnaud Sors;Julien Perez", "authorids": "morgan.funtowicz@naverlabs.com;tomi.silander@naverlabs.com;arnaud.sors@naverlabs.com;julien.perez@naverlabs.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nfuntowicz2019deep,\ntitle={{DEEP} {ADVERSARIAL} {FORWARD} {MODEL}},\nauthor={Morgan Funtowicz and Tomi Silander and Arnaud Sors and Julien Perez},\nyear={2019},\nurl={https://openreview.net/forum?id=SkGH2oRcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkGH2oRcYX", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;5", "wc_review": "618;178;509", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 435.0, 187.0953411142743 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gG3-Qldi3ssJ:scholar.google.com/&scioq=DEEP+ADVERSARIAL+FORWARD+MODEL&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SkGMOi05FQ", "title": "Generating Text through Adversarial Training using Skip-Thought Vectors", "track": "main", "status": "Withdraw", "tldr": "Generating text using sentence embeddings from Skip-Thought Vectors with the help of Generative Adversarial Networks.", "abstract": "In the past few years, various advancements have been made in generative models owing to the formulation of Generative Adversarial Networks (GANs). GANs have been shown to perform exceedingly well on a wide variety of tasks pertaining to image generation and style transfer. In the field of Natural Language Processing, word embeddings such as word2vec and GLoVe are state-of-the-art methods for applying neural network models on textual data. Attempts have been made for utilizing GANs with word embeddings for text generation. This work presents an approach to text generation using Skip-Thought sentence embeddings in conjunction with GANs based on gradient penalty functions and f-measures. The results of using sentence embeddings with GANs for generating text conditioned on input information are comparable to the approaches where word embeddings are used. 
", "keywords": "Natural Language Generation;Computation and Language;Machine Learning;Generative Adversarial Networks;Sentence Embeddings", "primary_area": "", "supplementary_material": "", "author": "Afroz Ahamad", "authorids": "afrozsahamad@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkGMOi05FQ", "pdf_size": 0, "rating": "2;2;3", "confidence": "5;5;5", "wc_review": "210;143;132", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 161.66666666666666, 34.4705993887867 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1731234001602155571&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SkGNrnC9FQ", "title": "Manifold Alignment via Feature Correspondence", "track": "main", "status": "Reject", "tldr": "We propose a method for aligning the latent features learned from different datasets using harmonic correlations.", "abstract": "We propose a novel framework for combining datasets via alignment of their associated intrinsic dimensions. Our approach assumes that the two datasets are sampled from a common latent space, i.e., they measure equivalent systems. Thus, we expect there to exist a natural (albeit unknown) alignment of the data manifolds associated with the intrinsic geometry of these datasets, which are perturbed by measurement artifacts in the sampling process. Importantly, we do not assume any individual correspondence (partial or complete) between data points. Instead, we rely on our assumption that a subset of data features have correspondence across datasets. We leverage this assumption to estimate relations between intrinsic manifold dimensions, which are given by diffusion map coordinates over each of the datasets. We compute a correlation matrix between diffusion coordinates of the datasets by considering graph (or manifold) Fourier coefficients of corresponding data features. We then orthogonalize this correlation matrix to form an isometric transformation between the diffusion maps of the datasets. Finally, we apply this transformation to the diffusion coordinates and construct a unified diffusion geometry of the datasets together. We show that this approach successfully corrects misalignment artifacts, and allows for integrated data.", "keywords": "graph signal processing;graph alignment;manifold alignment;spectral graph wavelet transform;diffusion geometry;harmonic analysis", "primary_area": "", "supplementary_material": "", "author": "Jay S. Stanley III;Guy Wolf;Smita Krishnaswamy", "authorids": "jay.stanley@yale.edu;guy.wolf@yale.edu;smita.krishnaswamy@yale.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\niii2019manifold,\ntitle={Manifold Alignment via Feature Correspondence},\nauthor={Jay S. 
Stanley III and Guy Wolf and Smita Krishnaswamy},\nyear={2019},\nurl={https://openreview.net/forum?id=SkGNrnC9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkGNrnC9FQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;4", "wc_review": "442;427;432", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 433.6666666666667, 6.236095644623236 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WB1r5fSrAOkJ:scholar.google.com/&scioq=Manifold+Alignment+via+Feature+Correspondence&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkGQujR5FX", "title": "DANA: Scalable Out-of-the-box Distributed ASGD Without Retuning", "track": "main", "status": "Reject", "tldr": "A new distributed asynchronous SGD algorithm that achieves state-of-the-art accuracy on existing architectures without any additional tuning or overhead.", "abstract": "Distributed computing can significantly reduce the training time of neural networks. Despite its potential, however, distributed training has not been widely adopted: scaling the training process is difficult, and existing SGD methods require substantial tuning of hyperparameters and learning schedules to achieve sufficient accuracy when increasing the number of workers. In practice, such tuning can be prohibitively expensive given the huge number of potential hyperparameter configurations and the effort required to test each one.\n \nWe propose DANA, a novel approach that scales out-of-the-box to large clusters using the same hyperparameters and learning schedule optimized for training on a single worker, while maintaining similar final accuracy without additional overhead. DANA estimates the future value of model parameters by adapting Nesterov Accelerated Gradient to a distributed setting, and so mitigates the effect of gradient staleness, one of the main difficulties in scaling SGD to more workers.\n\nEvaluation on three state-of-the-art network architectures and three datasets shows that DANA scales as well as or better than existing work without having to tune any hyperparameters or tweak the learning schedule. 
For example, DANA achieves 75.73% accuracy on ImageNet when training ResNet-50 with 16 workers, similar to the non-distributed baseline.", "keywords": "distributed;asynchronous;gradient staleness;nesterov;optimization;out-of-the-box;stochastic gradient descent;sgd;imagenet;distributed training;neural networks;deep learning", "primary_area": "", "supplementary_material": "", "author": "Ido Hakimi;Saar Barkai;Moshe Gabel;Assaf Schuster", "authorids": "idohakimi@gmail.com;saarbarkai@gmail.com;mgabel@cs.toronto.edu;assaf@cs.technion.ac.il", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhakimi2019dana,\ntitle={{DANA}: Scalable Out-of-the-box Distributed {ASGD} Without Retuning},\nauthor={Ido Hakimi and Saar Barkai and Moshe Gabel and Assaf Schuster},\nyear={2019},\nurl={https://openreview.net/forum?id=SkGQujR5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkGQujR5FX", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;4", "wc_review": "419;326;278", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "317;287;138", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 341.0, 58.532042506647585 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 247.33333333333334, 78.27444595069893 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:4akhVXwZqjoJ:scholar.google.com/&scioq=DANA:+Scalable+Out-of-the-box+Distributed+ASGD+Without+Retuning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkGT6sRcFX", "title": "Infinitely Deep Infinite-Width Networks", "track": "main", "status": "Reject", "tldr": "We propose a method for the construction of arbitrarily deep infinite-width networks, based on which we derive a novel weight initialisation scheme for finite-width networks and demonstrate its competitive performance.", "abstract": "Infinite-width neural networks have been extensively used to study the theoretical properties underlying the extraordinary empirical success of standard, finite-width neural networks. Nevertheless, until now, infinite-width networks have been limited to at most two hidden layers. To address this shortcoming, we study the initialisation requirements of these networks and show that the main challenge for constructing them is defining the appropriate sampling distributions for the weights. Based on these observations, we propose a principled approach to weight initialisation that correctly accounts for the functional nature of the hidden layer activations and facilitates the construction of arbitrarily many infinite-width layers, thus enabling the construction of arbitrarily deep infinite-width networks. The main idea of our approach is to iteratively reparametrise the hidden-layer activations into appropriately defined reproducing kernel Hilbert spaces and use the canonical way of constructing probability distributions over these spaces for specifying the required weight distributions in a principled way. 
Furthermore, we examine the practical implications of this construction for standard, finite-width networks. In particular, we derive a novel weight initialisation scheme for standard, finite-width networks that takes into account the structure of the data and information about the task at hand. We demonstrate the effectiveness of this weight initialisation approach on the MNIST, CIFAR-10 and Year Prediction MSD datasets.", "keywords": "Infinite-width networks;initialisation;kernel methods;reproducing kernel Hilbert spaces;Gaussian processes", "primary_area": "", "supplementary_material": "", "author": "Jovana Mitrovic;Peter Wirnsberger;Charles Blundell;Dino Sejdinovic;Yee Whye Teh", "authorids": "jovana.mitrovic@spc.ox.ac.uk;pewi@google.com;cblundell@google.com;dino.sejdinovic@stats.ox.ac.uk;ywteh@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmitrovic2019infinitely,\ntitle={Infinitely Deep Infinite-Width Networks},\nauthor={Jovana Mitrovic and Peter Wirnsberger and Charles Blundell and Dino Sejdinovic and Yee Whye Teh},\nyear={2019},\nurl={https://openreview.net/forum?id=SkGT6sRcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkGT6sRcFX", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;2", "wc_review": "235;258;244", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "697;1083;666", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 245.66666666666666, 9.46337971105226 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 815.3333333333334, 189.6915625136998 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SkGpW3C5KX", "title": "Heated-Up Softmax Embedding", "track": "main", "status": "Reject", "tldr": "", "abstract": "Metric learning aims at learning a distance which is consistent with the semantic meaning of the samples. The problem is generally solved by learning an embedding, such that the samples of the same category are close (compact) while samples from different categories are far away (spread-out) in the embedding space. One popular way of generating such embeddings is to use the second-to-last layer of a deep neural network trained as a classifier with the softmax cross-entropy loss. In this paper, we show that training classifiers with different temperatures of the softmax function lead to different distributions of the embedding space. And finding a balance between the compactness, 'spread-out' and the generalization ability of the feature is critical in metric learning. Leveraging these insights, we propose a 'heating-up' strategy to train a classifier with increasing temperatures. Extensive experiments show that the proposed method achieves state-of-the-art embeddings on a variety of metric learning benchmarks. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xu Zhang;Felix Xinnan Yu;Svebor Karaman;Wei Zhang;Shih-Fu Chang", "authorids": "xu.zhang@columbia.edu;felixyu@google.com;svebor.karaman@gmail.com;wz2363@columbia.edu;sc250@columbia.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhang2019heatedup,\ntitle={Heated-Up Softmax Embedding},\nauthor={Xu Zhang and Felix Xinnan Yu and Svebor Karaman and Wei Zhang and Shih-Fu Chang},\nyear={2019},\nurl={https://openreview.net/forum?id=SkGpW3C5KX},\n}", "github": "[![github](/images/github_icon.svg) ColumbiaDVMM/Heated_Up_Softmax_Embedding](https://github.com/ColumbiaDVMM/Heated_Up_Softmax_Embedding)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkGpW3C5KX", "pdf_size": 0, "rating": "3;5;8", "confidence": "5;4;4", "wc_review": "294;814;183", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "329;617;406", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 2.0548046676563256 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 430.3333333333333, 275.05191429174886 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 450.6666666666667, 121.7438111591532 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8029550685469661, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11271259545833421367&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SkGtjjR5t7", "title": "Learning to Drive by Observing the Best and Synthesizing the Worst", "track": "main", "status": "Reject", "tldr": "This work explores how far we can take (supervised) imitation learning for the task of driving a car.", "abstract": "Our goal is to train a policy for autonomous driving via imitation learning that is robust enough to drive a real vehicle. We find that standard behavior cloning is insufficient for handling complex driving scenarios, even when we leverage a perception system for preprocessing the input and a controller for executing the output on the car: 30 million examples are still not enough. We propose exposing the learner to synthesized data in the form of perturbations to the expert's driving, which creates interesting situations such as collisions and/or going off the road. Rather than purely imitating all data, we augment the imitation loss with additional losses that penalize undesirable events and encourage progress -- the perturbations then provide an important signal for these losses and lead to robustness of the learned model. We show that the model can handle complex situations in simulation, and present ablation experiments that emphasize the importance of each of our proposed changes and show that the model is responding to the appropriate causal factors. 
Finally, we demonstrate the model driving a car in the real world ( https://sites.google.com/view/learn-to-drive ).", "keywords": "Imitation Learning;End-to-End Driving;Learning to drive;Autonomous Driving", "primary_area": "", "supplementary_material": "", "author": "Mayank Bansal;Alex Krizhevsky;Abhijit Ogale", "authorids": "mayban@waymo.com;akrizhevsky@gmail.com;ogale@waymo.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbansal2019learning,\ntitle={Learning to Drive by Observing the Best and Synthesizing the Worst},\nauthor={Mayank Bansal and Alex Krizhevsky and Abhijit Ogale},\nyear={2019},\nurl={https://openreview.net/forum?id=SkGtjjR5t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkGtjjR5t7", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "wc_review": "489;424;402", "wc_reply_reviewers": "0;0;97", "wc_reply_authors": "1246;470;1053", "reply_reviewers": "0;0;1", "reply_authors": "2;1;2", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 438.3333333333333, 36.935379004718804 ], "wc_reply_reviewers_avg": [ 32.333333333333336, 45.72623851673007 ], "wc_reply_authors_avg": [ 923.0, 329.86765022758243 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:deYrOipxQQ8J:scholar.google.com/&scioq=Learning+to+Drive+by+Observing+the+Best+and+Synthesizing+the+Worst&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Spreading vectors for similarity search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1005", "id": "SkGuG2R5tm", "author_site": "Alexandre Sablayrolles, Matthijs Douze, Cordelia Schmid, Herv\u00e9 J\u00e9gou", "tldr": "We learn a neural network that uniformizes the input distribution, which leads to competitive indexing performance in high-dimensional space", "abstract": "Discretizing floating-point vectors is a fundamental step of modern indexing methods. State-of-the-art techniques learn parameters of the quantizers on training data for optimal performance, thus adapting quantizers to the data. In this work, we propose to reverse this paradigm and adapt the data to the quantizer: we train a neural net whose last layers form a fixed parameter-free quantizer, such as pre-defined points of a sphere. As a proxy objective, we design and train a neural network that favors uniformity in the spherical latent space, while preserving the neighborhood structure after the mapping. For this purpose, we propose a new regularizer derived from the Kozachenko-Leonenko differential entropy estimator and combine it with a locality-aware triplet loss. \nExperiments show that our end-to-end approach outperforms most learned quantization methods, and is competitive with the state of the art on widely adopted benchmarks. 
Furthermore, we show that training without the quantization step results in almost no difference in accuracy, but yields a generic catalyser that can be applied with any subsequent quantization technique.\n", "keywords": "dimensionality reduction;similarity search;indexing;differential entropy", "primary_area": "", "supplementary_material": "", "author": "Alexandre Sablayrolles;Matthijs Douze;Cordelia Schmid;Herv\u00e9 J\u00e9gou", "authorids": "asablayrolles@fb.com;matthijs@fb.com;cordelia.schmid@inria.fr;rvj@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsablayrolles2018spreading,\ntitle={Spreading vectors for similarity search},\nauthor={Alexandre Sablayrolles and Matthijs Douze and Cordelia Schmid and Herv\u00e9 J\u00e9gou},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkGuG2R5tm},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/spreadingvectors](https://github.com/facebookresearch/spreadingvectors) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SkGuG2R5tm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;4", "wc_review": "124;171;280", "wc_reply_reviewers": "0;0;151", "wc_reply_authors": "394;401;733", "reply_reviewers": "0;0;2", "reply_authors": "1;1;3", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 191.66666666666666, 65.34183618138961 ], "wc_reply_reviewers_avg": [ 50.333333333333336, 71.18208263944578 ], "wc_reply_authors_avg": [ 509.3333333333333, 158.18203297326647 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 139, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7912762574684423820&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SkGuG2R5tm", "pdf": "https://openreview.net/pdf?id=SkGuG2R5tm", "email": ";;;", "author_num": 4 }, { "id": "SkMON20ctX", "title": "On the Trajectory of Stochastic Gradient Descent in the Information Plane", "track": "main", "status": "Reject", "tldr": "We look at SGD as a trajectory in the space of probability measures, show its connection to Markov processes, propose a simple Markov model of SGD learning, and experimentally compare it with SGD using information theoretic quantities. ", "abstract": "Studying the evolution of information theoretic quantities during Stochastic Gradient Descent (SGD) learning of Artificial Neural Networks (ANNs) has gained popularity in recent years. \nNevertheless, these types of experiments require estimating mutual information and entropy which becomes intractable for moderately large problems. In this work we propose a framework for understanding SGD learning in the information plane which consists of observing entropy and conditional entropy of the output labels of ANN. Through experimental results and theoretical justifications it is shown that, under some assumptions, the SGD learning trajectories appear to be similar for different ANN architectures. 
First, the SGD learning is modeled as a Hidden Markov Process (HMP) whose entropy tends to increase to the maximum. Then, it is shown that the SGD learning trajectory appears to move close to the shortest path between the initial and final joint distributions in the space of probability measures equipped with the total variation metric. Furthermore, it is shown that the trajectory of learning in the information plane can provide an alternative for observing the learning process, with potentially richer information about the learning than the trajectories in training and test error. ", "keywords": "Stochastic gradient descent;Deep neural networks;Entropy;Information theory;Markov chains;Hidden Markov process.", "primary_area": "", "supplementary_material": "", "author": "Emilio Rafael Balda;Arash Behboodi;Rudolf Mathar", "authorids": "emilio.balda@ti.rwth-aachen.de;arash.behboodi@ti.rwth-aachen.de;mathar@ti.rwth-aachen.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbalda2019on,\ntitle={On the Trajectory of Stochastic Gradient Descent in the Information Plane},\nauthor={Emilio Rafael Balda and Arash Behboodi and Rudolf Mathar},\nyear={2019},\nurl={https://openreview.net/forum?id=SkMON20ctX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkMON20ctX", "pdf_size": 0, "rating": "2;4;6", "confidence": "4;3;4", "wc_review": "352;236;437", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "407;96;201", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 1.632993161855452 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 341.6666666666667, 82.3825763671475 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 234.66666666666666, 129.17774146070556 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10248195108951322936&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SkMPNoCcKQ", "title": "Linearizing Visual Processes with Deep Generative Models", "track": "main", "status": "Withdraw", "tldr": "We model non-linear visual processes as autoregressive noise via generative deep learning.", "abstract": "This work studies the problem of modeling non-linear visual processes by leveraging deep generative architectures for learning linear, Gaussian models of observed sequences. We propose a joint learning framework, combining a multivariate autoregressive model and deep convolutional generative networks. After justification of theoretical assumptions of linearization, we propose an architecture that allows Variational Autoencoders and Generative Adversarial Networks to simultaneously learn the non-linear observation as well as the linear state-transition model from a sequence of observed frames. 
Finally, we demonstrate our approach on conceptual toy examples and dynamic textures.", "keywords": "Genearative Adversarial Network;Variational Autoencoder;Wasserstein GAN;Autoregressive Model;Dynamic Texture;Video", "primary_area": "", "supplementary_material": "", "author": "Alexander Sagel;Hao Shen", "authorids": "a.sagel@tum.de;shen@fortiss.org", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkMPNoCcKQ", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;3", "wc_review": "894;432;179", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "440;389;328", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 501.6666666666667, 296.025149081778 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 385.6666666666667, 45.784519460427035 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pD4iVsmA6bEJ:scholar.google.com/&scioq=Linearizing+Visual+Processes+with+Deep+Generative+Models&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "A Convergence Analysis of Gradient Descent for Deep Linear Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/789", "id": "SkMQg3C5K7", "author_site": "Sanjeev Arora, Nadav Cohen, Noah Golowich, Wei Hu", "tldr": "We analyze gradient descent for deep linear neural networks, providing a guarantee of convergence to global optimum at a linear rate.", "abstract": "We analyze speed of convergence to global optimum for gradient descent training a deep linear neural network by minimizing the L2 loss over whitened data. Convergence at a linear rate is guaranteed when the following hold: (i) dimensions of hidden layers are at least the minimum of the input and output dimensions; (ii) weight matrices at initialization are approximately balanced; and (iii) the initial loss is smaller than the loss of any rank-deficient solution. The assumptions on initialization (conditions (ii) and (iii)) are necessary, in the sense that violating any one of them may lead to convergence failure. Moreover, in the important case of output dimension 1, i.e. scalar regression, they are met, and thus convergence to global optimum holds, with constant probability under a random initialization scheme. 
Our results significantly extend previous analyses, e.g., of deep linear residual networks (Bartlett et al., 2018).", "keywords": "Deep Learning;Learning Theory;Non-Convex Optimization", "primary_area": "", "supplementary_material": "", "author": "Sanjeev Arora;Nadav Cohen;Noah Golowich;Wei Hu", "authorids": "arora@cs.princeton.edu;cohennadav@ias.edu;ngolowich@college.harvard.edu;huwei@cs.princeton.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\narora2018a,\ntitle={A Convergence Analysis of Gradient Descent for Deep Linear Neural Networks},\nauthor={Sanjeev Arora and Nadav Cohen and Noah Golowich and Wei Hu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkMQg3C5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;5;4", "wc_review": "498;781;320", "wc_reply_reviewers": "110;0;0", "wc_reply_authors": "588;608;260", "reply_reviewers": "2;0;0", "reply_authors": "5;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 533.0, 189.8227243157854 ], "wc_reply_reviewers_avg": [ 36.666666666666664, 51.85449728701349 ], "wc_reply_authors_avg": [ 485.3333333333333, 159.5437940573754 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 322, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9015925541758289839&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SkMQg3C5K7", "pdf": "https://openreview.net/pdf?id=SkMQg3C5K7", "email": ";;;", "author_num": 4 }, { "title": "Feed-forward Propagation in Probabilistic Neural Networks with Categorical and Max Layers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1122", "id": "SkMuPjRcKQ", "author_site": "Alexander (Oleksandr) Shekhovtsov, Boris Flach", "tldr": "Approximating mean and variance of the NN output over noisy input / dropout / uncertain parameters. Analytic approximations for argmax, softmax and max layers.", "abstract": "Probabilistic Neural Networks deal with various sources of stochasticity: input noise, dropout, stochastic neurons, parameter uncertainties modeled as random variables, etc.\nIn this paper we revisit a feed-forward propagation approach that allows one to estimate for each neuron its mean and variance w.r.t. all mentioned sources of stochasticity. In contrast, standard NNs propagate only point estimates, discarding the uncertainty.\nMethods propagating also the variance have been proposed by several authors in different context. The view presented here attempts to clarify the assumptions and derivation behind such methods, relate them to classical NNs and broaden their scope of applicability.\nThe main technical contributions are new approximations for the distributions of argmax and max-related transforms, which allow for fully analytic uncertainty propagation in networks with softmax and max-pooling layers as well as leaky ReLU activations.\nWe evaluate the accuracy of the approximation and suggest a simple calibration. 
Applying the method to networks with dropout allows for faster training and gives improved test likelihoods without the need of sampling.", "keywords": "probabilistic neural network;uncertainty;dropout;bayesian;softmax;argmax;logsumexp", "primary_area": "", "supplementary_material": "", "author": "Alexander Shekhovtsov;Boris Flach", "authorids": "shekhovtsov@gmail.com;bflach@inf.tu-dresden.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nshekhovtsov2018feedforward,\ntitle={Feed-forward Propagation in Probabilistic Neural Networks with Categorical and Max Layers},\nauthor={Alexander Shekhovtsov and Boris Flach},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkMuPjRcKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;5;3", "wc_review": "120;446;450", "wc_reply_reviewers": "33;125;0", "wc_reply_authors": "389;787;447", "reply_reviewers": "1;1;0", "reply_authors": "2;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 338.6666666666667, 154.62930583243985 ], "wc_reply_reviewers_avg": [ 52.666666666666664, 52.891923348991156 ], "wc_reply_authors_avg": [ 541.0, 175.5524612948126 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1018527712043791327&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=SkMuPjRcKQ", "pdf": "https://openreview.net/pdf?id=SkMuPjRcKQ", "email": ";", "author_num": 2 }, { "title": "Measuring and regularizing networks in function space", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/837", "id": "SkMwpiR9Y7", "author_site": "Ari S Benjamin, David Rolnick, Konrad P Kording", "tldr": "We find movement in function space is not proportional to movement in parameter space during optimization. We propose a new natural-gradient style optimizer to address this.", "abstract": "To optimize a neural network one often thinks of optimizing its parameters, but it is ultimately a matter of optimizing the function that maps inputs to outputs. Since a change in the parameters might serve as a poor proxy for the change in the function, it is of some concern that primacy is given to parameters but that the correspondence has not been tested. Here, we show that it is simple and computationally feasible to calculate distances between functions in a $L^2$ Hilbert space. We examine how typical networks behave in this space, and compare how parameter $\\ell^2$ distances compare to function $L^2$ distances between various points of an optimization trajectory. We find that the two distances are nontrivially related. In particular, the $L^2/\\ell^2$ ratio decreases throughout optimization, reaching a steady value around when test error plateaus. We then investigate how the $L^2$ distance could be applied directly to optimization. We first propose that in multitask learning, one can avoid catastrophic forgetting by directly limiting how much the input/output function changes between tasks. 
Secondly, we propose a new learning rule that constrains the distance a network can travel through $L^2$-space in any one update. This allows new examples to be learned in a way that minimally interferes with what has previously been learned. These applications demonstrate how one can measure and regularize function distances directly, without relying on parameters or local approximations like loss curvature.", "keywords": "function space;Hilbert space;empirical characterization;multitask learning;catastrophic forgetting;optimization;natural gradient", "primary_area": "", "supplementary_material": "", "author": "Ari Benjamin;David Rolnick;Konrad Kording", "authorids": "aarrii@seas.upenn.edu;drolnick@mit.edu;koerding@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbenjamin2018measuring,\ntitle={Measuring and regularizing networks in function space},\nauthor={Ari Benjamin and David Rolnick and Konrad Kording},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkMwpiR9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;3", "wc_review": "666;637;570", "wc_reply_reviewers": "295;288;0", "wc_reply_authors": "813;786;655", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 624.3333333333334, 40.20226638166339 ], "wc_reply_reviewers_avg": [ 194.33333333333334, 137.4441300634148 ], "wc_reply_authors_avg": [ 751.3333333333334, 69.00402564746173 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 179, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15363433308760579134&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SkMwpiR9Y7", "pdf": "https://openreview.net/pdf?id=SkMwpiR9Y7", "email": ";;", "author_num": 3 }, { "id": "SkMx_iC9K7", "title": "DelibGAN: Coarse-to-Fine Text Generation via Adversarial Network", "track": "main", "status": "Reject", "tldr": "A novel adversarial learning framework, namely DelibGAN, is proposed for generating high-quality sentences without supervision.", "abstract": "In this paper, we propose a novel adversarial learning framework, namely DelibGAN, for generating high-quality sentences without supervision. Our framework consists of a coarse-to-fine generator, which contains a first-pass decoder and a second-pass decoder, and a multiple instance discriminator. And we propose two training mechanisms DelibGAN-I and DelibGAN-II. The discriminator is used to fine-tune the second-pass decoder in DelibGAN-I and further evaluate the importance of each word and tune the first-pass decoder in DelibGAN-II. We compare our models with several typical and state-of-the-art unsupervised generic text generation models on three datasets (a synthetic dataset, a descriptive text dataset and a sentimental text dataset). 
Both qualitative and quantitative experimental results show that our models produce more realistic samples, and DelibGAN-II performs best.", "keywords": "unsupervised text generation;coarse-to-fine generator;multiple instance discriminator;GAN;DelibGAN", "primary_area": "", "supplementary_material": "", "author": "Ke Wang;Xiaojun Wan", "authorids": "wangke17@pku.edu.cn;wanxiaojun@pku.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nwang2019delibgan,\ntitle={Delib{GAN}: Coarse-to-Fine Text Generation via Adversarial Network},\nauthor={Ke Wang and Xiaojun Wan},\nyear={2019},\nurl={https://openreview.net/forum?id=SkMx_iC9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkMx_iC9K7", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "wc_review": "668;709;188", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 521.6666666666666, 236.53094700036377 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2T0ffZaUL6oJ:scholar.google.com/&scioq=DelibGAN:+Coarse-to-Fine+Text+Generation+via+Adversarial+Network&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SkNSOjR9Y7", "title": "Training Variational Auto Encoders with Discrete Latent Representations using Importance Sampling", "track": "main", "status": "Reject", "tldr": "We propose an easy method to train Variational Auto Encoders (VAE) with discrete latent representations, using importance sampling", "abstract": "The Variational Auto Encoder (VAE) is a popular generative \nlatent variable model that is often \napplied for representation learning.\nStandard VAEs assume continuous valued \nlatent variables and are trained by maximization\nof the evidence lower bound (ELBO). Conventional methods obtain a \ndifferentiable estimate of the ELBO with reparametrized sampling and\noptimize it with Stochastic Gradient Descend (SGD). However, this is not possible if \nwe want to train VAEs with discrete valued latent variables, \nsince reparametrized sampling is not possible. Till now, there\nexist no simple solutions to circumvent this problem.\nIn this paper, we propose an easy method to train VAEs \nwith binary or categorically valued latent representations. Therefore, we use a differentiable\nestimator for the ELBO which is based on importance sampling. 
In experiments, we verify the approach and\ntrain two different VAEs architectures with Bernoulli and \nCategorically distributed latent representations on two different benchmark\ndatasets.\t", "keywords": "Variational Auto Encoder;Importance Sampling;Discrete latent representation", "primary_area": "", "supplementary_material": "", "author": "Alexander Bartler;Felix Wiewel;Bin Yang;Lukas Mauch", "authorids": "alexander.bartler@iss.uni-stuttgart.de;felix.wiewel@iss.uni-stuttgart.de;bin.yang@iss.uni-stuttgart.de;lukas.mauch@iss.uni-stuttgart.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbartler2019training,\ntitle={Training Variational Auto Encoders with Discrete Latent Representations using Importance Sampling},\nauthor={Alexander Bartler and Felix Wiewel and Bin Yang and Lukas Mauch},\nyear={2019},\nurl={https://openreview.net/forum?id=SkNSOjR9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkNSOjR9Y7", "pdf_size": 0, "rating": "1;3;3", "confidence": "5;5;5", "wc_review": "62;352;601", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 338.3333333333333, 220.25792960880105 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:MF0oK0ByQ64J:scholar.google.com/&scioq=Training+Variational+Auto+Encoders+with+Discrete+Latent+Representations+using+Importance+Sampling&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkNSehA9FQ", "title": "Open Vocabulary Learning on Source Code with a Graph-Structured Cache", "track": "main", "status": "Reject", "tldr": "We show that caching out-of-vocabulary words in a graph, with edges connecting them to their usages, and processing it with a graph neural network improves performance on supervised learning tasks on computer source code.", "abstract": "Machine learning models that take computer program source code as input typically use Natural Language Processing (NLP) techniques. However, a major challenge is that code is written using an open, rapidly changing vocabulary due to, e.g., the coinage of new variable and method names. Reasoning over such a vocabulary is not something for which most NLP methods are designed. We introduce a Graph-Structured Cache to address this problem; this cache contains a node for each new word the model encounters with edges connecting each word to its occurrences in the code. 
We find that combining this graph-structured cache strategy with recent Graph-Neural-Network-based models for supervised learning on code improves the models' performance on a code completion task and a variable naming task --- with over 100\\% relative improvement on the latter --- at the cost of a moderate increase in computation time.", "keywords": "deep learning;graph neural network;open vocabulary;natural language processing;source code;abstract syntax tree;code completion;variable naming", "primary_area": "", "supplementary_material": "", "author": "Milan Cvitkovic;Badal Singh;Anima Anandkumar", "authorids": "mcvitkov@caltech.edu;sbadal@amazon.com;anima@caltech.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncvitkovic2019open,\ntitle={Open Vocabulary Learning on Source Code with a Graph-Structured Cache},\nauthor={Milan Cvitkovic and Badal Singh and Anima Anandkumar},\nyear={2019},\nurl={https://openreview.net/forum?id=SkNSehA9FQ},\n}", "github": "[![github](/images/github_icon.svg) mwcvitkovic/Deep_Learning_On_Code_With_A_Graph_Vocabulary--Code_Preprocessor](https://github.com/mwcvitkovic/Deep_Learning_On_Code_With_A_Graph_Vocabulary--Code_Preprocessor) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SkNSehA9FQ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkNSehA9FQ", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;5", "wc_review": "112;550;440", "wc_reply_reviewers": "210;336;0", "wc_reply_authors": "467;792;341", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 367.3333333333333, 186.04897801982742 ], "wc_reply_reviewers_avg": [ 182.0, 138.59292911256333 ], "wc_reply_authors_avg": [ 533.3333333333334, 190.00058479442166 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1145489630896909786&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 14 }, { "title": "Fluctuation-dissipation relations for stochastic gradient descent", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/784", "id": "SkNksoRctQ", "tldr": "We prove fluctuation-dissipation relations for SGD, which can be used to (i) adaptively set learning rates and (ii) probe loss surfaces.", "abstract": "The notion of the stationary equilibrium ensemble has played a central role in statistical mechanics. In machine learning as well, training serves as generalized equilibration that drives the probability distribution of model parameters toward stationarity. Here, we derive stationary fluctuation-dissipation relations that link measurable quantities and hyperparameters in the stochastic gradient descent algorithm. These relations hold exactly for any stationary state and can in particular be used to adaptively set training schedule. We can further use the relations to efficiently extract information pertaining to a loss-function landscape such as the magnitudes of its Hessian and anharmonicity. 
Our claims are empirically verified.", "keywords": "stochastic gradient descent;adaptive method;loss surface;Hessian", "primary_area": "", "supplementary_material": "", "author": "Sho Yaida", "authorids": "shoyaida@fb.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nyaida2018fluctuationdissipation,\ntitle={Fluctuation-dissipation relations for stochastic gradient descent},\nauthor={Sho Yaida},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkNksoRctQ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SkNksoRctQ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;3;5", "wc_review": "200;184;256", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "604;478;216", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 213.33333333333334, 30.868898407440604 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 432.6666666666667, 161.61133073588073 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.6546536707079772, "gs_citation": 89, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6542696282294112644&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SkNksoRctQ", "pdf": "https://openreview.net/pdf?id=SkNksoRctQ", "email": "", "author_num": 1 }, { "id": "SkVRTj0cYQ", "title": "Differentially Private Federated Learning: A Client Level Perspective", "track": "main", "status": "Reject", "tldr": "Ensuring that models learned in federated fashion do not reveal a client's participation.", "abstract": "Federated learning is a recent advance in privacy protection. \nIn this context, a trusted curator aggregates parameters optimized in decentralized fashion by multiple clients. The resulting model is then distributed back to all clients, ultimately converging to a joint representative model without explicitly having to share the data. \nHowever, the protocol is vulnerable to differential attacks, which could originate from any party contributing during federated optimization. In such an attack, a client's contribution during training and information about their data set is revealed through analyzing the distributed model. \nWe tackle this problem and propose an algorithm for client sided differential privacy preserving federated optimization. The aim is to hide clients' contributions during training, balancing the trade-off between privacy loss and model performance. \nEmpirical studies suggest that given a sufficiently large number of participating clients, our proposed procedure can maintain client-level differential privacy at only a minor cost in model performance. ", "keywords": "Machine Learning;Federated Learning;Privacy;Security;Differential Privacy", "primary_area": "", "supplementary_material": "", "author": "Robin C. Geyer;Tassilo J. 
Klein;Moin Nabi", "authorids": "geyerr@ethz.ch;tassilo.klein@sap.com;moin.nabi@sap.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngeyer2019differentially,\ntitle={Differentially Private Federated Learning: A Client Level Perspective},\nauthor={Robin C. Geyer and Tassilo J. Klein and Moin Nabi},\nyear={2019},\nurl={https://openreview.net/forum?id=SkVRTj0cYQ},\n}", "github": "[![github](/images/github_icon.svg) cyrusgeyer/DiffPrivate_FedLearning](https://github.com/cyrusgeyer/DiffPrivate_FedLearning) + [![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=SkVRTj0cYQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkVRTj0cYQ", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;3;4", "wc_review": "1026;728;218", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 657.3333333333334, 333.6278698723408 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1833, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5585879427699661342&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SkVe3iA9Ym", "title": "Beyond Winning and Losing: Modeling Human Motivations and Behaviors with Vector-valued Inverse Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "In recent years, reinforcement learning methods have been applied to model gameplay with great success, achieving super-human performance in various environments, such as Atari, Go and Poker.\nHowever, those studies mostly focus on winning the game and have largely ignored the rich and complex human motivations, which are essential for understanding the agents' diverse behavior.\nIn this paper, we present a multi-motivation behavior modeling which investigates the multifaceted human motivations and models the underlying value structure of the agents.\nOur approach extends inverse RL to the vectored-valued setting which imposes a much weaker assumption than previous studies.\nThe vectorized rewards incorporate Pareto optimality, which is a powerful tool to explain a wide range of behavior by its optimality.\nFor practical assessment, our algorithm is tested on the World of Warcraft Avatar History dataset spanning three years of the gameplay.\nOur experiments demonstrate the improvement over the scalarization-based methods on real-world problem settings.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Baoxiang Wang;Tongfang Sun;Xianjun Sam Zheng", "authorids": "wangbx66@gmail.com;tongfs@uw.edu;sam.zheng@deephow.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2019beyond,\ntitle={Beyond Winning and Losing: Modeling Human Motivations and Behaviors with Vector-valued Inverse Reinforcement Learning},\nauthor={Baoxiang Wang and Tongfang Sun and Xianjun Sam 
Zheng},\nyear={2019},\nurl={https://openreview.net/forum?id=SkVe3iA9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkVe3iA9Ym", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "wc_review": "737;237;237", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "335;186;151", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 403.6666666666667, 235.70226039551585 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 224.0, 79.77886102638134 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15643561650078060212&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "Pay Less Attention with Lightweight and Dynamic Convolutions", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1043", "id": "SkVhlh09tX", "author_site": "Felix Wu, Angela Fan, Alexei Baevski, Yann Dauphin, Michael Auli", "tldr": "Dynamic lightweight convolutions are competitive to self-attention on language tasks.", "abstract": "Self-attention is a useful mechanism to build generative models for language and images. It determines the importance of context elements by comparing each element to the current time step. In this paper, we show that a very lightweight convolution can perform competitively to the best reported self-attention results. Next, we introduce dynamic convolutions which are simpler and more efficient than self-attention. We predict separate convolution kernels based solely on the current time-step in order to determine the importance of context elements. The number of operations required by this approach scales linearly in the input length, whereas self-attention is quadratic. Experiments on large-scale machine translation, language modeling and abstractive summarization show that dynamic convolutions improve over strong self-attention models. 
On the WMT'14 English-German test set dynamic convolutions achieve a new state of the art of 29.7 BLEU.", "keywords": "Deep learning;sequence to sequence learning;convolutional neural networks;generative models", "primary_area": "", "supplementary_material": "", "author": "Felix Wu;Angela Fan;Alexei Baevski;Yann Dauphin;Michael Auli", "authorids": "fw245@cornell.edu;angelfan@fb.com;alexei.b@gmail.com;yann@dauphin.io;michael.auli@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nwu2018pay,\ntitle={Pay Less Attention with Lightweight and Dynamic Convolutions},\nauthor={Felix Wu and Angela Fan and Alexei Baevski and Yann Dauphin and Michael Auli},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkVhlh09tX},\n}", "github": "[![github](/images/github_icon.svg) pytorch/fairseq](https://github.com/pytorch/fairseq) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=SkVhlh09tX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "8;8;8", "confidence": "4;4;4", "wc_review": "324;233;538", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "51;266;331", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 8.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 365.0, 127.84626184080106 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 216.0, 119.65227397198377 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 734, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3358231780148394025&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SkVhlh09tX", "pdf": "https://openreview.net/pdf?id=SkVhlh09tX", "email": ";;;;", "author_num": 5 }, { "id": "Ske1-209Y7", "title": "Probabilistic Model-Based Dynamic Architecture Search", "track": "main", "status": "Reject", "tldr": "We present an efficient neural network architecture search method based on stochastic natural gradient method via probabilistic modeling.", "abstract": "The architecture search methods for convolutional neural networks (CNNs) have shown promising results. These methods require significant computational resources, as they repeat the neural network training many times to evaluate and search the architectures. Developing the computationally efficient architecture search method is an important research topic. In this paper, we assume that the structure parameters of CNNs are categorical variables, such as types and connectivities of layers, and they are regarded as the learnable parameters. Introducing the multivariate categorical distribution as the underlying distribution for the structure parameters, we formulate a differentiable loss for the training task, where the training of the weights and the optimization of the parameters of the distribution for the structure parameters are coupled. They are trained using the stochastic gradient descent, leading to the optimization of the structure parameters within a single training. We apply the proposed method to search the architecture for two computer vision tasks: image classification and inpainting. 
The experimental results show that the proposed architecture search method is fast and can achieve comparable performance to the existing methods.", "keywords": "architecture search;stochastic natural gradient;convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Nozomu Yoshinari;Kento Uchida;Shota Saito;Shinichi Shirakawa;Youhei Akimoto", "authorids": "yoshinari-nozomu-ry@ynu.jp;uchida-kento-nc@ynu.jp;saito-shota-bt@ynu.jp;shirakawa-shinichi-bg@ynu.ac.jp;akimoto@cs.tsukuba.ac.jp", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyoshinari2019probabilistic,\ntitle={Probabilistic Model-Based Dynamic Architecture Search},\nauthor={Nozomu Yoshinari and Kento Uchida and Shota Saito and Shinichi Shirakawa and Youhei Akimoto},\nyear={2019},\nurl={https://openreview.net/forum?id=Ske1-209Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Ske1-209Y7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "wc_review": "179;347;341", "wc_reply_reviewers": "113;0;0", "wc_reply_authors": "273;760;411", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 289.0, 77.82030583337487 ], "wc_reply_reviewers_avg": [ 37.666666666666664, 53.268710849386586 ], "wc_reply_authors_avg": [ 481.3333333333333, 204.94281045100254 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3283988860191714264&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Ske25sC9FQ", "title": "Robustness and Equivariance of Neural Networks", "track": "main", "status": "Reject", "tldr": "Robustness to rotations comes at the cost of robustness of pixel-wise adversarial perturbations.", "abstract": "Neural networks models are known to be vulnerable to geometric transformations\nas well as small pixel-wise perturbations of input. Convolutional Neural Networks\n(CNNs) are translation-equivariant but can be easily fooled using rotations and\nsmall pixel-wise perturbations. Moreover, CNNs require sufficient translations in\ntheir training data to achieve translation-invariance. Recent work by Cohen &\nWelling (2016), Worrall et al. (2016), Kondor & Trivedi (2018), Cohen & Welling\n(2017), Marcos et al. (2017), and Esteves et al. (2018) has gone beyond translations,\nand constructed rotation-equivariant or more general group-equivariant\nneural network models. In this paper, we do an extensive empirical study of various\nrotation-equivariant neural network models to understand how effectively they\nlearn rotations. This includes Group-equivariant Convolutional Networks (GCNNs)\nby Cohen & Welling (2016), Harmonic Networks (H-Nets) by Worrall et al.\n(2016), Polar Transformer Networks (PTN) by Esteves et al. (2018) and Rotation\nequivariant vector field networks by Marcos et al. (2017). 
We empirically compare\nthe ability of these networks to learn rotations efficiently in terms of their\nnumber of parameters, sample complexity, rotation augmentation used in training.\nWe compare them against each other as well as Standard CNNs. We observe\nthat as these rotation-equivariant neural networks learn rotations, they instead become\nmore vulnerable to small pixel-wise adversarial attacks, e.g., Fast Gradient\nSign Method (FGSM) and Projected Gradient Descent (PGD), in comparison with\nStandard CNNs. In other words, robustness to geometric transformations in these\nmodels comes at the cost of robustness to small pixel-wise perturbations.", "keywords": "robust;adversarial;equivariance;rotations;GCNNs;CNNs;steerable;neural networks", "primary_area": "", "supplementary_material": "", "author": "Amit Deshpande;Sandesh Kamath;K.V.Subrahmanyam", "authorids": "amitdesh@microsoft.com;ksandeshk@cmi.ac.in;kv@cmi.ac.in", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndeshpande2019robustness,\ntitle={Robustness and Equivariance of Neural Networks},\nauthor={Amit Deshpande and Sandesh Kamath and K.V.Subrahmanyam},\nyear={2019},\nurl={https://openreview.net/forum?id=Ske25sC9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Ske25sC9FQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;3", "wc_review": "128;73;782", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "114;114;447", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 327.6666666666667, 322.04589044972386 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 225.0, 156.97770542341354 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mlpoZqHxz4kJ:scholar.google.com/&scioq=Robustness+and+Equivariance+of+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Poincare Glove: Hyperbolic Word Embeddings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/787", "id": "Ske5r3AqK7", "author_site": "Alexandru Tifrea, Gary B\u00e9cigneul, Octavian Ganea", "tldr": "We embed words in the hyperbolic space and make the connection with the Gaussian word embeddings.", "abstract": "Words are not created equal. In fact, they form an aristocratic graph with a latent hierarchical structure that the next generation of unsupervised learned word embeddings should reveal. In this paper, justified by the notion of delta-hyperbolicity or tree-likeliness of a space, we propose to embed words in a Cartesian product of hyperbolic spaces which we theoretically connect to the Gaussian word embeddings and their Fisher geometry. This connection allows us to introduce a novel principled hypernymy score for word embeddings. Moreover, we adapt the well-known Glove algorithm to learn unsupervised word embeddings in this type of Riemannian manifolds. We further explain how to solve the analogy task using the Riemannian parallel transport that generalizes vector arithmetics to this new type of geometry. 
Empirically, based on extensive experiments, we prove that our embeddings, trained unsupervised, are the first to simultaneously outperform strong and popular baselines on the tasks of similarity, analogy and hypernymy detection. In particular, for word hypernymy, we obtain new state-of-the-art on fully unsupervised WBLESS classification accuracy.", "keywords": "word embeddings;hyperbolic spaces;poincare ball;hypernymy;analogy;similarity;gaussian embeddings", "primary_area": "", "supplementary_material": "", "author": "Alexandru Tifrea*;Gary Becigneul*;Octavian-Eugen Ganea*", "authorids": "tifreaa@student.ethz.ch;gary.becigneul@inf.ethz.ch;octavian.ganea@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ntifrea2018poincare,\ntitle={Poincare Glove: Hyperbolic Word Embeddings},\nauthor={Alexandru Tifrea and Gary Becigneul and Octavian-Eugen Ganea},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Ske5r3AqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;3", "wc_review": "377;288;502", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "564;475;459", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 389.0, 87.7762306474063 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 499.3333333333333, 46.190427098648435 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 357, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4245081116962532671&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Ske5r3AqK7", "pdf": "https://openreview.net/pdf?id=Ske5r3AqK7", "email": ";;", "author_num": 3 }, { "id": "Ske6wiAcKQ", "title": "Real-time Neural-based Input Method", "track": "main", "status": "Reject", "tldr": "", "abstract": "The input method is an essential service on every mobile and desktop devices that provides text suggestions. It converts sequential keyboard inputs to the characters in its target language, which is indispensable for Japanese and Chinese users. Due to critical resource constraints and limited network bandwidth of the target devices, applying neural models to input method is not well explored. In this work, we apply a LSTM-based language model to input method and evaluate its performance for both prediction and conversion tasks with Japanese BCCWJ corpus. We articulate the bottleneck to be the slow softmax computation during conversion. To solve the issue, we propose incremental softmax approximation approach, which computes softmax with a selected subset vocabulary and fix the stale probabilities when the vocabulary is updated in future steps. We refer to this method as incremental selective softmax. The results show a two order speedup for the softmax computation when converting Japanese input sequences with a large vocabulary, reaching real-time speed on commodity CPU. 
We also exploit the model compressing potential to achieve a 92% model size reduction without losing accuracy.", "keywords": "input method;language model;neural network;softmax", "primary_area": "", "supplementary_material": "", "author": "Jiali Yao;Raphael Shu;Xinjian Li;Katsutoshi Ohtsuki;Hideki Nakayama", "authorids": "jiayao@microsoft.com;shu@nlab.ci.i.u-tokyo.ac.jp;xinjianl@andrew.cmu.edu;katsutoshi.ohtsuki@microsoft.com;nakayama@ci.i.u-tokyo.ac.jp", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nyao2019realtime,\ntitle={Real-time Neural-based Input Method},\nauthor={Jiali Yao and Raphael Shu and Xinjian Li and Katsutoshi Ohtsuki and Hideki Nakayama},\nyear={2019},\nurl={https://openreview.net/forum?id=Ske6wiAcKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Ske6wiAcKQ", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;3", "wc_review": "403;610;167", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "259;212;156", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 393.3333333333333, 180.9831176902666 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 209.0, 42.10304818735416 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6442959450793766515&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "Ske7ToC5Km", "title": "Graph2Seq: Scalable Learning Dynamics for Graphs", "track": "main", "status": "Reject", "tldr": "Today's neural networks for graphs do not generalize to graphs that are much bigger than the training graphs. We propose graph2seq, a method that represents vertices as time-series sequences instead of fixed-sized vectors for improved generalization.", "abstract": "Neural networks have been shown to be an effective tool for learning algorithms over graph-structured data. However, graph representation techniques---that convert graphs to real-valued vectors for use with neural networks---are still in their infancy. Recent works have proposed several approaches (e.g., graph convolutional networks), but these methods have difficulty scaling and generalizing to graphs with different sizes and shapes. We present Graph2Seq, a new technique that represents vertices of graphs as infinite time-series. By not limiting the representation to a fixed dimension, Graph2Seq scales naturally to graphs of arbitrary sizes and shapes. Graph2Seq is also reversible, allowing full recovery of the graph structure from the sequence. By analyzing a formal computational model for graph representation, we show that an unbounded sequence is necessary for scalability. 
Our experimental results with Graph2Seq show strong generalization and new state-of-the-art performance on a variety of graph combinatorial optimization problems.\n", "keywords": "graph neural networks;scalable representations;combinatorial optimization;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Shaileshh Bojja Venkatakrishnan;Mohammad Alizadeh;Pramod Viswanath", "authorids": "bjjvnkt@csail.mit.edu;alizadeh@csail.mit.edu;pramodv@illinois.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nvenkatakrishnan2019graphseq,\ntitle={Graph2Seq: Scalable Learning Dynamics for Graphs},\nauthor={Shaileshh Bojja Venkatakrishnan and Mohammad Alizadeh and Pramod Viswanath},\nyear={2019},\nurl={https://openreview.net/forum?id=Ske7ToC5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Ske7ToC5Km", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;3", "wc_review": "394;460;116", "wc_reply_reviewers": "1059;116;0", "wc_reply_authors": "2606;667;154", "reply_reviewers": "3;1;0", "reply_authors": "7;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 323.3333333333333, 149.06225395973172 ], "wc_reply_reviewers_avg": [ 391.6666666666667, 474.246302627185 ], "wc_reply_authors_avg": [ 1142.3333333333333, 1055.9458108991935 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 3.3333333333333335, 2.6246692913372702 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3204986358554024947&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SkeJ6iR9Km", "title": "Variational Sparse Coding", "track": "main", "status": "Reject", "tldr": "We explore the intersection of VAEs and sparse coding.", "abstract": "Variational auto-encoders\n (VAEs) offer a tractable approach when performing approximate inference in otherwise intractable generative models. However, standard VAEs often produce latent codes that are disperse and lack interpretability, thus making the resulting representations unsuitable for auxiliary tasks (e.g. classi\ufb01cation) and human interpretation. We address these issues by merging ideas from variational auto-encoders and sparse coding, and propose to explicitly model sparsity in the latent space of a VAE with a Spike and Slab prior distribution. We derive the evidence lower bound using a discrete mixture recognition function thereby making approximate posterior inference as computational ef\ufb01cient as in the standard VAE case. With the new approach, we are able to infer truly sparse representations with generally intractable non-linear probabilistic models. We show that these sparse representations are advantageous over standard VAE representations on two benchmark classi\ufb01cation tasks (MNIST and Fashion-MNIST) by demonstrating improved classi\ufb01cation accuracy and signi\ufb01cantly increased robustness to the number of latent dimensions. 
Furthermore, we demonstrate qualitatively that the sparse elements capture subjectively understandable sources of variation.", "keywords": "Variational Auto-Encoders;Sparse Coding;Variational Inference", "primary_area": "", "supplementary_material": "", "author": "Francesco Tonolini;Bjorn Sand Jensen;Roderick Murray-Smith", "authorids": "2402432t@student.gla.ac.uk;bjorn.jensen@glasgow.ac.uk;roderick.murray-smith@glasgow.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntonolini2019variational,\ntitle={Variational Sparse Coding},\nauthor={Francesco Tonolini and Bjorn Sand Jensen and Roderick Murray-Smith},\nyear={2019},\nurl={https://openreview.net/forum?id=SkeJ6iR9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkeJ6iR9Km", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;5", "wc_review": "453;541;576", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "651;372;850", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 523.3333333333334, 51.74510175422941 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 624.3333333333334, 196.0515805144713 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=80284467254869635&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Episodic Curiosity through Reachability", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/790", "id": "SkeK3s0qKQ", "author_site": "Nikolay Savinov, Anton Raichuk, Damien Vincent, Rapha\u00ebl Marinier, Marc Pollefeys, Timothy Lillicrap, Sylvain Gelly", "tldr": "We propose a novel model of curiosity based on episodic memory and the ideas of reachability which allows us to overcome the known \"couch-potato\" issues of prior work.", "abstract": "Rewards are sparse in the real world and most of today's reinforcement learning algorithms struggle with such sparsity. One solution to this problem is to allow the agent to create rewards for itself - thus making rewards dense and more suitable for learning. In particular, inspired by curious behaviour in animals, observing something novel could be rewarded with a bonus. Such bonus is summed up with the real task reward - making it possible for RL algorithms to learn from the combined reward. We propose a new curiosity method which uses episodic memory to form the novelty bonus. To determine the bonus, the current observation is compared with the observations in memory. Crucially, the comparison is done based on how many environment steps it takes to reach the current observation from those in memory - which incorporates rich information about environment dynamics. This allows us to overcome the known \"couch-potato\" issues of prior work - when the agent finds a way to instantly gratify itself by exploiting actions which lead to hardly predictable consequences. We test our approach in visually rich 3D environments in ViZDoom, DMLab and MuJoCo. 
In navigational tasks from ViZDoom and DMLab, our agent outperforms the state-of-the-art curiosity method ICM. In MuJoCo, an ant equipped with our curiosity module learns locomotion out of the first-person-view curiosity only. The code is available at https://github.com/google-research/episodic-curiosity/.", "keywords": "deep learning;reinforcement learning;curiosity;exploration;episodic memory", "primary_area": "", "supplementary_material": "", "author": "Nikolay Savinov;Anton Raichuk;Damien Vincent;Raphael Marinier;Marc Pollefeys;Timothy Lillicrap;Sylvain Gelly", "authorids": "nikolay.savinov@inf.ethz.ch;raveman@google.com;damienv@google.com;raphaelm@google.com;marc.pollefeys@inf.ethz.ch;countzero@google.com;sylvaingelly@google.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nsavinov2018episodic,\ntitle={Episodic Curiosity through Reachability},\nauthor={Nikolay Savinov and Anton Raichuk and Damien Vincent and Raphael Marinier and Marc Pollefeys and Timothy Lillicrap and Sylvain Gelly},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkeK3s0qKQ},\n}", "github": "[![github](/images/github_icon.svg) google-research/episodic-curiosity](https://github.com/google-research/episodic-curiosity)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8;8", "confidence": "4;4;3;4", "wc_review": "252;566;421;712", "wc_reply_reviewers": "142;0;53;130", "wc_reply_authors": "417;895;821;1952", "reply_reviewers": "1;0;1;1", "reply_authors": "2;2;2;5", "rating_avg": [ 7.25, 0.82915619758885 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "wc_review_avg": [ 487.75, 170.62000908451506 ], "wc_reply_reviewers_avg": [ 81.25, 58.02316347804556 ], "wc_reply_authors_avg": [ 1021.25, 567.3298753811578 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.75, 1.299038105676658 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.5222329678670935, "gs_citation": 359, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3202653392377789217&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SkeK3s0qKQ", "pdf": "https://openreview.net/pdf?id=SkeK3s0qKQ", "email": ";;;;;;", "author_num": 7 }, { "id": "SkeL6sCqK7", "title": "REPRESENTATION COMPRESSION AND GENERALIZATION IN DEEP NEURAL NETWORKS", "track": "main", "status": "Reject", "tldr": "Introduce an information theoretic viewpoint on the behavior of deep networks optimization processes and their generalization abilities", "abstract": "Understanding the groundbreaking performance of Deep Neural Networks is one\nof the greatest challenges to the scientific community today. In this work, we\nintroduce an information theoretic viewpoint on the behavior of deep networks\noptimization processes and their generalization abilities. By studying the Information\nPlane, the plane of the mutual information between the input variable and\nthe desired label, for each hidden layer. Specifically, we show that the training of\nthe network is characterized by a rapid increase in the mutual information (MI)\nbetween the layers and the target label, followed by a longer decrease in the MI\nbetween the layers and the input variable. 
Further, we explicitly show that these\ntwo fundamental information-theoretic quantities correspond to the generalization\nerror of the network, as a result of introducing a new generalization bound that is\nexponential in the representation compression. The analysis focuses on typical\npatterns of large-scale problems. For this purpose, we introduce a novel analytic\nbound on the mutual information between consecutive layers in the network.\nAn important consequence of our analysis is a super-linear boost in training time\nwith the number of non-degenerate hidden layers, demonstrating the computational\nbenefit of the hidden layers.", "keywords": "Deep neural network;information theory;training dynamics", "primary_area": "", "supplementary_material": "", "author": "Ravid Shwartz-Ziv;Amichai Painsky;Naftali Tishby", "authorids": "ravid.ziv@mail.huji.ac.il;amichai.painsky@mail.huji.ac.il;tishby@cs.huji.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nshwartz-ziv2019representation,\ntitle={{REPRESENTATION} {COMPRESSION} {AND} {GENERALIZATION} {IN} {DEEP} {NEURAL} {NETWORKS}},\nauthor={Ravid Shwartz-Ziv and Amichai Painsky and Naftali Tishby},\nyear={2019},\nurl={https://openreview.net/forum?id=SkeL6sCqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkeL6sCqK7", "pdf_size": 0, "rating": "3;4;6", "confidence": "3;3;3", "wc_review": "414;141;219", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 258.0, 114.81289126226201 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9534858175510625886&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SkeQniAqK7", "title": "Combining Learned Representations for Combinatorial Optimization", "track": "main", "status": "Reject", "tldr": "We use combinations of RBMs to solve number factorization and combinatorial optimization problems.", "abstract": "We propose a new approach to combine Restricted Boltzmann Machines (RBMs) that can be used to solve combinatorial optimization problems. This allows synthesis of larger models from smaller RBMs that have been pretrained, thus effectively bypassing the problem of learning in large RBMs, and creating a system able to model a large, complex multi-modal space. We validate this approach by using learned representations to create ``invertible boolean logic'', where we can use Markov chain Monte Carlo (MCMC) approaches to find the solution to large scale boolean satisfiability problems and show viability towards other combinatorial optimization problems. Using this method, we are able to solve 64 bit addition based problems, as well as factorize 16 bit numbers. We find that these combined representations can provide a more accurate result for the same sample size as compared to a fully trained model. 
", "keywords": "Generative Models;Restricted Boltzmann Machines;Transfer Learning;Compositional Learning", "primary_area": "", "supplementary_material": "", "author": "Saavan Patel;Sayeef Salahuddin", "authorids": "saavan@berkeley.edu;sayeef@berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\npatel2019combining,\ntitle={Combining Learned Representations for Combinatorial Optimization},\nauthor={Saavan Patel and Sayeef Salahuddin},\nyear={2019},\nurl={https://openreview.net/forum?id=SkeQniAqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=SkeQniAqK7", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;3;3", "wc_review": "269;701;327", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "351;799;395", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 432.3333333333333, 191.44595988308438 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 515.0, 201.6201048176165 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6c7PFtPz7boJ:scholar.google.com/&scioq=Combining+Learned+Representations+for+Combinatorial+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 5 }, { "id": "SkeRTsAcYm", "title": "Phase-Aware Speech Enhancement with Deep Complex U-Net", "track": "main", "status": "Poster", "tldr": "This paper proposes a novel complex masking method for speech enhancement along with a loss function for efficient phase estimation.", "abstract": "Most deep learning-based models for speech enhancement have mainly focused on estimating the magnitude of spectrogram while reusing the phase from noisy speech for reconstruction. This is due to the difficulty of estimating the phase of clean speech. To improve speech enhancement performance, we tackle the phase estimation problem in three ways. First, we propose Deep Complex U-Net, an advanced U-Net structured model incorporating well-defined complex-valued building blocks to deal with complex-valued spectrograms. Second, we propose a polar coordinate-wise complex-valued masking method to reflect the distribution of complex ideal ratio masks. Third, we define a novel loss function, weighted source-to-distortion ratio (wSDR) loss, which is designed to directly correlate with a quantitative evaluation measure. Our model was evaluated on a mixture of the Voice Bank corpus and DEMAND database, which has been widely used by many deep learning models for speech enhancement. Ablation experiments were conducted on the mixed dataset showing that all three proposed approaches are empirically valid. 
Experimental results show that the proposed method achieves state-of-the-art performance in all metrics, outperforming previous approaches by a large margin.", "keywords": "speech enhancement;deep learning;complex neural networks;phase estimation", "primary_area": "", "supplementary_material": "", "author": "Hyeong-Seok Choi;Jang-Hyun Kim;Jaesung Huh;Adrian Kim;Jung-Woo Ha;Kyogu Lee", "authorids": "kekepa15@snu.ac.kr;blue378@snu.ac.kr;jaesung.huh@navercorp.com;adrian.kim@navercorp.com;jungwoo.ha@navercorp.com;kglee@snu.ac.kr", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nchoi2018phaseaware,\ntitle={Phase-Aware Speech Enhancement with Deep Complex U-Net},\nauthor={Hyeong-Seok Choi and Janghyun Kim and Jaesung Huh and Adrian Kim and Jung-Woo Ha and Kyogu Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkeRTsAcYm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=SkeRTsAcYm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkeRTsAcYm", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "52;143;386", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "181;328;660", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 193.66666666666666, 140.98305650758968 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 389.6666666666667, 200.35357634830368 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 476, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11525798829336957800&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SkeUG30cFQ", "title": "The Expressive Power of Deep Neural Networks with Circulant Matrices", "track": "main", "status": "Reject", "tldr": "We provide a theoretical study of the properties of Deep circulant-diagonal ReLU Networks and demonstrate that they are bounded width universal approximators.", "abstract": "Recent results from linear algebra stating that any matrix can be decomposed into products of diagonal and circulant matrices has lead to the design of compact deep neural network architectures that perform well in practice. In this paper, we bridge the gap between these good empirical results \nand the theoretical approximation capabilities of Deep diagonal-circulant ReLU networks. More precisely, we first demonstrate that a Deep diagonal-circulant ReLU networks of\nbounded width and small depth can approximate a deep ReLU network in which the dense matrices are\nof low rank. Based on this result, we provide new bounds on the expressive power and universal approximativeness of this type of networks. 
We support our experimental results with thorough experiments on a large, real world video classification problem.", "keywords": "deep learning;circulant matrices;universal approximation", "primary_area": "", "supplementary_material": "", "author": "Alexandre Araujo;Benjamin Negrevergne;Yann Chevaleyre;Jamal Atif", "authorids": "alexandre.araujo@dauphine.eu;benjamin.negrevergne@dauphine.fr;yann.chevaleyre@lamsade.dauphine.fr;jamal.atif@lamsade.dauphine.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\naraujo2019the,\ntitle={The Expressive Power of Deep Neural Networks with Circulant Matrices},\nauthor={Alexandre Araujo and Benjamin Negrevergne and Yann Chevaleyre and Jamal Atif},\nyear={2019},\nurl={https://openreview.net/forum?id=SkeUG30cFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkeUG30cFQ", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;5;4", "wc_review": "319;243;118", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "312;630;0", "reply_reviewers": "0;0;0", "reply_authors": "2;1;0", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 226.66666666666666, 82.8666934835033 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 314.0, 257.2003110418026 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.18898223650461357, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lZ8XtuTPtngJ:scholar.google.com/&scioq=The+Expressive+Power+of+Deep+Neural+Networks+with+Circulant+Matrices&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Generative predecessor models for sample-efficient imitation learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/814", "id": "SkeVsiAcYm", "author_site": "Yannick Schroecker, Mel Vecerik, Jon Scholz", "tldr": "", "abstract": "We propose Generative Predecessor Models for Imitation Learning (GPRIL), a novel imitation learning algorithm that matches the state-action distribution to the distribution observed in expert demonstrations, using generative models to reason probabilistically about alternative histories of demonstrated states. We show that this approach allows an agent to learn robust policies using only a small number of expert demonstrations and self-supervised interactions with the environment. 
We derive this approach from first principles and compare it empirically to a state-of-the-art imitation learning method, showing that it outperforms or matches its performance on two simulated robot manipulation tasks and demonstrate significantly higher sample efficiency by applying the algorithm on a real robot.", "keywords": "Imitation Learning;Generative Models;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Yannick Schroecker;Mel Vecerik;Jon Scholz", "authorids": "yannickschroecker@gatech.edu;vec@google.com;jscholz@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nschroecker2018generative,\ntitle={Generative predecessor models for sample-efficient imitation learning},\nauthor={Yannick Schroecker and Mel Vecerik and Jon Scholz},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkeVsiAcYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;3;4", "wc_review": "235;127;380", "wc_reply_reviewers": "804;0;107", "wc_reply_authors": "1531;485;285", "reply_reviewers": "4;0;1", "reply_authors": "5;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 247.33333333333334, 103.65434010316316 ], "wc_reply_reviewers_avg": [ 303.6666666666667, 356.47564978394934 ], "wc_reply_authors_avg": [ 767.0, 546.3649573926449 ], "reply_reviewers_avg": [ 1.6666666666666667, 1.699673171197595 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8821752117050196964&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SkeVsiAcYm", "pdf": "https://openreview.net/pdf?id=SkeVsiAcYm", "email": ";;", "author_num": 3 }, { "id": "SkeXehR9t7", "title": "Graph2Seq: Graph to Sequence Learning with Attention-Based Neural Networks", "track": "main", "status": "Reject", "tldr": "Graph to Sequence Learning with Attention-Based Neural Networks", "abstract": "The celebrated Sequence to Sequence learning (Seq2Seq) technique and its numerous variants achieve excellent performance on many tasks. However, many machine learning tasks have inputs naturally represented as graphs; existing Seq2Seq models face a significant challenge in achieving accurate conversion from graph form to the appropriate sequence. To address this challenge, we introduce a general end-to-end graph-to-sequence neural encoder-decoder architecture that maps an input graph to a sequence of vectors and uses an attention-based LSTM method to decode the target sequence from these vectors. Our method first generates the node and graph embeddings using an improved graph-based neural network with a novel aggregation strategy to incorporate edge direction information in the node embeddings. We further introduce an attention mechanism that aligns node embeddings and the decoding sequence to better cope with large graphs. 
Experimental results on bAbI, Shortest Path, and Natural Language Generation tasks demonstrate that our model achieves state-of-the-art performance and significantly outperforms existing graph neural networks, Seq2Seq, and Tree2Seq models; using the proposed bi-directional node embedding aggregation strategy, the model can converge rapidly to the optimal performance.", "keywords": "Graph Encoder;Graph Decoder;Graph2Seq;Graph Attention", "primary_area": "", "supplementary_material": "", "author": "Kun Xu;Lingfei Wu;Zhiguo Wang;Yansong Feng;Michael Witbrock;Vadim Sheinin", "authorids": "xukun@pku.edu.cn;lwu@email.wm.edu;zhigwang@us.ibm.com;fengyansong@pku.edu.cn;witbrock@us.ibm.com;vadims@us.ibm.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nxu2019graphseq,\ntitle={Graph2Seq: Graph to Sequence Learning with Attention-Based Neural Networks},\nauthor={Kun Xu and Lingfei Wu and Zhiguo Wang and Yansong Feng and Michael Witbrock and Vadim Sheinin},\nyear={2019},\nurl={https://openreview.net/forum?id=SkeXehR9t7},\n}", "github": "[![github](/images/github_icon.svg) IBM/Graph2Seq](https://github.com/IBM/Graph2Seq) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=SkeXehR9t7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkeXehR9t7", "pdf_size": 0, "rating": "4;6;6", "confidence": "5;4;4", "wc_review": "349;423;576", "wc_reply_reviewers": "1829;190;0", "wc_reply_authors": "4947;1194;1208", "reply_reviewers": "4;1;0", "reply_authors": "8;2;2", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 449.3333333333333, 94.52454119903936 ], "wc_reply_reviewers_avg": [ 673.0, 821.0874902631672 ], "wc_reply_authors_avg": [ 2449.6666666666665, 1765.8905842536099 ], "reply_reviewers_avg": [ 1.6666666666666667, 1.699673171197595 ], "reply_authors_avg": [ 4.0, 2.8284271247461903 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 236, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2530659066775293355&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SkeZEhR5FQ", "title": "Learning Graph Decomposition", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a novel end-to-end trainable framework for the graph decomposition problem. The minimum cost multicut problem is first converted to an unconstrained binary cubic formulation where cycle consistency constraints are incorporated into the objective function. The new optimization problem can be viewed as a Conditional Random Field (CRF) in which the random variables are associated with the binary edge labels of the initial graph and the hard constraints are introduced in the CRF as high-order potentials. The parameters of a standard Neural Network and the fully differentiable CRF can be optimized in an end-to-end manner. We demonstrate the proposed learning algorithm in the context of clustering of hand written digits, particularly in a setting where no direct supervision for the graph decomposition task is available, and multiple person pose estimation from images in the wild. 
The experiments validate the effectiveness of our approach both for the feature learning and for the final clustering task.", "keywords": "multicut graph decomposition;optimization by learning;pose estimation;clustering", "primary_area": "", "supplementary_material": "", "author": "Jie Song;Bjoern Andres;Michael Black;Otmar Hilliges;Siyu Tang", "authorids": "jsong@inf.ethz.ch;bjoern.andres@de.bosch.com;black@tuebingen.mpg.de;otmar.hilliges@inf.ethz.ch;stang@tuebingen.mpg.de", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkeZEhR5FQ", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "wc_review": "911;411;68", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 463.3333333333333, 346.1370955111412 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Adaptive Estimators Show Information Compression in Deep Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/977", "id": "SkeZisA5t7", "author_site": "Ivan Chelombiev, Conor Houghton, Cian O'Donnell", "tldr": "We developed robust mutual information estimates for DNNs and used them to observe compression in networks with non-saturating activation functions", "abstract": "To improve how neural networks function it is crucial to understand their learning process. The information bottleneck theory of deep learning proposes that neural networks achieve good generalization by compressing their representations to disregard information that is not relevant to the task. However, empirical evidence for this theory is conflicting, as compression was only observed when networks used saturating activation functions. In contrast, networks with non-saturating activation functions achieved comparable levels of task performance but did not show compression. In this paper we developed more robust mutual information estimation techniques, that adapt to hidden activity of neural networks and produce more sensitive measurements of activations from all functions, especially unbounded functions. Using these adaptive estimation techniques, we explored compression in networks with a range of different activation functions. With two improved methods of estimation, firstly, we show that saturation of the activation function is not required for compression, and the amount of compression varies between different activation functions. We also find that there is a large amount of variation in compression between different network initializations. Secondary, we see that L2 regularization leads to significantly increased compression, while preventing overfitting. 
Finally, we show that only compression of the last layer is positively correlated with generalization.", "keywords": "deep neural networks;mutual information;information bottleneck;noise;L2 regularization", "primary_area": "", "supplementary_material": "", "author": "Ivan Chelombiev;Conor Houghton;Cian O'Donnell", "authorids": "ic14436@bristol.ac.uk;conor.houghton@bristol.ac.uk;cian.odonnell@bristol.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchelombiev2018adaptive,\ntitle={Adaptive Estimators Show Information Compression in Deep Neural Networks},\nauthor={Ivan Chelombiev and Conor Houghton and Cian O'Donnell},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkeZisA5t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;4", "wc_review": "208;625;238", "wc_reply_reviewers": "0;0;53", "wc_reply_authors": "490;1207;586", "reply_reviewers": "0;0;1", "reply_authors": "1;2;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 357.0, 189.899973670351 ], "wc_reply_reviewers_avg": [ 17.666666666666668, 24.984439601924677 ], "wc_reply_authors_avg": [ 761.0, 317.79553174958266 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 52, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2654288184895561029&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SkeZisA5t7", "pdf": "https://openreview.net/pdf?id=SkeZisA5t7", "email": ";;", "author_num": 3 }, { "id": "SkeiPsAqK7", "title": "NA", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "NA", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Dai Quoc Nguyen;Tu Dinh Nguyen;Dinh Phung", "authorids": "dai.nguyen@monash.edu;tu.dinh.nguyen@monash.edu;dinh.phung@monash.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkeiPsAqK7", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "wc_review": "249;233;414", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "194;153;18", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 298.6666666666667, 81.81415253827149 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 121.66666666666667, 75.19012937583999 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Multilingual Neural Machine Translation With Soft Decoupled Encoding", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2019/poster/1052", "id": "Skeke3C5Fm", "author_site": "Xinyi Wang, Hieu Pham, Philip Arthur, Graham Neubig", "tldr": "", "abstract": "Multilingual training of neural machine translation (NMT) systems has led to impressive accuracy improvements on low-resource languages. However, there are still significant challenges in efficiently learning word representations in the face of paucity of data. In this paper, we propose Soft Decoupled Encoding (SDE), a multilingual lexicon encoding framework specifically designed to share lexical-level information intelligently without requiring heuristic preprocessing such as pre-segmenting the data. SDE represents a word by its spelling through a character encoding, and its semantic meaning through a latent embedding space shared by all languages. Experiments on a standard dataset of four low-resource languages show consistent improvements over strong multilingual NMT baselines, with gains of up to 2 BLEU on one of the tested languages, achieving the new state-of-the-art on all four language pairs.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xinyi Wang;Hieu Pham;Philip Arthur;Graham Neubig", "authorids": "xinyiw1@cs.cmu.edu;hyhieu@cmu.edu;philip.arthur@monash.edu;gneubig@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwang2018multilingual,\ntitle={Multilingual Neural Machine Translation With Soft Decoupled Encoding},\nauthor={Xinyi Wang and Hieu Pham and Philip Arthur and Graham Neubig},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Skeke3C5Fm},\n}", "github": "[![github](/images/github_icon.svg) cindyxinyiwang/SDE](https://github.com/cindyxinyiwang/SDE)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;4", "wc_review": "531;311;280", "wc_reply_reviewers": "132;0;0", "wc_reply_authors": "1228;664;498", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 374.0, 111.73480508179475 ], "wc_reply_reviewers_avg": [ 44.0, 62.22539674441618 ], "wc_reply_authors_avg": [ 796.6666666666666, 312.4370158750222 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 74, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1841872742547049658&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=Skeke3C5Fm", "pdf": "https://openreview.net/pdf?id=Skeke3C5Fm", "email": ";;;", "author_num": 4 }, { "id": "SkelJnRqt7", "title": "Neural separation of observed and unobserved distributions", "track": "main", "status": "Reject", "tldr": "An iterative neural method for extracting signals that are only observed mixed with other signals", "abstract": "Separating mixed distributions is a long standing challenge for machine learning and signal processing. Applications include: single-channel multi-speaker separation (cocktail party problem), singing voice separation and separating reflections from images. 
Most current methods either rely on making strong assumptions on the source distributions (e.g. sparsity, low rank, repetitiveness) or rely on having training samples of each source in the mixture. In this work, we tackle the scenario of extracting an unobserved distribution additively mixed with a signal from an observed (arbitrary) distribution. We introduce a new method: Neural Egg Separation - an iterative method that learns to separate the known distribution from progressively finer estimates of the unknown distribution. In some settings, Neural Egg Separation is sensitive to initialization; we therefore introduce GLO Masking, which ensures a good initialization. Extensive experiments show that our method outperforms current methods that use the same level of supervision and often achieves similar performance to full supervision. ", "keywords": "source separation;non-adversarial training;source unmixing;iterative neural training;generative modeling", "primary_area": "", "supplementary_material": "", "author": "Tavi Halperin;Ariel Ephrat;Yedid Hoshen", "authorids": "tavihalperin@gmail.com;ariel.ephrat@gmail.com;yedidh@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhalperin2019neural,\ntitle={Neural separation of observed and unobserved distributions},\nauthor={Tavi Halperin and Ariel Ephrat and Yedid Hoshen},\nyear={2019},\nurl={https://openreview.net/forum?id=SkelJnRqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SkelJnRqt7", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;2", "wc_review": "399;158;233", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "303;704;145", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 263.3333333333333, 100.6986704094062 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 384.0, 235.28847542254735 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3583406841206585968&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SkenUj0qYm", "title": "Semi-supervised Learning with Multi-Domain Sentiment Word Embeddings", "track": "main", "status": "Reject", "tldr": "", "abstract": "Word embeddings are known to boost performance of many NLP tasks such as text classification; meanwhile, they can be enhanced by labels at the document level to capture nuanced meaning such as sentiment and topic. Can one combine these two research directions to benefit from both? In this paper, we propose to jointly train a text classifier with a label-enhanced and domain-aware word embedding model, using an unlabeled corpus and only a few labeled data from non-target domains. The embeddings are trained on the unlabeled corpus and enhanced by pseudo labels coming from the classifier, and at the same time are used by the classifier as input and training signals. 
We formalize this symbiotic cycle in a variational Bayes framework, and show that our method improves both the embeddings and the text classifier, outperforming state-of-the-art domain adaptation and semi-supervised learning techniques. We conduct detailed ablative tests to reveal gains from important components of our approach. The source code and experiment data will be publicly released.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ran Tian;Yash Agrawal;Kento Watanabe;Hiroya Takamura", "authorids": "robin.tianran@gmail.com;yashagrawal@iitkgp.ac.in;kento.watanabe@aist.go.jp;takamura.hiroya@aist.go.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ntian2019semisupervised,\ntitle={Semi-supervised Learning with Multi-Domain Sentiment Word Embeddings},\nauthor={Ran Tian and Yash Agrawal and Kento Watanabe and Hiroya Takamura},\nyear={2019},\nurl={https://openreview.net/forum?id=SkenUj0qYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkenUj0qYm", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;3", "wc_review": "389;453;213", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "521;791;544", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 351.6666666666667, 101.47358714901573 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 618.6666666666666, 122.219292894189 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4219156629215109554&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "Skf-oo0qt7", "title": "On Generalization Bounds of a Family of Recurrent Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recurrent Neural Networks (RNNs) have been widely applied to sequential data analysis. Due to their complicated modeling structures, however, the theory behind is still largely missing. To connect theory and practice, we study the generalization properties of vanilla RNNs as well as their variants, including Minimal Gated Unit (MGU) and Long Short Term Memory (LSTM) RNNs. Specifically, our theory is established under the PAC-Learning framework. The generalization bound is presented in terms of the spectral norms of the weight matrices and the total number of parameters. We also establish refined generalization bounds with additional norm assumptions, and draw a comparison among these bounds. 
We remark: (1) Our generalization bound for vanilla RNNs is significantly tighter than the best of existing results; (2) We are not aware of any other generalization bounds for MGU and LSTM in the existing literature; (3) We demonstrate the advantages of these variants in generalization.", "keywords": "Recurrent Neural Networks;MGU;LSTM;Generalization Bound;PAC-Learning", "primary_area": "", "supplementary_material": "", "author": "Minshuo Chen;Xingguo Li;Tuo Zhao", "authorids": "mchen393@gatech.edu;lixx1661@umn.edu;tourzhao@gatech.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchen2019on,\ntitle={On Generalization Bounds of a Family of Recurrent Neural Networks},\nauthor={Minshuo Chen and Xingguo Li and Tuo Zhao},\nyear={2019},\nurl={https://openreview.net/forum?id=Skf-oo0qt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Skf-oo0qt7", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;4", "wc_review": "316;362;222", "wc_reply_reviewers": "171;103;0", "wc_reply_authors": "1250;710;419", "reply_reviewers": "1;1;0", "reply_authors": "2;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 300.0, 58.26376804384236 ], "wc_reply_reviewers_avg": [ 91.33333333333333, 70.29619872763787 ], "wc_reply_authors_avg": [ 793.0, 344.2934794619265 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7438521539080286164&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "Skf5qiC5KQ", "title": "A Unified View of Deep Metric Learning via Gradient Analysis", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Loss functions play a pivotal role in deep metric learning (DML). A large variety of loss functions have been proposed in DML recently. However, it remains difficult to answer this question: what are the intrinsic differences among these loss functions? This paper answers this question by proposing a unified perspective to rethink deep metric loss functions. We show theoretically that, in view of gradient equivalence, most DML methods are essentially weight assignment strategies for training pairs. Based on this unified view, we revisit several typical DML methods and disclose their hidden drawbacks. Moreover, we point out the key components of an effective DML approach, which drives us to propose our weight assignment framework. We evaluate our method on image retrieval tasks, and show that it outperforms the state-of-the-art DML approaches by a significant margin on the CUB-200-2011, Cars-196, Stanford Online Products and In-Shop Clothes Retrieval datasets. ", "keywords": "metric learning;gradient equivalence;image retrieval", "primary_area": "", "supplementary_material": "", "author": "Xun Wang;Xintong Han;Weilin Huang;Dengke Dong;Matthew R. 
Scott", "authorids": "xunwang@malong.com;;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Skf5qiC5KQ", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "wc_review": "156;316;566", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 346.0, 168.72067646458353 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5501398931203048184&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Approximating CNNs with Bag-of-local-Features models works surprisingly well on ImageNet", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/667", "id": "SkfMWhAqYQ", "author_site": "Wieland Brendel, Matthias Bethge", "tldr": "Aggregating class evidence from many small image patches suffices to solve ImageNet, yields more interpretable models and can explain aspects of the decision-making of popular DNNs.", "abstract": "Deep Neural Networks (DNNs) excel on many complex perceptual tasks but it has proven notoriously difficult to understand how they reach their decisions. We here introduce a high-performance DNN architecture on ImageNet whose decisions are considerably easier to explain. Our model, a simple variant of the ResNet-50 architecture called BagNet, classifies an image based on the occurrences of small local image features without taking into account their spatial ordering. This strategy is closely related to the bag-of-feature (BoF) models popular before the onset of deep learning and reaches a surprisingly high accuracy on ImageNet (87.6% top-5 for 32 x 32 px features and Alexnet performance for 16 x16 px features). The constraint on local features makes it straight-forward to analyse how exactly each part of the image influences the classification. Furthermore, the BagNets behave similar to state-of-the art deep neural networks such as VGG-16, ResNet-152 or DenseNet-169 in terms of feature sensitivity, error distribution and interactions between image parts. 
This suggests that the improvements of DNNs over previous bag-of-feature classifiers in the last few years is mostly achieved by better fine-tuning rather than by qualitatively different decision strategies.", "keywords": "interpretability;representation learning;bag of features;deep learning;object recognition", "primary_area": "", "supplementary_material": "", "author": "Wieland Brendel;Matthias Bethge", "authorids": "wieland.brendel@bethgelab.org;matthias.bethge@uni-tuebingen.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nbrendel2018approximating,\ntitle={Approximating {CNN}s with Bag-of-local-Features models works surprisingly well on ImageNet},\nauthor={Wieland Brendel and Matthias Bethge},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkfMWhAqYQ},\n}", "github": "[![github](/images/github_icon.svg) wielandbrendel/bag-of-local-features-models](https://github.com/wielandbrendel/bag-of-local-features-models) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=SkfMWhAqYQ)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "160;499;162", "wc_reply_reviewers": "330;19;0", "wc_reply_authors": "1414;886;69", "reply_reviewers": "2;2;0", "reply_authors": "7;2;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 273.6666666666667, 159.3368200455319 ], "wc_reply_reviewers_avg": [ 116.33333333333333, 151.28413297133605 ], "wc_reply_authors_avg": [ 789.6666666666666, 553.3030112173819 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], "reply_authors_avg": [ 3.3333333333333335, 2.6246692913372702 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 741, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13421262728275736184&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SkfMWhAqYQ", "pdf": "https://openreview.net/pdf?id=SkfMWhAqYQ", "email": ";", "author_num": 2 }, { "id": "SkfQAiA9YX", "title": "In search of theoretically grounded pruning", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Deep learning relies on resource-heavy linear algebra operations which can be prohibitively expensive when deploying to constrained embedded and mobile devices, or even when training large-scale networks. One way to reduce a neural network's resource requirements is to sparsify its weight matrices - a process often referred to as pruning. It is typically achieved by removing least important weights as measured by some salience criterion, with pruning by magnitude being the most popular option. This, however, often makes close to random judgments. In this paper we aim to closely investigate the concept of model weight importance, with a particular focus on the magnitude criterion and its most suitable substitute. To this end we identify a suitable Statistical framework and derive deep model parameter asymptotic theory to use with it. Thus, we derive a statistically-grounded pruning criterion which we compare with the magnitude pruning both qualitatively and quantitatively. 
We find this criterion to better capture parameter salience, by accounting for its estimation uncertainty. This results in improved performance and easier post-pruned re-training.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Filip Svoboda;Edgar Liberis;Nicholas D. Lane", "authorids": "filip.svoboda@stx.ox.ac.uk;edgar.liberis@chch.ox.ac.uk;nicholas.lane@cs.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkfQAiA9YX", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;3", "wc_review": "354;645;288", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "51;232;58", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 429.0, 155.09352017411945 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 113.66666666666667, 83.72308854524871 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OtllnZcs_d8J:scholar.google.com/&scioq=In+search+of+theoretically+grounded+pruning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkfTIj0cKX", "title": "Purchase as Reward : Session-based Recommendation by Imagination Reconstruction", "track": "main", "status": "Reject", "tldr": "We propose the IRN architecture to augment sparse and delayed purchase reward for session-based recommendation.", "abstract": "One of the key challenges of session-based recommender systems is to enhance users\u2019 purchase intentions. In this paper, we formulate the sequential interactions between user sessions and a recommender agent as a Markov Decision Process (MDP). In practice, the purchase reward is delayed and sparse, and may be buried by clicks, making it an impoverished signal for policy learning. Inspired by the prediction error minimization (PEM) and embodied cognition, we propose a simple architecture to augment reward, namely Imagination Reconstruction Network (IRN). Speci\ufb01cally, IRN enables the agent to explore its environment and learn predictive representations via three key components. The imagination core generates predicted trajectories, i.e., imagined items that users may purchase. The trajectory manager controls the granularity of imagined trajectories using the planning strategies, which balances the long-term rewards and short-term rewards. To optimize the action policy, the imagination-augmented executor minimizes the intrinsic imagination error of simulated trajectories by self-supervised reconstruction, while maximizing the extrinsic reward using model-free algorithms. Empirically, IRN promotes quicker adaptation to user interest, and shows improved robustness to the cold-start scenario and ultimately higher purchase performance compared to several baselines. 
Somewhat surprisingly, IRN using only the purchase reward achieves excellent next-click prediction performance, demonstrating that the agent can \"guess what you like\" via internal planning.", "keywords": "recommender systems;reinforcement learning;predictive learning;self-supervised RL;model-based planning", "primary_area": "", "supplementary_material": "", "author": "Qibing Li;Xiaolin Zheng", "authorids": "qblee@zju.edu.cn;xlzheng@zju.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nli2019purchase,\ntitle={Purchase as Reward : Session-based Recommendation by Imagination Reconstruction},\nauthor={Qibing Li and Xiaolin Zheng},\nyear={2019},\nurl={https://openreview.net/forum?id=SkfTIj0cKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkfTIj0cKX", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;3;2", "wc_review": "293;453;289", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "736;662;422", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 345.0, 76.38498979948002 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 606.6666666666666, 134.0281894063915 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EJptCQ-bLOsJ:scholar.google.com/&scioq=Purchase+as+Reward+:+Session-based+Recommendation+by+Imagination+Reconstruction&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SkfhIo0qtQ", "title": "Volumetric Convolution: Automatic Representation Learning in Unit Ball", "track": "main", "status": "Reject", "tldr": "A novel convolution operator for automatic representation learning inside unit ball", "abstract": "Convolution is an efficient technique to obtain abstract feature representations using hierarchical layers in deep networks. Although performing convolution in Euclidean geometries is fairly straightforward, its extension to other topological spaces---such as a sphere S^2 or a unit ball B^3---entails unique challenges. In this work, we propose a novel `\"volumetric convolution\" operation that can effectively convolve arbitrary functions in B^3. We develop a theoretical framework for \"volumetric convolution\" based on Zernike polynomials and efficiently implement it as a differentiable and an easily pluggable layer for deep networks. Furthermore, our formulation leads to derivation of a novel formula to measure the symmetry of a function in B^3 around an arbitrary axis, that is useful in 3D shape analysis tasks. 
We demonstrate the efficacy of proposed volumetric convolution operation on a possible use-case i.e., 3D object recognition task.", "keywords": "convolution;unit sphere;3D object recognition", "primary_area": "", "supplementary_material": "", "author": "Sameera Ramasinghe;Salman Khan;Nick Barnes", "authorids": "sameera.ramasinghe@anu.edu.au;salman.khan@anu.edu.au;nick.barnes@data61.csiro.au", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nramasinghe2019volumetric,\ntitle={Volumetric Convolution: Automatic Representation Learning in Unit Ball},\nauthor={Sameera Ramasinghe and Salman Khan and Nick Barnes},\nyear={2019},\nurl={https://openreview.net/forum?id=SkfhIo0qtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=SkfhIo0qtQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;5;2", "wc_review": "277;712;398", "wc_reply_reviewers": "0;240;0", "wc_reply_authors": "374;1328;1170", "reply_reviewers": "0;1;0", "reply_authors": "1;2;3", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 462.3333333333333, 183.3218178201626 ], "wc_reply_reviewers_avg": [ 80.0, 113.13708498984761 ], "wc_reply_authors_avg": [ 957.3333333333334, 417.4919826242841 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15772950880586874960&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Reward Constrained Policy Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/756", "id": "SkfrvsA9FX", "author_site": "Chen Tessler, Daniel J Mankowitz, Shie Mannor", "tldr": "For complex constraints in which it is not easy to estimate the gradient, we use the discounted penalty as a guiding signal. We prove that under certain assumptions it converges to a feasible solution.", "abstract": "Solving tasks in Reinforcement Learning is no easy feat. As the goal of the agent is to maximize the accumulated reward, it often learns to exploit loopholes and misspecifications in the reward signal resulting in unwanted behavior. While constraints may solve this issue, there is no closed form solution for general constraints. In this work we present a novel multi-timescale approach for constrained policy optimization, called `Reward Constrained Policy Optimization' (RCPO), which uses an alternative penalty signal to guide the policy towards a constraint satisfying one. We prove the convergence of our approach and provide empirical evidence of its ability to train constraint satisfying policies.", "keywords": "reinforcement learning;markov decision process;constrained markov decision process;deep learning", "primary_area": "", "supplementary_material": "", "author": "Chen Tessler;Daniel J. 
Mankowitz;Shie Mannor", "authorids": "chen.tessler@gmail.com;daniel.mankowitz@gmail.com;shiemannor@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ntessler2018reward,\ntitle={Reward Constrained Policy Optimization},\nauthor={Chen Tessler and Daniel J. Mankowitz and Shie Mannor},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkfrvsA9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "2;4;2", "wc_review": "422;245;170", "wc_reply_reviewers": "13;0;0", "wc_reply_authors": "693;409;70", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 279.0, 105.65036677645753 ], "wc_reply_reviewers_avg": [ 4.333333333333333, 6.128258770283412 ], "wc_reply_authors_avg": [ 390.6666666666667, 254.66884815819589 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 667, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8528215054992084387&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SkfrvsA9FX", "pdf": "https://openreview.net/pdf?id=SkfrvsA9FX", "email": ";;", "author_num": 3 }, { "id": "SkgCV205tQ", "title": "Accelerating first order optimization algorithms", "track": "main", "status": "Reject", "tldr": "", "abstract": "There exist several stochastic optimization algorithms. However, in most cases it is difficult to tell which optimizer will be the best choice for a particular problem, as each of them performs well. Thus, we present a simple and intuitive technique that, when applied to first order optimization algorithms, improves the speed of convergence and reaches a better minimum of the loss function than the original algorithms. The proposed solution modifies the update rule, based on the variation of the direction of the gradient during training. We conducted several tests with Adam and AMSGrad on two different datasets. 
The preliminary results show that the proposed technique improves the performance of existing optimization algorithms and works well in practice.", "keywords": "Optimization;Optimizer;Adam;Gradient Descent", "primary_area": "", "supplementary_material": "", "author": "Ange tato;Roger nkambou", "authorids": "nyamen_tato.ange_adrienne@courrier.uqam.ca;nkambou.roger@uqam.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntato2019accelerating,\ntitle={Accelerating first order optimization algorithms},\nauthor={Ange tato and Roger nkambou},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgCV205tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkgCV205tQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;5;3", "wc_review": "276;92;226", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 198.0, 77.68311699891211 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:S3wmFeAfx10J:scholar.google.com/&scioq=Accelerating+first+order+optimization+algorithms&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SkgD4jAcYX", "title": "NA", "track": "main", "status": "Withdraw", "tldr": "NA", "abstract": "NA", "keywords": "NA", "primary_area": "", "supplementary_material": "", "author": "NA", "authorids": "hongyang.gao@wsu.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkgD4jAcYX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "wc_review": "1178;428;1242", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 949.3333333333334, 369.5631054216437 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SkgE8sRcK7", "title": "Sample Efficient Deep Neuroevolution in Low Dimensional Latent Space", "track": "main", "status": "Reject", "tldr": "", "abstract": "Current deep neuroevolution models are usually trained in a large parameter search space for complex learning tasks, e.g. playing video games, which needs billions of samples and thousands of search steps to obtain significant performance. This raises a question of whether we can make use of sequential data generated during evolution, encode input samples, and evolve in low dimensional parameter space with latent state input in a fast and efficient manner. 
Here we give an affirmative answer: we train a VAE to encode input samples, then an RNN to model environment dynamics and handle temporal information, and last evolve our low dimensional policy network in latent space. We demonstrate that this approach is surprisingly efficient: our experiments on Atari games show that within 10M frames and 30 evolution steps of training, our algorithm could achieve competitive result compared with ES, A3C, and DQN which need billions of frames.", "keywords": "Neuroevolution;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Bin Zhou;Jiashi Feng", "authorids": "bin.zhou@u.nus.edu;elefjia@u.nus.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhou2019sample,\ntitle={Sample Efficient Deep Neuroevolution in Low Dimensional Latent Space},\nauthor={Bin Zhou and Jiashi Feng},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgE8sRcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkgE8sRcK7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;5", "wc_review": "86;489;709", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 428.0, 257.97028252623727 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2622925594452571137&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "On the Relation Between the Sharpest Directions of DNN Loss and the SGD Step Length", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/875", "id": "SkgEaj05t7", "author_site": "Stanislaw Jastrzebski, Zachary Kenton, Nicolas Ballas, Asja Fischer, Yoshua Bengio, Amos Storkey", "tldr": "SGD is steered early on in training towards a region in which its step is too large compared to curvature, which impacts the rest of training. ", "abstract": "The training of deep neural networks with Stochastic Gradient Descent (SGD) with a large learning rate or a small batch-size typically ends in flat regions of the weight space, as indicated by small eigenvalues of the Hessian of the training loss. This was found to correlate with a good final generalization performance. In this paper we extend previous work by investigating the curvature of the loss surface along the whole training trajectory, rather than only at the endpoint. We find that initially SGD visits increasingly sharp regions, reaching a maximum sharpness determined by both the learning rate and the batch-size of SGD. At this peak value SGD starts to fail to minimize the loss along directions in the loss surface corresponding to the largest curvature (sharpest directions). To further investigate the effect of these dynamics in the training process, we study a variant of SGD using a reduced learning rate along the sharpest directions which we show can improve training speed while finding both sharper and better generalizing solution, compared to vanilla SGD. 
Overall, our results show that the SGD dynamics in the subspace of the sharpest directions influence the regions that SGD steers to (where larger learning rate or smaller batch size result in wider regions visited), the overall training speed, and the generalization ability of the final model.", "keywords": "optimization;generalization;theory of deep learning;SGD;hessian", "primary_area": "", "supplementary_material": "", "author": "Stanis\u0142aw Jastrz\u0119bski;Zachary Kenton;Nicolas Ballas;Asja Fischer;Yoshua Bengio;Amos Storkey", "authorids": "staszek.jastrzebski@gmail.com;zakenton@gmail.com;ballasn@fb.com;asja.fischer@gmail.com;yoshua.umontreal@gmail.com;a.storkey@ed.ac.uk", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\njastrz\u0119bski2018on,\ntitle={On the Relation Between the Sharpest Directions of {DNN} Loss and the {SGD} Step Length},\nauthor={Stanis\u0142aw Jastrz\u0119bski and Zachary Kenton and Nicolas Ballas and Asja Fischer and Yoshua Bengio and Amost Storkey},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgEaj05t7},\n}", "github": "[![github](/images/github_icon.svg) kudkudak/dnn_sharpest_directions](https://github.com/kudkudak/dnn_sharpest_directions)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "wc_review": "445;485;214", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "740;638;64", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 381.3333333333333, 119.44408268679068 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 480.6666666666667, 297.5559704585938 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 137, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3857357074541596262&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SkgEaj05t7", "pdf": "https://openreview.net/pdf?id=SkgEaj05t7", "email": ";;;;;", "author_num": 6 }, { "id": "SkgKzh0cY7", "title": "Unsupervised Video-to-Video Translation", "track": "main", "status": "Reject", "tldr": "Proposed new task, datasets and baselines; 3D Conv CycleGAN preserves object properties across frames; batch structure in frame-level methods matters.", "abstract": "Unsupervised image-to-image translation is a recently proposed task of translating an image to a different style or domain given only unpaired image examples at training time. In this paper, we formulate a new task of unsupervised video-to-video translation, which poses its own unique challenges. Translating video implies learning not only the appearance of objects and scenes but also realistic motion and transitions between consecutive frames. We investigate the performance of per-frame video-to-video translation using existing image-to-image translation networks, and propose a spatio-temporal 3D translator as an alternative solution to this problem. 
We evaluate our 3D method on multiple synthetic datasets, such as moving colorized digits, as well as the realistic segmentation-to-video GTA dataset and a new CT-to-MRI volumetric image translation dataset. Our results show that frame-wise translation produces realistic results at the single-frame level but underperforms significantly on the scale of the whole video compared to our three-dimensional translation approach, which is better able to learn the complex structure of video and motion and continuity of object appearance. ", "keywords": "Generative Adversarial Networks;Computer Vision;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Dina Bashkirova;Ben Usman;Kate Saenko", "authorids": "dbash@bu.edu;usmn@bu.edu;saenko@bu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbashkirova2019unsupervised,\ntitle={Unsupervised Video-to-Video Translation},\nauthor={Dina Bashkirova and Ben Usman and Kate Saenko},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgKzh0cY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkgKzh0cY7", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;5", "wc_review": "92;264;590", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 315.3333333333333, 206.52253038241506 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1235844976572057489&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Modeling the Long Term Future in Model-Based Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1075", "id": "SkgQBn0cF7", "author_site": "Nan Rosemary Ke, Amanpreet Singh, Ahmed Touati, Anirudh Goyal, Yoshua Bengio, Devi Parikh, Dhruv Batra", "tldr": "incorporating, in the model, latent variables that encode future content improves the long-term prediction accuracy, which is critical for better planning in model-based RL.", "abstract": "In model-based reinforcement learning, the agent interleaves between model learning and planning. These two components are inextricably intertwined. If the model is not able to provide sensible long-term prediction, the executed planner would exploit model flaws, which can yield catastrophic failures. This paper focuses on building a model that reasons about the long-term future and demonstrates how to use this for efficient planning and exploration. To this end, we build a latent-variable autoregressive model by leveraging recent ideas in variational inference. We argue that forcing latent variables to carry future information through an auxiliary task substantially improves long-term predictions. Moreover, by planning in the latent space, the planner's solution is ensured to be within regions where the model is valid. An exploration strategy can be devised by searching for unlikely trajectories under the model. 
Our method achieves higher reward faster than baselines on a variety of tasks and environments in both the imitation learning and model-based reinforcement learning settings. ", "keywords": "model-based reinforcement learning;variational inference", "primary_area": "", "supplementary_material": "", "author": "Nan Rosemary Ke;Amanpreet Singh;Ahmed Touati;Anirudh Goyal;Yoshua Bengio;Devi Parikh;Dhruv Batra", "authorids": "rosemary.nan.ke@gmail.com;asg@fb.com;ahmed.touati@umontreal.ca;anirudhgoyal9119@gmail.com;yoshua.umontreal@gmail.com;parikh@gatech.edu;dbatra@gatech.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nke2018modeling,\ntitle={Modeling the Long Term Future in Model-Based Reinforcement Learning},\nauthor={Nan Rosemary Ke and Amanpreet Singh and Ahmed Touati and Anirudh Goyal and Yoshua Bengio and Devi Parikh and Dhruv Batra},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgQBn0cF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "wc_review": "334;363;883", "wc_reply_reviewers": "0;0;97", "wc_reply_authors": "869;1293;1359", "reply_reviewers": "0;0;2", "reply_authors": "4;5;6", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 526.6666666666666, 252.24370931479913 ], "wc_reply_reviewers_avg": [ 32.333333333333336, 45.72623851673007 ], "wc_reply_authors_avg": [ 1173.6666666666667, 217.11031502185446 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 5.0, 0.816496580927726 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14570382599498236785&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SkgQBn0cF7", "pdf": "https://openreview.net/pdf?id=SkgQBn0cF7", "email": ";;;;;;", "author_num": 7 }, { "id": "SkgToo0qFm", "title": "Transferrable End-to-End Learning for Protein Interface Prediction", "track": "main", "status": "Reject", "tldr": "We demonstrate the first successful application of transfer learning to atomic-level data in order to build a state-of-the-art end-to-end learning model for the protein interface prediction problem.", "abstract": "While there has been an explosion in the number of experimentally determined, atomically detailed structures of proteins, how to represent these structures in a machine learning context remains an open research question. In this work we demonstrate that representations learned from raw atomic coordinates can outperform hand-engineered structural features while displaying a much higher degree of transferrability. To do so, we focus on a central problem in biology: predicting how proteins interact with one another\u2014that is, which surfaces of one protein bind to which surfaces of another protein. We present Siamese Atomic Surfacelet Network (SASNet), the first end-to-end learning method for protein interface prediction. Despite using only spatial coordinates and identities of atoms as inputs, SASNet outperforms state-of-the-art methods that rely on hand-engineered, high-level features. 
These results are particularly striking because we train the method entirely on a significantly biased data set that does not account for the fact that proteins deform when binding to one another. Demonstrating the first successful application of transfer learning to atomic-level data, our network maintains high performance, without retraining, when tested on real cases in which proteins do deform.", "keywords": "transfer learning;protein interface prediction;deep learning;structural biology", "primary_area": "", "supplementary_material": "", "author": "Raphael J. L. Townshend;Rishi Bedi;Ron O. Dror", "authorids": "raphael@cs.stanford.edu;rbedi@cs.stanford.edu;rondror@cs.stanford.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntownshend2019transferrable,\ntitle={Transferrable End-to-End Learning for Protein Interface Prediction},\nauthor={Raphael J. L. Townshend and Rishi Bedi and Ron O. Dror},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgToo0qFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkgToo0qFm", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;3", "wc_review": "93;269;391", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "142;582;905", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 251.0, 122.3219794912863 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 543.0, 312.7117948953424 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kyU-PiMMMekJ:scholar.google.com/&scioq=Transferrable+End-to-End+Learning+for+Protein+Interface+Prediction&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkgVRiC9Km", "title": "Fortified Networks: Improving the Robustness of Deep Networks by Modeling the Manifold of Hidden Representations", "track": "main", "status": "Reject", "tldr": "Better adversarial training by learning to map back to the data manifold with autoencoders in the hidden states. ", "abstract": "Deep networks have achieved impressive results across a variety of important tasks. However, a known weakness is a failure to perform well when evaluated on data which differ from the training distribution, even if these differences are very small, as is the case with adversarial examples. We propose \\emph{Fortified Networks}, a simple transformation of existing networks, which \u201cfortifies\u201d the hidden layers in a deep network by identifying when the hidden states are off of the data manifold, and maps these hidden states back to parts of the data manifold where the network performs well. 
Our principal contribution is to show that fortifying these hidden states improves the robustness of deep networks and our experiments (i) demonstrate improved robustness to standard adversarial attacks in both black-box and white-box threat models; (ii) suggest that our improvements are not primarily due to the problem of deceptively good results due to degraded quality in the gradient signal (the gradient masking problem) and (iii) show the advantage of doing this fortification in the hidden layers instead of the input space. We demonstrate improvements in adversarial robustness on three datasets (MNIST, Fashion MNIST, CIFAR10), across several attack parameters, both white-box and black-box settings, and the most widely studied attacks (FGSM, PGD, Carlini-Wagner). We show that these improvements are achieved across a wide variety of hyperparameters. ", "keywords": "adversarial examples;adversarial training;autoencoders;hidden state", "primary_area": "", "supplementary_material": "", "author": "Alex Lamb;Jonathan Binas;Anirudh Goyal;Dmitriy Serdyuk;Sandeep Subramanian;Ioannis Mitliagkas;Yoshua Bengio", "authorids": "lambalex@iro.umontreal.ca;jonathan.binas@umontreal.ca;anirudhgoyal9119@gmail.com;serdyuk.dmitriy@gmail.com;sandeep.subramanian@gmail.com;ioannis@iro.umontreal.ca;yoshua.bengio@mila.quebec", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nlamb2019fortified,\ntitle={Fortified Networks: Improving the Robustness of Deep Networks by Modeling the Manifold of Hidden Representations},\nauthor={Alex Lamb and Jonathan Binas and Anirudh Goyal and Dmitriy Serdyuk and Sandeep Subramanian and Ioannis Mitliagkas and Yoshua Bengio},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgVRiC9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkgVRiC9Km", "pdf_size": 0, "rating": "4;5;6;9", "confidence": "5;3;3;4", "wc_review": "322;119;225;67", "wc_reply_reviewers": "350;0;0;0", "wc_reply_authors": "1511;479;117;93", "reply_reviewers": "1;0;0;0", "reply_authors": "4;2;1;1", "rating_avg": [ 6.0, 1.8708286933869707 ], "confidence_avg": [ 3.75, 0.82915619758885 ], "wc_review_avg": [ 183.25, 98.28116554050425 ], "wc_reply_reviewers_avg": [ 87.5, 151.55444566227678 ], "wc_reply_authors_avg": [ 550.0, 575.5215026391281 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 1.224744871391589 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.16116459280507606, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11853519139612639286&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SkgYciAqF7", "title": "N/A", "track": "main", "status": "Withdraw", "tldr": "N/A", "abstract": "N/A", "keywords": "N/A", "primary_area": "", "supplementary_material": "", "author": "N/A", "authorids": "youngjoon.yoo@navercorp.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkgYciAqF7", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;5", "wc_review": "810;420;901", 
"wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 710.3333333333334, 208.63098736498586 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "SkgZNnR5tX", "title": "Uncovering Surprising Behaviors in Reinforcement Learning via Worst-case Analysis", "track": "main", "status": "Reject", "tldr": "We find environment settings in which SOTA agents trained on navigation tasks display extreme failures suggesting failures in generalization.", "abstract": "Reinforcement learning agents are typically trained and evaluated according to their performance averaged over some distribution of environment settings. But does the distribution over environment settings contain important biases, and do these lead to agents that fail in certain cases despite high average-case performance? In this work, we consider worst-case analysis of agents over environment settings in order to detect whether there are directions in which agents may have failed to generalize. Specifically, we consider a 3D first-person task where agents must navigate procedurally generated mazes, and where reinforcement learning agents have recently achieved human-level average-case performance. By optimizing over the structure of mazes, we find that agents can suffer from catastrophic failures, failing to find the goal even on surprisingly simple mazes, despite their impressive average-case performance. Additionally, we find that these failures transfer between different agents and even significantly different architectures. We believe our findings highlight an important role for worst-case analysis in identifying whether there are directions in which agents have failed to generalize. Our hope is that the ability to automatically identify failures of generalization will facilitate development of more general and robust agents. 
To this end, we report initial results on enriching training with settings causing failure.", "keywords": "Reinforcement learning;Adversarial examples;Navigation;Evaluation;Analysis", "primary_area": "", "supplementary_material": "", "author": "Avraham Ruderman;Richard Everett;Bristy Sikder;Hubert Soyer;Jonathan Uesato;Ananya Kumar;Charlie Beattie;Pushmeet Kohli", "authorids": "aruderman@google.com;reverett@google.com;bristy@google.com;soyer@google.com;juesato@google.com;skywalker94@gmail.com;cbeattie@google.com;pushmeet@google.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nruderman2019uncovering,\ntitle={Uncovering Surprising Behaviors in Reinforcement Learning via Worst-case Analysis},\nauthor={Avraham Ruderman and Richard Everett and Bristy Sikder and Hubert Soyer and Jonathan Uesato and Ananya Kumar and Charlie Beattie and Pushmeet Kohli},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgZNnR5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkgZNnR5tX", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;2;4", "wc_review": "593;270;730", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "372;496;588", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 531.0, 192.84363268375407 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 485.3333333333333, 88.50360946813913 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16072862688373379713&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Skgge3R9FQ", "title": "Controlling Over-generalization and its Effect on Adversarial Examples Detection and Generation", "track": "main", "status": "Reject", "tldr": "Properly training CNNs with dustbin class increase their robustness to adversarial attacks and their capacity to deal with out-distribution samples.", "abstract": "Convolutional Neural Networks (CNNs) significantly improve the state-of-the-art for many applications, especially in computer vision. However, CNNs still suffer from a tendency to confidently classify out-distribution samples from unknown classes into pre-defined known classes. Further, they are also vulnerable to adversarial examples. We are relating these two issues through the tendency of CNNs to over-generalize for areas of the input space not covered well by the training set. We show that a CNN augmented with an extra output class can act as a simple yet effective end-to-end model for controlling over-generalization. As an appropriate training set for the extra class, we introduce two resources that are computationally efficient to obtain: a representative natural out-distribution set and interpolated in-distribution samples. To help select a representative natural out-distribution set among available ones, we propose a simple measurement to assess an out-distribution set's fitness. 
We also demonstrate that training such an augmented CNN with representative out-distribution natural datasets and some interpolated samples allows it to better handle a wide range of unseen out-distribution samples and black-box adversarial examples without training it on any adversaries. Finally, we show that generation of white-box adversarial attacks using our proposed augmented CNN can become harder, as the attack algorithms have to get around the rejection regions when generating actual adversaries.", "keywords": "Convolutional Neural Networks;Adversarial Instances;Out-distribution Samples;Rejection Option;Over-generalization", "primary_area": "", "supplementary_material": "", "author": "Mahdieh Abbasi;Arezoo Rajabi;Azadeh Sadat Mozafari;Rakesh B. Bobba;Christian Gagn\u00e9", "authorids": "mahdieh.abbasi.1@ulaval.ca;rajabia@oregonstate.edu;azadeh-sadat.mozafari.1@ulaval.ca;rakesh.bobba@oregonstate.edu;christian.gagne@gel.ulaval.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nabbasi2019controlling,\ntitle={Controlling Over-generalization and its Effect on Adversarial Examples Detection and Generation},\nauthor={Mahdieh Abbasi and Arezoo Rajabi and Azadeh Sadat Mozafari and Rakesh B. Bobba and Christian Gagn\u00e9},\nyear={2019},\nurl={https://openreview.net/forum?id=Skgge3R9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=Skgge3R9FQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;5;4", "wc_review": "135;208;160", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "285;173;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 167.66666666666666, 30.291179500896884 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 152.66666666666666, 117.2357548797389 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11099322193220015284&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SkghBoR5FX", "title": "Adversarial Attacks for Optical Flow-Based Action Recognition Classifiers", "track": "main", "status": "Reject", "tldr": "The paper describes adversarial attacks for action recognition classifiers that explicitly attack along the time dimension.", "abstract": "The success of deep learning research has catapulted deep models into production\nsystems that our society is becoming increasingly dependent on, especially in the\nimage and video domains. However, recent work has shown that these largely\nuninterpretable models exhibit glaring security vulnerabilities in the presence of\nan adversary. In this work, we develop a powerful untargeted adversarial attack\nfor action recognition systems in both white-box and black-box settings. Action\nrecognition models differ from image-classification models in that their inputs\ncontain a temporal dimension, which we explicitly target in the attack. 
Drawing\ninspiration from image classifier attacks, we create new attacks which achieve\nstate-of-the-art success rates on a two-stream classifier trained on the UCF-101\ndataset. We find that our attacks can significantly degrade a model\u2019s performance\nwith sparsely and imperceptibly perturbed examples. We also demonstrate the\ntransferability of our attacks to black-box action recognition systems.", "keywords": "adversarial attacks;action recognition;video classification", "primary_area": "", "supplementary_material": "", "author": "Nathan Inkawhich;Matthew Inkawhich;Hai Li;Yiran Chen", "authorids": "nathan.inkawhich@duke.edu;matthew.inkawhich@duke.edu;hai.li@duke.edu;yiran.chen@duke.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ninkawhich2019adversarial,\ntitle={Adversarial Attacks for Optical Flow-Based Action Recognition Classifiers},\nauthor={Nathan Inkawhich and Matthew Inkawhich and Hai Li and Yiran Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=SkghBoR5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkghBoR5FX", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;4", "wc_review": "622;283;236", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 380.3333333333333, 171.9580052092823 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13965176018584055553&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SkghN205KQ", "title": "Search-Guided, Lightly-supervised Training of Structured Prediction Energy Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": " In structured output prediction tasks, labeling ground-truth training output is often expensive. However, for many tasks, even when the true output is unknown, we can evaluate predictions using a scalar reward function, which may be easily assembled from human knowledge or non-differentiable pipelines. But searching through the entire output space to find the best output with respect to this reward function is typically intractable. In this paper, we instead use efficient truncated randomized search in this reward function to train structured prediction energy networks (SPENs), which provide efficient test-time inference using gradient-based search on a smooth, learned representation of the score landscape, and have previously yielded state-of-the-art results in structured prediction. In particular, this truncated randomized search in the reward function yields previously unknown local improvements, providing effective supervision to SPENs, avoiding their traditional need for labeled training data. 
", "keywords": "structured prediction energy networks;indirect supervision;search-guided training;reward functions", "primary_area": "", "supplementary_material": "", "author": "Amirmohammad Rooshenas;Dongxu Zhang;Gopal Sharma;Andrew McCallum", "authorids": "pedram@cs.umass.edu;dongxuzhang@cs.umass.edu;gopalsharma@cs.umass.edu;mccallum@cs.umass.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nrooshenas2019searchguided,\ntitle={Search-Guided, Lightly-supervised Training of Structured Prediction Energy Networks},\nauthor={Amirmohammad Rooshenas and Dongxu Zhang and Gopal Sharma and Andrew McCallum},\nyear={2019},\nurl={https://openreview.net/forum?id=SkghN205KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkghN205KQ", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;4", "wc_review": "367;411;360", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "642;460;168", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 379.3333333333333, 22.573337271116017 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 423.3333333333333, 195.2388850158242 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17962918863681053025&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "SkgiX2Aqtm", "title": "PIE: Pseudo-Invertible Encoder", "track": "main", "status": "Reject", "tldr": "New Class of Autoencoders with pseudo invertible architecture", "abstract": "We consider the problem of information compression from high dimensional data. Where many studies consider the problem of compression by non-invertible trans- formations, we emphasize the importance of invertible compression. We introduce new class of likelihood-based auto encoders with pseudo bijective architecture, which we call Pseudo Invertible Encoders. We provide the theoretical explanation of their principles. 
We evaluate the Gaussian Pseudo Invertible Encoder on MNIST, where our model outperforms WAE and VAE in the sharpness of the generated images.", "keywords": "Invertible Mappings;Bijectives;Dimensionality reduction;Autoencoder", "primary_area": "", "supplementary_material": "", "author": "Jan Jetze Beitler;Ivan Sosnovik;Arnold Smeulders", "authorids": "j.j.beitler@uva.nl;i.sosnovik@uva.nl;a.w.m.smeulders@uva.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbeitler2019pie,\ntitle={{PIE}: Pseudo-Invertible Encoder},\nauthor={Jan Jetze Beitler and Ivan Sosnovik and Arnold Smeulders},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgiX2Aqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkgiX2Aqtm", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;4;5", "wc_review": "366;295;197", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "229;58;165", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 286.0, 69.28684338795257 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 150.66666666666666, 70.54234346987788 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2895251850317502928&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SkgkJn05YX", "title": "RANDOM MASK: Towards Robust Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "We propose a technique that modifies CNN structures to enhance robustness while keeping high test accuracy, and raise doubt on whether the current definition of adversarial examples is appropriate by generating adversarial examples able to fool humans.", "abstract": "Robustness of neural networks has recently been highlighted by the adversarial examples, i.e., inputs added with well-designed perturbations which are imperceptible to humans but can cause the network to give incorrect outputs. In this paper, we design a new CNN architecture that by itself has good robustness. We introduce a simple but powerful technique, Random Mask, to modify existing CNN structures. We show that CNN with Random Mask achieves state-of-the-art performance against black-box adversarial attacks without applying any adversarial training. We next investigate the adversarial examples which \u201cfool\u201d a CNN with Random Mask. Surprisingly, we find that these adversarial examples often \u201cfool\u201d humans as well. 
This raises fundamental questions on how to define adversarial examples and robustness properly.", "keywords": "adversarial examples;robust machine learning;cnn structure;metric;deep feature representations", "primary_area": "", "supplementary_material": "", "author": "Tiange Luo;Tianle Cai;Mengxiao Zhang;Siyu Chen;Liwei Wang", "authorids": "luotg@pku.edu.cn;caitianle1998@pku.edu.cn;zhan147@usc.edu;siyuchen@pku.edu.cn;wanglw@cis.pku.edu.cn", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nluo2019random,\ntitle={{RANDOM} {MASK}: Towards Robust Convolutional Neural Networks},\nauthor={Tiange Luo and Tianle Cai and Mengxiao Zhang and Siyu Chen and Liwei Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgkJn05YX},\n}", "github": "[![github](/images/github_icon.svg) tiangeluo/DefectiveCNN](https://github.com/tiangeluo/DefectiveCNN)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkgkJn05YX", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;3;3", "wc_review": "254;144;613", "wc_reply_reviewers": "0;45;0", "wc_reply_authors": "1076;722;745", "reply_reviewers": "0;1;0", "reply_authors": "2;2;3", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 337.0, 200.26149571664212 ], "wc_reply_reviewers_avg": [ 15.0, 21.213203435596427 ], "wc_reply_authors_avg": [ 847.6666666666666, 161.7288540187626 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6776469327244966672&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "SkguE30ct7", "title": "Neural Model-Based Reinforcement Learning for Recommendation", "track": "main", "status": "Reject", "tldr": "A new insight of designing a RL recommendation policy based on a generative adversarial user model.", "abstract": "There are great interests as well as many challenges in applying reinforcement learning (RL) to recommendation systems. In this setting, an online user is the environment; neither the reward function nor the environment dynamics are clearly defined, making the application of RL challenging. \nIn this paper, we propose a novel model-based reinforcement learning framework for recommendation systems, where we develop a generative adversarial network to imitate user behavior dynamics and learn her reward function. Using this user model as the simulation environment, we develop a novel DQN algorithm to obtain a combinatorial recommendation policy which can handle a large number of candidate items efficiently. 
In our experiments with real data, we show this generative adversarial user model can better explain user behavior than alternatives, and the RL policy based on this model can lead to a better long-term reward for the user and higher click rate for the system.", "keywords": "Generative adversarial user model;Recommendation system;combinatorial recommendation policy;model-based reinforcement learning;deep Q-networks", "primary_area": "", "supplementary_material": "", "author": "Xinshi Chen;Shuang Li;Hui Li;Shaohua Jiang;Le Song", "authorids": "xinshi.chen@gatech.edu;sli370@gatech.edu;ken.lh@alibaba-inc.com;shaohua.jsh@alipay.com;lsong@cc.gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nchen2019neural,\ntitle={Neural Model-Based Reinforcement Learning for Recommendation},\nauthor={Xinshi Chen and Shuang Li and Hui Li and Shaohua Jiang and Le Song},\nyear={2019},\nurl={https://openreview.net/forum?id=SkguE30ct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkguE30ct7", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;4;3", "wc_review": "483;631;187", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "758;1468;252", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 433.6666666666667, 184.58843108084776 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 826.0, 498.75311193682455 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16363420074791889178&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SkgzYiRqtX", "title": "Graph Neural Networks with Generated Parameters for Relation Extraction", "track": "main", "status": "Reject", "tldr": "A graph neural network model with parameters generated from natural languages, which can perform multi-hop reasoning. ", "abstract": "Recently, progress has been made towards improving relational reasoning in the machine learning field. Among existing models, graph neural networks (GNNs) are among the most effective approaches for multi-hop relational reasoning. In fact, multi-hop relational reasoning is indispensable in many natural language processing tasks such as relation extraction. In this paper, we propose to generate the parameters of graph neural networks (GP-GNNs) according to natural language sentences, which enables GNNs to process relational reasoning on unstructured text inputs. We verify GP-GNNs in relation extraction from text. Experimental results on a human-annotated dataset and two distantly supervised datasets show that our model achieves significant improvements compared to the baselines. 
We also perform a qualitative analysis to demonstrate that our model could discover more accurate relations by multi-hop relational reasoning.", "keywords": "Graph Neural Networks;Relational Reasoning", "primary_area": "", "supplementary_material": "", "author": "Hao Zhu;Yankai Lin;Zhiyuan Liu;Jie Fu;Tat-seng Chua and Maosong Sun", "authorids": ";;;;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhu2019graph,\ntitle={Graph Neural Networks with Generated Parameters for Relation Extraction},\nauthor={Hao Zhu and Yankai Lin and Zhiyuan Liu and Jie Fu and Tat-seng Chua and Maosong Sun},\nyear={2019},\nurl={https://openreview.net/forum?id=SkgzYiRqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkgzYiRqtX", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;3", "wc_review": "340;646;361", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "555;892;370", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 449.0, 139.56360557108002 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 605.6666666666666, 216.09617200578901 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 186, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18227538214871902721&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "title": "Understanding Straight-Through Estimator in Training Activation Quantized Neural Nets", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/671", "id": "Skh4jRcKQ", "author_site": "Penghang Yin, Jiancheng Lyu, shuai zhang, Stanley J Osher, YINGYONG QI, Jack Xin", "tldr": "We make theoretical justification for the concept of straight-through estimator.", "abstract": "Training activation quantized neural networks involves minimizing a piecewise constant training loss whose gradient vanishes almost everywhere, which is undesirable for the standard back-propagation or chain rule. An empirical way around this issue is to use a straight-through estimator (STE) (Bengio et al., 2013) in the backward pass only, so that the \"gradient\" through the modified chain rule becomes non-trivial. Since this unusual \"gradient\" is certainly not the gradient of loss function, the following question arises: why searching in its negative direction minimizes the training loss? In this paper, we provide the theoretical justification of the concept of STE by answering this question. We consider the problem of learning a two-linear-layer network with binarized ReLU activation and Gaussian input data. We shall refer to the unusual \"gradient\" given by the STE-modifed chain rule as coarse gradient. The choice of STE is not unique. We prove that if the STE is properly chosen, the expected coarse gradient correlates positively with the population gradient (not available for the training), and its negation is a descent direction for minimizing the population loss. We further show the associated coarse gradient descent algorithm converges to a critical point of the population loss minimization problem. 
Moreover, we show that a poor choice of STE leads to instability of the training algorithm near certain local minima, which is verified with CIFAR-10 experiments.", "keywords": "straight-through estimator;quantized activation;binary neuron", "primary_area": "", "supplementary_material": "", "author": "Penghang Yin;Jiancheng Lyu;Shuai Zhang;Stanley Osher;Yingyong Qi;Jack Xin", "authorids": "yph@ucla.edu;jianchel@uci.edu;shuazhan@qti.qualcomm.com;sjo@math.ucla.edu;yingyong@qti.qualcomm.com;jxin@math.uci.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nyin2018understanding,\ntitle={Understanding Straight-Through Estimator in Training Activation Quantized Neural Nets},\nauthor={Penghang Yin and Jiancheng Lyu and Shuai Zhang and Stanley J. Osher and Yingyong Qi and Jack Xin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Skh4jRcKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "wc_review": "284;882;406", "wc_reply_reviewers": "49;0;120", "wc_reply_authors": "564;1046;877", "reply_reviewers": "1;0;1", "reply_authors": "2;3;2", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 524.0, 257.9974160077319 ], "wc_reply_reviewers_avg": [ 56.333333333333336, 49.2634640366356 ], "wc_reply_authors_avg": [ 829.0, 199.68141292235154 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 382, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3760388634450301972&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=Skh4jRcKQ", "pdf": "https://openreview.net/pdf?id=Skh4jRcKQ", "email": ";;;;;", "author_num": 6 }, { "id": "Skl3M20qYQ", "title": "Non-Synergistic Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "Minimising the synergistic mutual information within the latents and the data for the task of disentanglement using the VAE framework.", "abstract": "Learning disentangling representations of the independent factors of variations that explain the data in an unsupervised setting is still a major challenge. In the following paper we address the task of disentanglement and introduce a new state-of-the-art approach called Non-synergistic variational Autoencoder (Non-Syn VAE). Our model draws inspiration from population coding, where the notion of synergy arises when we describe the encoded information by neurons in the form of responses from the stimuli. If those responses convey more information together than separate as independent sources of encoding information, they are acting synergetically. By penalizing the synergistic mutual information within the latents we encourage information independence and by doing that disentangle the latent factors. Notably, our approach could be added to the VAE framework easily, where the new ELBO function is still a lower bound on the log likelihood. 
In addition, we qualitatively compare our model with Factor VAE and show that this one implicitly minimises the synergy of the latents.", "keywords": "vae;unsupervised learning", "primary_area": "", "supplementary_material": "", "author": "Gonzalo Barrientos;Sten Sootla", "authorids": "gonzalo.ayquipa.16@ucl.ac.uk;sten.sootla.17@ucl.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbarrientos2019nonsynergistic,\ntitle={Non-Synergistic Variational Autoencoders},\nauthor={Gonzalo Barrientos and Sten Sootla},\nyear={2019},\nurl={https://openreview.net/forum?id=Skl3M20qYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Skl3M20qYQ", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;3", "wc_review": "902;184;206", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 430.6666666666667, 333.40399251092094 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=566732554158617594&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Skl6k209Ym", "title": "Alignment Based Mathching Networks for One-Shot Classification and Open-Set Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep learning for object classification relies heavily on convolutional models. While effective, CNNs are rarely interpretable after the fact. An attention mechanism can be used to highlight the area of the image that the model focuses on thus offering a narrow view into the mechanism of classification. We expand on this idea by forcing the method to explicitly align images to be classified to reference images representing the classes. The mechanism of alignment is learned and therefore does not require that the reference objects are anything like those being classified. Beyond explanation, our exemplar based cross-alignment method enables classification with only a single example per category (one-shot). Our model cuts the 5-way, 1-shot error rate in Omniglot from 2.1\\% to 1.4\\% and in MiniImageNet from 53.5\\% to 46.5\\% while simultaneously providing point-wise alignment information providing some understanding on what the network is capturing. 
This method of alignment also enables the recognition of an unsupported class (open-set) in the one-shot setting while maintaining an F1-score of above 0.5 for Omniglot even with 19 other distracting classes while baselines completely fail to separate the open-set class in the one-shot setting.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Paresh Malalur;Tommi Jaakkola", "authorids": "pareshmg@csail.mit.edu;tommi@csail.mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmalalur2019alignment,\ntitle={Alignment Based Mathching Networks for One-Shot Classification and Open-Set Recognition},\nauthor={Paresh Malalur and Tommi Jaakkola},\nyear={2019},\nurl={https://openreview.net/forum?id=Skl6k209Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=Skl6k209Ym", "pdf_size": 0, "rating": "4;6;7;7", "confidence": "4;2;4;3", "wc_review": "374;334;107;631", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 6.0, 1.224744871391589 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "wc_review_avg": [ 361.5, 185.95227882443388 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.24618298195866545, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7832426139395356502&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "DISTRIBUTIONAL CONCAVITY REGULARIZATION FOR GANS", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/842", "id": "SklEEnC5tQ", "author_site": "Shoichiro Yamaguchi, Masanori Koyama", "tldr": "", "abstract": "We propose Distributional Concavity (DC) regularization for Generative Adversarial Networks (GANs), a functional gradient-based method that promotes the entropy of the generator distribution and works against mode collapse. \nOur DC regularization is an easy-to-implement method that can be used in combination with the current state of the art methods like Spectral Normalization and Wasserstein GAN with gradient penalty to further improve the performance.\nWe will not only show that our DC regularization can achieve highly competitive results on ILSVRC2012 and CIFAR datasets in terms of Inception score and Fr\\'echet inception distance, but also provide a mathematical guarantee that our method can always increase the entropy of the generator distribution. 
We will also show an intimate theoretical connection between our method and the theory of optimal transport.", "keywords": "Generative Adversarial Networks;regularization;optimal transport;functional gradient;convex analysis", "primary_area": "", "supplementary_material": "", "author": "Shoichiro Yamaguchi;Masanori Koyama", "authorids": "guguchi@preferred.jp;masomatics@preferred.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nyamaguchi2018distributional,\ntitle={{DISTRIBUTIONAL} {CONCAVITY} {REGULARIZATION} {FOR} {GANS}},\nauthor={Shoichiro Yamaguchi and Masanori Koyama},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SklEEnC5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7;8", "confidence": "4;1;4;1", "wc_review": "268;113;371;65", "wc_reply_reviewers": "0;0;53;0", "wc_reply_authors": "1062;56;754;35", "reply_reviewers": "0;0;1;0", "reply_authors": "2;1;1;1", "rating_avg": [ 7.0, 0.7071067811865476 ], "confidence_avg": [ 2.5, 1.5 ], "wc_review_avg": [ 204.25, 122.05198687444626 ], "wc_reply_reviewers_avg": [ 13.25, 22.949673200287624 ], "wc_reply_authors_avg": [ 476.75, 444.84793750224355 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.7071067811865475, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18260150873860031631&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=SklEEnC5tQ", "pdf": "https://openreview.net/pdf?id=SklEEnC5tQ", "email": ";", "author_num": 2 }, { "id": "SklR_iCcYm", "title": "Faster Training by Selecting Samples Using Embeddings", "track": "main", "status": "Reject", "tldr": "Training is sped up by using a dataset that has been subsampled through embedding analysis.", "abstract": "Long training times have increasingly become a burden for researchers by slowing down the pace of innovation, with some models taking days or weeks to train. In this paper, a new, general technique is presented that aims to speed up the training process by using a thinned-down training dataset. By leveraging autoencoders and the unique properties of embedding spaces, we are able to filter training datasets to include only those samples that matter the most. Through evaluation on a standard CIFAR-10 image classification task, this technique is shown to be effective. With this technique, training times can be reduced with a minimal loss in accuracy. Conversely, given a fixed training time budget, the technique was shown to improve accuracy by over 50%. 
This technique is a practical tool for achieving better results with large datasets and limited computational budgets.", "keywords": "Machine Learning;Embeddings;Training Time;Optimization;Autoencoders", "primary_area": "", "supplementary_material": "", "author": "Santiago Gonzalez;Joshua Landgraf;Risto Miikkulainen", "authorids": "slgonzalez@utexas.edu;jland@cs.utexas.edu;risto@cs.utexas.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngonzalez2019faster,\ntitle={Faster Training by Selecting Samples Using Embeddings},\nauthor={Santiago Gonzalez and Joshua Landgraf and Risto Miikkulainen},\nyear={2019},\nurl={https://openreview.net/forum?id=SklR_iCcYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SklR_iCcYm", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;3;5", "wc_review": "647;1028;740", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 805.0, 162.191245139804 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9661755534862710171&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "SklVEnR5K7", "title": "Making Convolutional Networks Shift-Invariant Again", "track": "main", "status": "Reject", "tldr": "Modern networks are not shift-invariant, due to naive downsampling; we apply a signal processing tool -- anti-aliasing low-pass filtering before downsampling -- to improve shift-invariance", "abstract": "Modern convolutional networks are not shift-invariant, despite their convolutional nature: small shifts in the input can cause drastic changes in the internal feature maps and output. In this paper, we isolate the cause -- the downsampling operation in convolutional and pooling layers -- and apply the appropriate signal processing fix -- low-pass filtering before downsampling. This simple architectural modification boosts the shift-equivariance of the internal representations and consequently, shift-invariance of the output. Importantly, this is achieved while maintaining downstream classification performance. In addition, incorporating the inductive bias of shift-invariance largely removes the need for shift-based data augmentation. Lastly, we observe that the modification induces spatially-smoother learned convolutional kernels. 
Our results suggest that this classical signal processing technique has a place in modern deep networks.", "keywords": "convolutional networks;signal processing;shift;translation;invariance;equivariance", "primary_area": "", "supplementary_material": "", "author": "Richard Zhang", "authorids": "rich.zhang@eecs.berkeley.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nzhang2019making,\ntitle={Making Convolutional Networks Shift-Invariant Again},\nauthor={Richard Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=SklVEnR5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SklVEnR5K7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "wc_review": "599;253;333", "wc_reply_reviewers": "0;0;139", "wc_reply_authors": "1176;655;864", "reply_reviewers": "0;0;2", "reply_authors": "2;1;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 395.0, 147.9008677008579 ], "wc_reply_reviewers_avg": [ 46.333333333333336, 65.5252283899534 ], "wc_reply_authors_avg": [ 898.3333333333334, 214.07838647457046 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1050, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6405795848737680233&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12 }, { "id": "SklXvs0qt7", "title": "Curiosity-Driven Experience Prioritization via Density Estimation", "track": "main", "status": "Reject", "tldr": "Our paper proposes a curiosity-driven prioritization framework for RL agents, which improves both performance and sample-efficiency.", "abstract": "In Reinforcement Learning (RL), an agent explores the environment and collects trajectories into the memory buffer for later learning. However, the collected trajectories can easily be imbalanced with respect to the achieved goal states. The problem of learning from imbalanced data is a well-known problem in supervised learning, but has not yet been thoroughly researched in RL. To address this problem, we propose a novel Curiosity-Driven Prioritization (CDP) framework to encourage the agent to over-sample those trajectories that have rare achieved goal states. The CDP framework mimics the human learning process and focuses more on relatively uncommon events. We evaluate our methods using the robotic environment provided by OpenAI Gym. The environment contains six robot manipulation tasks. In our experiments, we combined CDP with Deep Deterministic Policy Gradient (DDPG) with or without Hindsight Experience Replay (HER). 
The experimental results show that CDP improves both performance and sample-efficiency of reinforcement learning agents, compared to state-of-the-art methods.", "keywords": "Curiosity-Driven;Experience Prioritization;Hindsight Experience;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Rui Zhao;Volker Tresp", "authorids": "zhaorui.in.germany@gmail.com;volker.tresp@siemens.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhao2019curiositydriven,\ntitle={Curiosity-Driven Experience Prioritization via Density Estimation},\nauthor={Rui Zhao and Volker Tresp},\nyear={2019},\nurl={https://openreview.net/forum?id=SklXvs0qt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SklXvs0qt7", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;3", "wc_review": "547;298;215", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "383;341;268", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 353.3333333333333, 141.0728732094004 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 330.6666666666667, 47.51374070261734 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14368014752452511684&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SklcFsAcKX", "title": "Deep Denoising: Rate-Optimal Recovery of Structured Signals with a Deep Prior", "track": "main", "status": "Reject", "tldr": "By analyzing an algorithm minimizing a non-convex loss, we show that all but a small fraction of noise can be removed from an image using a deep neural network based generative prior.", "abstract": "Deep neural networks provide state-of-the-art performance for image denoising, where the goal is to recover a near noise-free image from a noisy image.\nThe underlying principle is that neural networks trained on large datasets have empirically been shown to be able to generate natural images well from a low-dimensional latent representation of the image.\nGiven such a generator network, or prior, a noisy image can be denoised by finding the closest image in the range of the prior.\nHowever, there is little theory to justify this success, let alone to predict the denoising performance as a function of the network's parameters.\nIn this paper we consider the problem of denoising an image from additive Gaussian noise, assuming the image is well described by a deep neural network with ReLU activation functions, mapping a k-dimensional latent space to an n-dimensional image.\nWe state and analyze a simple gradient-descent-like iterative algorithm that minimizes a non-convex loss function, and provably removes a fraction of (1 - O(k/n)) of the noise energy.\nWe also demonstrate in numerical experiments that this denoising performance is, indeed, achieved by generative priors learned from data.", "keywords": "non-convex optimization;denoising;generative neural network", "primary_area": "", "supplementary_material": "", "author": "Reinhard Heckel;Wen Huang;Paul Hand;Vladislav Voroninski", "authorids": 
"rh43@rice.edu;wen.huang@xmu.edu.cn;p.hand@northeastern.edu;vlad@helm.ai", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nheckel2019deep,\ntitle={Deep Denoising: Rate-Optimal Recovery of Structured Signals with a Deep Prior},\nauthor={Reinhard Heckel and Wen Huang and Paul Hand and Vladislav Voroninski},\nyear={2019},\nurl={https://openreview.net/forum?id=SklcFsAcKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer5;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=SklcFsAcKX", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;4;3", "wc_review": "162;379;378", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "222;240;507", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 306.3333333333333, 102.05989526852466 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 323.0, 130.31500297356402 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12513892187982087418&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SklckhR5Ym", "title": "Improved Language Modeling by Decoding the Past", "track": "main", "status": "Reject", "tldr": "Decoding the last token in the context using the predicted next token distribution acts as a regularizer and improves language modeling.", "abstract": "Highly regularized LSTMs achieve impressive results on several benchmark datasets in language modeling. We propose a new regularization method based on decoding the last token in the context using the predicted distribution of the next token. This biases the model towards retaining more contextual information, in turn improving its ability to predict the next token. With negligible overhead in the number of parameters and training time, our Past Decode Regularization (PDR) method achieves a word level perplexity of 55.6 on the Penn Treebank and 63.5 on the WikiText-2 datasets using a single softmax. We also show gains by using PDR in combination with a mixture-of-softmaxes, achieving a word level perplexity of 53.8 and 60.5 on these datasets. In addition, our method achieves 1.169 bits-per-character on the Penn Treebank Character dataset for character level language modeling. 
These results constitute a new state-of-the-art in their respective settings.", "keywords": "language modeling;regularization;LSTM", "primary_area": "", "supplementary_material": "", "author": "Siddhartha Brahma", "authorids": "sidbrahma@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nbrahma2019improved,\ntitle={Improved Language Modeling by Decoding the Past},\nauthor={Siddhartha Brahma},\nyear={2019},\nurl={https://openreview.net/forum?id=SklckhR5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SklckhR5Ym", "pdf_size": 0, "rating": "3;6;7", "confidence": "5;3;5", "wc_review": "207;341;131", "wc_reply_reviewers": "147;0;22", "wc_reply_authors": "533;409;222", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.699673171197595 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 226.33333333333334, 86.81525723563162 ], "wc_reply_reviewers_avg": [ 56.333333333333336, 64.73707507208593 ], "wc_reply_authors_avg": [ 388.0, 127.8306170941323 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.2773500981126145, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5372352536712517550&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SklgHoRqt7", "title": "Metric-Optimized Example Weights", "track": "main", "status": "Reject", "tldr": "", "abstract": "Real-world machine learning applications often have complex test metrics, and may have training and test data that follow different distributions. We propose addressing these issues by using a weighted loss function with a standard convex loss, but with weights on the training examples that are learned to optimize the test metric of interest on the validation set. These metric-optimized example weights can be learned for any test metric, including black box losses and customized metrics for specific applications. 
We illustrate the performance of our proposal with public benchmark datasets and real-world applications with domain shift and custom loss functions that balance multiple objectives, impose fairness policies, and are non-convex and non-decomposable.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sen Zhao;Mahdi Milani Fard;Maya Gupta", "authorids": "senzhao@google.com;mmilanifard@google.com;mayagupta@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhao2019metricoptimized,\ntitle={Metric-Optimized Example Weights},\nauthor={Sen Zhao and Mahdi Milani Fard and Maya Gupta},\nyear={2019},\nurl={https://openreview.net/forum?id=SklgHoRqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SklgHoRqt7", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;4;3", "wc_review": "525;262;746", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 511.0, 197.84000269578107 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5876377887875144782&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "title": "LeMoNADe: Learned Motif and Neuronal Assembly Detection in calcium imaging videos", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/763", "id": "SkloDjAqYm", "author_site": "Elke Kirschbaum, Manuel Haussmann, Steffen Wolf, Hannah Sonntag, Justus Schneider, Shehabeldin Elzoheiry, Oliver Kann, Daniel Durstewitz, Fred A Hamprecht", "tldr": "We present LeMoNADe, an end-to-end learned motif detection method directly operating on calcium imaging videos.", "abstract": "Neuronal assemblies, loosely defined as subsets of neurons with reoccurring spatio-temporally coordinated activation patterns, or \"motifs\", are thought to be building blocks of neural representations and information processing. We here propose LeMoNADe, a new exploratory data analysis method that facilitates hunting for motifs in calcium imaging videos, the dominant microscopic functional imaging modality in neurophysiology. Our nonparametric method extracts motifs directly from videos, bypassing the difficult intermediate step of spike extraction. Our technique augments variational autoencoders with a discrete stochastic node, and we show in detail how a differentiable reparametrization and relaxation can be used. An evaluation on simulated data, with available ground truth, reveals excellent quantitative performance. 
In real video data acquired from brain slices, with no ground truth available, LeMoNADe uncovers nontrivial candidate motifs that can help generate hypotheses for more focused biological investigations.", "keywords": "VAE;unsupervised learning;neuronal assemblies;calcium imaging analysis", "primary_area": "", "supplementary_material": "", "author": "Elke Kirschbaum;Manuel Hau\u00dfmann;Steffen Wolf;Hannah Sonntag;Justus Schneider;Shehabeldin Elzoheiry;Oliver Kann;Daniel Durstewitz;Fred A Hamprecht", "authorids": "elke.kirschbaum@iwr.uni-heidelberg.de;manuel.haussmann@iwr.uni-heidelberg.de;steffen.wolf@iwr.uni-heidelberg.de;hannah.sonntag@mpimf-heidelberg.mpg.de;justus.schneider@physiologie.uni-heidelberg.de;shehab.elzoheiry@physiologie.uni-heidelberg.de;oliver.kann@physiologie.uni-heidelberg.de;daniel.durstewitz@zi-mannheim.de;fred.hamprecht@iwr.uni-heidelberg.de", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nkirschbaum2018lemonade,\ntitle={LeMo{NAD}e: Learned Motif and Neuronal Assembly Detection in calcium imaging videos},\nauthor={Elke Kirschbaum and Manuel Hau\u00dfmann and Steffen Wolf and Hannah Sonntag and Justus Schneider and Shehabeldin Elzoheiry and Oliver Kann and Daniel Durstewitz and Fred A Hamprecht},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkloDjAqYm},\n}", "github": "[![github](/images/github_icon.svg) EKirschbaum/LeMoNADe](https://github.com/EKirschbaum/LeMoNADe)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "pdf_size": 0, "rating": "5;8;8", "confidence": "4;5;4", "wc_review": "260;128;107", "wc_reply_reviewers": "0;470;219", "wc_reply_authors": "807;150;709", "reply_reviewers": "0;1;1", "reply_authors": "1;1;2", "rating_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 165.0, 67.72001181334805 ], "wc_reply_reviewers_avg": [ 229.66666666666666, 192.02488264689958 ], "wc_reply_authors_avg": [ 555.3333333333334, 289.39285102127565 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16794354699308703573&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SkloDjAqYm", "pdf": "https://openreview.net/pdf?id=SkloDjAqYm", "email": ";;;;;;;;", "author_num": 9 }, { "id": "Sklqvo0qt7", "title": "A Priori Estimates of the Generalization Error for Two-layer Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "New estimates for the generalization error are established for a nonlinear regression problem using a two-layer neural network model. These new estimates are a priori in nature in the sense that the bounds depend only on some norms of the underlying functions to be fitted, not the parameters in the model. In contrast, most existing results for neural networks are a posteriori in nature in the sense that the bounds depend on some norms of the model parameters. The error rates are comparable to that of the Monte Carlo method in terms of the size of the dataset. 
Moreover, these bounds are equally effective in the over-parametrized regime when the network size is much larger than the size of the dataset. ", "keywords": "Over-parameterization;A priori estimates;Path norm;Neural networks;Generalization error;Approximation error", "primary_area": "", "supplementary_material": "", "author": "Lei Wu;Chao Ma;Weinan E", "authorids": "leiwu@pku.edu.cn;chaom@princeton.edu;weinan@math.princeton.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwu2019a,\ntitle={A Priori Estimates of the Generalization Error for Two-layer Neural Networks},\nauthor={Lei Wu and Chao Ma and Weinan E},\nyear={2019},\nurl={https://openreview.net/forum?id=Sklqvo0qt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=Sklqvo0qt7", "pdf_size": 0, "rating": "4;4;4;5", "confidence": "4;3;3;3", "wc_review": "441;125;558;580", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 4.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "wc_review_avg": [ 426.0, 181.6356242591194 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7739564448991247698&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Sklr9i09KQ", "title": "Neural Networks for Modeling Source Code Edits", "track": "main", "status": "Reject", "tldr": "Neural networks for source code that model changes being made to the code-base rather than static snapshots of code.", "abstract": "Programming languages are emerging as a challenging and interesting domain for machine learning. A core task, which has received significant attention in recent years, is building generative models of source code. However, to our knowledge, previous generative models have always been framed in terms of generating static snapshots of code. In this work, we instead treat source code as a dynamic object and tackle the problem of modeling the edits that software developers make to source code files. This requires extracting intent from previous edits and leveraging it to generate subsequent edits. We develop several neural networks and use synthetic data to test their ability to learn challenging edit patterns that require strong generalization. 
We then collect and train our models on a large-scale dataset consisting of millions of fine-grained edits from thousands of Python developers.", "keywords": "Neural Networks;Program Synthesis;Source Code Modeling", "primary_area": "", "supplementary_material": "", "author": "Rui Zhao;David Bieber;Kevin Swersky;Daniel Tarlow", "authorids": "oahziur@gmail.com;dbieber@google.com;kswersky@google.com;dtarlow@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhao2019neural,\ntitle={Neural Networks for Modeling Source Code Edits},\nauthor={Rui Zhao and David Bieber and Kevin Swersky and Daniel Tarlow},\nyear={2019},\nurl={https://openreview.net/forum?id=Sklr9i09KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Sklr9i09KQ", "pdf_size": 0, "rating": "5;6;6;6", "confidence": "4;4;4;2", "wc_review": "495;771;377;263", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "710;797;590;222", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "rating_avg": [ 5.75, 0.4330127018922193 ], "confidence_avg": [ 3.5, 0.8660254037844386 ], "wc_review_avg": [ 476.5, 188.7822820076079 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 579.75, 219.233180654754 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.3333333333333333, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3006943601367901885&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SklrrhRqFX", "title": "Learning Physics Priors for Deep Reinforcement Learing", "track": "main", "status": "Reject", "tldr": "We propose a new approach to pre-train a physics prior from raw videos and incorporate it into an RL framework that allows for better learning and efficient generalization.", "abstract": "While model-based deep reinforcement learning (RL) holds great promise for sample efficiency and generalization, learning an accurate dynamics model is challenging and often requires substantial interactions with the environment. Further, a wide variety of domains have dynamics that share common foundations like the laws of physics, which are rarely exploited by these algorithms. Humans often acquire such physics priors that allow us to easily adapt to the dynamics of any environment. In this work, we propose an approach to learn such physics priors and incorporate them into an RL agent. Our method involves pre-training a frame predictor on raw videos and then using it to initialize the dynamics prediction model on a target task. Our prediction model, SpatialNet, is designed to implicitly capture localized physical phenomena and interactions. 
We show the value of incorporating this prior through empirical experiments on two different domains \u2013 a newly created PhysWorld and games from the Atari benchmark, outperforming competitive approaches and demonstrating effective transfer learning.", "keywords": "Model-Based Reinforcement Learning;Intuitive Physics", "primary_area": "", "supplementary_material": "", "author": "Yilun Du;Karthik Narasimhan", "authorids": "yilundu@openai.com;karthikn@cs.princeton.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndu2019learning,\ntitle={Learning Physics Priors for Deep Reinforcement Learing},\nauthor={Yilun Du and Karthik Narasimhan},\nyear={2019},\nurl={https://openreview.net/forum?id=SklrrhRqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SklrrhRqFX", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;5;4", "wc_review": "222;419;400", "wc_reply_reviewers": "113;71;228", "wc_reply_authors": "526;598;458", "reply_reviewers": "1;1;1", "reply_authors": "2;2;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 347.0, 88.72804892854721 ], "wc_reply_reviewers_avg": [ 137.33333333333334, 66.36431437317968 ], "wc_reply_authors_avg": [ 527.3333333333334, 57.16253629393605 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cnRs5Pcw2Z0J:scholar.google.com/&scioq=Learning+Physics+Priors+for+Deep+Reinforcement+Learing&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "title": "Competitive experience replay", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1091", "id": "Sklsm20ctX", "author_site": "Hao Liu, Alexander Trott, richard socher, Caiming Xiong", "tldr": "a novel method to learn with sparse reward using adversarial reward re-labeling", "abstract": "Deep learning has achieved remarkable successes in solving challenging reinforcement learning (RL) problems when dense reward function is provided. However, in sparse reward environment it still often suffers from the need to carefully shape reward function to guide policy optimization. This limits the applicability of RL in the real world since both reinforcement learning and domain-specific knowledge are required. It is therefore of great practical importance to develop algorithms which can learn from a binary signal indicating successful task completion or other unshaped, sparse reward signals. We propose a novel method called competitive experience replay, which efficiently supplements a sparse reward by placing learning in the context of an exploration competition between a pair of agents. Our method complements the recently proposed hindsight experience replay (HER) by inducing an automatic exploratory curriculum. We evaluate our approach on the tasks of reaching various goal locations in an ant maze and manipulating objects with a robotic arm. Each task provides only binary rewards indicating whether or not the goal is achieved. Our method asymmetrically augments these sparse rewards for a pair of agents each learning the same task, creating a competitive game designed to drive exploration. 
Extensive experiments demonstrate that this method leads to faster convergence and improved task performance.", "keywords": "reinforcement learning;sparse reward;goal-based learning", "primary_area": "", "supplementary_material": "", "author": "Hao Liu;Alexander Trott;Richard Socher;Caiming Xiong", "authorids": "lhao499@gmail.com;atrott@salesforce.com;rsocher@salesforce.com;cxiong@salesforce.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nliu2018competitive,\ntitle={Competitive experience replay},\nauthor={Hao Liu and Alexander Trott and Richard Socher and Caiming Xiong},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Sklsm20ctX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;7;7", "confidence": "4;4;5;4", "wc_review": "314;298;232;976", "wc_reply_reviewers": "87;0;0;0", "wc_reply_authors": "488;358;381;569", "reply_reviewers": "1;0;0;0", "reply_authors": "2;1;1;1", "rating_avg": [ 6.25, 0.82915619758885 ], "confidence_avg": [ 4.25, 0.4330127018922193 ], "wc_review_avg": [ 455.0, 302.3656726548171 ], "wc_reply_reviewers_avg": [ 21.75, 37.67210506462308 ], "wc_reply_authors_avg": [ 449.0, 84.89110671913755 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.5222329678670935, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8704818254702169597&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=Sklsm20ctX", "pdf": "https://openreview.net/pdf?id=Sklsm20ctX", "email": ";;;", "author_num": 4 }, { "id": "Skluy2RcK7", "title": "Selectivity metrics can overestimate the selectivity of units: a case study on AlexNet", "track": "main", "status": "Reject", "tldr": "Common selectivity metrics overestimate the selectivity of units, true object detectors are extremely rare, but class selectivity does increase with depth. ", "abstract": "Various methods of measuring unit selectivity have been developed in order to understand the representations learned by neural networks (NNs). Here we undertake a comparison of four such measures on AlexNet, namely, localist selectivity, precision (Zhou et al, ICLR 2015), class-conditional mean activity selectivity (CCMAS; Morcos et al, ICLR 2018), and a new measure called top-class selectivity. In contrast with previous work on recurrent neural networks (RNNs), we fail to find any 100\\% selective `localist units' in AlexNet, and demonstrate that the precision and CCMAS measures provide a much higher level of selectivity than is warranted, with the most selective hidden units only responding strongly to a small minority of images from within a category. We also generated activation maximization (AM) images that maximally activated individual units and found that under 5\\% of units in fc6 and conv5 produced interpretable images of objects, whereas fc8 produced over 50\\% interpretable images. Furthermore, the interpretable images in the hidden layers were not associated with highly selective units. 
These findings highlight the problem with current selectivity measures and show that new measures are required in order to provide a better assessment of learned representations in NNs. We also consider why localist representations are learned in RNNs and not AlexNet.", "keywords": "AlexNet;neural networks;selectivity;localist;distributed;represenataion;precision;measures of selectivity;object detectors;single directions;network analysis", "primary_area": "", "supplementary_material": "", "author": "Ella M. Gale;Anh Nguyen;Ryan Blything;Nicholas Martin and Jeffrey S. Bowers", "authorids": "ella.gale@gmail.com;anhnguyen@auburn.edu;ryan.blything@bristol.ac.uk;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\ngale2019selectivity,\ntitle={Selectivity metrics can overestimate the selectivity of units: a case study on AlexNet},\nauthor={Ella M. Gale and Anh Nguyen and Ryan Blything and Nicholas Martin and Jeffrey S. Bowers},\nyear={2019},\nurl={https://openreview.net/forum?id=Skluy2RcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Skluy2RcK7", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;3;3", "wc_review": "279;406;473", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "824;288;235", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 386.0, 80.45288476286396 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 449.0, 266.0463618745174 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:XTpP02lk59AJ:scholar.google.com/&scioq=Selectivity+metrics+can+overestimate+the+selectivity+of+units:+a+case+study+on+AlexNet&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Multi-Domain Adversarial Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/952", "id": "Sklv5iRqYX", "author_site": "Alice Schoenauer Sebag, Louise E Heinrich, Marc Schoenauer, Michele Sebag, Lani Wu, Steven Altschuler", "tldr": "Adversarial Domain adaptation and Multi-domain learning: a new loss to handle multi- and single-domain classes in the semi-supervised setting.", "abstract": "Multi-domain learning (MDL) aims at obtaining a model with minimal average risk across multiple domains. Our empirical motivation is automated microscopy data, where cultured cells are imaged after being exposed to known and unknown chemical perturbations, and each dataset displays significant experimental bias. This paper presents a multi-domain adversarial learning approach, MuLANN, to leverage multiple datasets with overlapping but distinct class sets, in a semi-supervised setting. 
Our contributions include: i) a bound on the average- and worst-domain risk in MDL, obtained using the H-divergence; ii) a new loss to accommodate semi-supervised multi-domain learning and domain adaptation; iii) the experimental validation of the approach, improving on the state of the art on two standard image benchmarks, and a novel bioimage dataset, Cell.", "keywords": "multi-domain learning;domain adaptation;adversarial learning;H-divergence;deep representation learning;high-content microscopy", "primary_area": "", "supplementary_material": "", "author": "Alice Schoenauer-Sebag;Louise Heinrich;Marc Schoenauer;Michele Sebag;Lani F. Wu;Steve J. Altschuler", "authorids": "alice.schoenauer@polytechnique.org;louise.heinrich@ucsf.edu;marc.schoenauer@inria.fr;sebag@lri.fr;lani.wu@ucsf.edu;steven.altschuler@ucsf.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nschoenauer-sebag2018multidomain,\ntitle={Multi-Domain Adversarial Learning},\nauthor={Alice Schoenauer-Sebag and Louise Heinrich and Marc Schoenauer and Michele Sebag and Lani Wu and Steve Altschuler},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Sklv5iRqYX},\n}", "github": "[![github](/images/github_icon.svg) AltschulerWu-Lab/MuLANN](https://github.com/AltschulerWu-Lab/MuLANN)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;5;5", "wc_review": "185;534;127", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "305;767;15", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 282.0, 179.7572437112526 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 362.3333333333333, 309.6679224947625 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.7559289460184544, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12918642192245741417&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Sklv5iRqYX", "pdf": "https://openreview.net/pdf?id=Sklv5iRqYX", "email": ";;;;;", "author_num": 6 }, { "id": "SklzIjActX", "title": "HIGHLY EFFICIENT 8-BIT LOW PRECISION INFERENCE OF CONVOLUTIONAL NEURAL NETWORKS", "track": "main", "status": "Reject", "tldr": "We present a general technique toward 8-bit low precision inference of convolutional neural networks. ", "abstract": "High throughput and low latency inference of deep neural networks are critical for the deployment of deep learning applications. This paper presents a general technique toward 8-bit low precision inference of convolutional neural networks, including 1) channel-wise scale factors of weights, especially for depthwise convolution, 2) Winograd convolution, and 3) topology-wise 8-bit support. We experiment the techniques on top of a widely-used deep learning framework. The 8-bit optimized model is automatically generated with a calibration process from FP32 model without the need of fine-tuning or retraining. 
We perform a systematic and comprehensive study on 18 widely-used convolutional neural networks and demonstrate the effectiveness of 8-bit low precision inference across a wide range of applications and use cases, including image classification, object detection, image segmentation, and super resolution. We show that the inference throughput\nand latency are improved by 1.6X and 1.5X respectively with minimal (within 0.6%) to no loss in accuracy from the FP32 baseline. We believe the methodology can provide the guidance and reference design of 8-bit low precision inference for other frameworks. All the code and models will be publicly available soon.", "keywords": "8-bit low precision inference;convolutional neural networks;statistical accuracy;8-bit Winograd convolution", "primary_area": "", "supplementary_material": "", "author": "Haihao Shen;Jiong Gong;Xiaoli Liu;Guoming Zhang;Ge Jin;and Eric Lin", "authorids": "haihao.shen@intel.com;jiong.gong@intel.com;xiaoli.liu@intel.com;guoming.zhang@intel.com;ge.jin@intel.com;eric.lin@intel.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nshen2019highly,\ntitle={{HIGHLY} {EFFICIENT} 8-{BIT} {LOW} {PRECISION} {INFERENCE} {OF} {CONVOLUTIONAL} {NEURAL} {NETWORKS}},\nauthor={Haihao Shen and Jiong Gong and Xiaoli Liu and Guoming Zhang and Ge Jin and and Eric Lin},\nyear={2019},\nurl={https://openreview.net/forum?id=SklzIjActX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SklzIjActX", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;4", "wc_review": "163;187;261", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 203.66666666666666, 41.7079795189788 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13513081191528891210&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SkxANsC9tQ", "title": "Learning Graph Representations by Dendrograms", "track": "main", "status": "Reject", "tldr": "Novel quality metric for hierarchical graph clustering", "abstract": "Hierarchical clustering is a common approach to analysing the \nmulti-scale structure of graphs observed in practice. \nWe propose a novel metric for assessing the quality of a hierarchical clustering. This metric reflects the ability to reconstruct the graph from the dendrogram encoding the hierarchy. The best representation of the graph for this metric in turn yields a novel hierarchical clustering algorithm. Experiments on both real and synthetic data illustrate the efficiency of the approach. 
\n", "keywords": "Graph;hierarchical clustering;dendrogram;quality metric;reconstruction;entropy", "primary_area": "", "supplementary_material": "", "author": "Thomas Bonald;Bertrand Charpentier", "authorids": "thomas.bonald@telecom-paristech.fr;bertrand.charpentier@live.fr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbonald2019learning,\ntitle={Learning Graph Representations by Dendrograms},\nauthor={Thomas Bonald and Bertrand Charpentier},\nyear={2019},\nurl={https://openreview.net/forum?id=SkxANsC9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SkxANsC9tQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;3;4", "wc_review": "158;310;294", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 254.0, 68.19579654690358 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5057290520051998731&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SkxJ-309FQ", "title": "Hallucinations in Neural Machine Translation", "track": "main", "status": "Reject", "tldr": "We introduce and analyze the phenomenon of \"hallucinations\" in NMT, or spurious translations unrelated to source text, and propose methods to reduce its frequency.", "abstract": "Neural machine translation (NMT) systems have reached state of the art performance in translating text and are in wide deployment. Yet little is understood about how these systems function or break. Here we show that NMT systems are susceptible to producing highly pathological translations that are completely untethered from the source material, which we term hallucinations. Such pathological translations are problematic because they are deeply disturbing to user trust and easy to find with a simple search. We describe a method to generate hallucinations and show that many common variations of the NMT architecture are susceptible to them. We study a variety of approaches to reduce the frequency of hallucinations, including data augmentation, dynamical systems and regularization techniques, showing that data augmentation significantly reduces hallucination frequency. 
Finally, we analyze networks that produce hallucinations and show that there are signatures in the attention matrix as well as in the hidden states of the decoder.", "keywords": "nmt;translate;dynamics;rnn", "primary_area": "", "supplementary_material": "", "author": "Katherine Lee;Orhan Firat;Ashish Agarwal;Clara Fannjiang;David Sussillo", "authorids": "katherinelee@google.com;orhanf@google.com;agarwal@google.com;clarafy@berkeley.edu;sussillo@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nlee2019hallucinations,\ntitle={Hallucinations in Neural Machine Translation},\nauthor={Katherine Lee and Orhan Firat and Ashish Agarwal and Clara Fannjiang and David Sussillo},\nyear={2019},\nurl={https://openreview.net/forum?id=SkxJ-309FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SkxJ-309FQ", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;5;4", "wc_review": "265;100;79", "wc_reply_reviewers": "76;0;0", "wc_reply_authors": "1083;386;112", "reply_reviewers": "2;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 148.0, 83.17451532771321 ], "wc_reply_reviewers_avg": [ 25.333333333333332, 35.82674358011841 ], "wc_reply_authors_avg": [ 527.0, 408.75502035652926 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.18898223650461357, "gs_citation": 154, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13047383023392116047&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "ProMP: Proximal Meta-Policy Search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/788", "id": "SkxXCi0qFX", "author_site": "Jonas Rothfuss, Dennis Lee, Ignasi Clavera, Tamim Asfour, Pieter Abbeel", "tldr": "A novel and theoretically grounded meta-reinforcement learning algorithm", "abstract": "Credit assignment in Meta-reinforcement learning (Meta-RL) is still poorly understood. Existing methods either neglect credit assignment to pre-adaptation behavior or implement it naively. This leads to poor sample-efficiency during meta-training as well as ineffective task identification strategies.\nThis paper provides a theoretical analysis of credit assignment in gradient-based Meta-RL. Building on the gained insights we develop a novel meta-learning algorithm that overcomes both the issue of poor credit assignment and previous difficulties in estimating meta-policy gradients. By controlling the statistical distance of both pre-adaptation and adapted policies during meta-policy search, the proposed algorithm endows efficient and stable meta-learning. 
Our approach leads to superior pre-adaptation policy behavior and consistently outperforms previous Meta-RL algorithms in sample-efficiency, wall-clock time, and asymptotic performance.", "keywords": "Meta-Reinforcement Learning;Meta-Learning;Reinforcement-Learning", "primary_area": "", "supplementary_material": "", "author": "Jonas Rothfuss;Dennis Lee;Ignasi Clavera;Tamim Asfour;Pieter Abbeel", "authorids": "jonas.rothfuss@gmail.com;dennisl88@berkeley.edu;iclavera@berkeley.edu;asfour@kit.edu;pabbeel@cs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nrothfuss2018promp,\ntitle={Pro{MP}: Proximal Meta-Policy Search},\nauthor={Jonas Rothfuss and Dennis Lee and Ignasi Clavera and Tamim Asfour and Pieter Abbeel},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkxXCi0qFX},\n}", "github": "[![github](/images/github_icon.svg) jonasrothfuss/promp](https://github.com/jonasrothfuss/promp) + [![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=SkxXCi0qFX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;9", "confidence": "3;3;3", "wc_review": "403;225;265", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "484;187;148", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 297.6666666666667, 76.25104735164116 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 273.0, 150.04665940966495 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 255, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5271959514847376578&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SkxXCi0qFX", "pdf": "https://openreview.net/pdf?id=SkxXCi0qFX", "email": ";;;;", "author_num": 5 }, { "title": "Don't Settle for Average, Go for the Max: Fuzzy Sets and Max-Pooled Word Vectors", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/705", "id": "SkxXg2C5FX", "author_site": "Vitalii Zhelezniak, Aleksandar D Savkov, April Shen, Francesco Moramarco, Jack Flann, Nils Hammerla", "tldr": "Max-pooled word vectors with fuzzy Jaccard set similarity are an extremely competitive baseline for semantic similarity; we propose a simple dynamic variant that performs even better.", "abstract": "Recent literature suggests that averaged word vectors followed by simple post-processing outperform many deep learning methods on semantic textual similarity tasks. Furthermore, when averaged word vectors are trained supervised on large corpora of paraphrases, they achieve state-of-the-art results on standard STS benchmarks. Inspired by these insights, we push the limits of word embeddings even further. We propose a novel fuzzy bag-of-words (FBoW) representation for text that contains all the words in the vocabulary simultaneously but with different degrees of membership, which are derived from similarities between word vectors. 
We show that max-pooled word vectors are only a special case of fuzzy BoW and should be compared via fuzzy Jaccard index rather than cosine similarity. Finally, we propose DynaMax, a completely unsupervised and non-parametric similarity measure that dynamically extracts and max-pools good features depending on the sentence pair. This method is both efficient and easy to implement, yet outperforms current baselines on STS tasks by a large margin and is even competitive with supervised word vectors trained to directly optimise cosine similarity.", "keywords": "word vectors;sentence representations;distributed representations;fuzzy sets;bag-of-words;unsupervised learning;word vector compositionality;max-pooling;Jaccard index", "primary_area": "", "supplementary_material": "", "author": "Vitalii Zhelezniak;Aleksandar Savkov;April Shen;Francesco Moramarco;Jack Flann;Nils Y. Hammerla", "authorids": "vitali.zhelezniak@babylonhealth.com;sasho.savkov@babylonhealth.com;april.shen@babylonhealth.com;francesco.moramarco@babylonhealth.com;jack.flann@babylonhealth.com;nils.hammerla@babylonhealth.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nzhelezniak2018dont,\ntitle={Don't Settle for Average, Go for the Max: Fuzzy Sets and Max-Pooled Word Vectors},\nauthor={Vitalii Zhelezniak and Aleksandar Savkov and April Shen and Francesco Moramarco and Jack Flann and Nils Y. Hammerla},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SkxXg2C5FX},\n}", "github": "[![github](/images/github_icon.svg) Babylonpartners/fuzzymax](https://github.com/Babylonpartners/fuzzymax) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SkxXg2C5FX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;8;8", "confidence": "3;4;3", "wc_review": "185;378;267", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "855;683;576", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 276.6666666666667, 79.08785550821875 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 704.6666666666666, 114.92702998956436 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17199150617564073243&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SkxXg2C5FX", "pdf": "https://openreview.net/pdf?id=SkxXg2C5FX", "email": ";;;;;", "author_num": 6 }, { "id": "SkxXwo0qYm", "title": "An Automatic Operation Batching Strategy for the Backward Propagation of Neural Networks Having Dynamic Computation Graphs", "track": "main", "status": "Reject", "tldr": "", "abstract": "Organizing the same operations in the computation graph of a neural network into batches is one of the important methods to improve the speed of training deep learning models and applications since it helps to execute operations with the same type in parallel and to make full use of the available hardware resources. 
This batching task is usually done by the developers manually and it becomes more difficult when the neural networks have dynamic computation graphs because of the input data with varying structures or the dynamic flow control. Several automatic batching strategies were proposed and integrated into some deep learning toolkits so that the programmers don\u2019t have to be responsible for this task. These strategies, however, will miss some important opportunities to group the operations in the backward propagation of training neural networks. In this paper, we propose a strategy which provides more efficient automatic batching and brings benefits to the memory access in the backward propagation. We also test our strategy on a variety of benchmarks with dynamic computation graphs. The result shows that it really brings further improvements in the training speed when our strategy is working with the existing automatic strategies.", "keywords": "Automatic Operation Batching;Dynamic Computation Graphs", "primary_area": "", "supplementary_material": "", "author": "Yuchen Qiao;Kenjiro Taura", "authorids": "qiao@eidos.ic.i.u-tokyo.ac.jp;tau@eidos.ic.i.u-tokyo.ac.jp", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nqiao2019an,\ntitle={An Automatic Operation Batching Strategy for the Backward Propagation of Neural Networks Having Dynamic Computation Graphs},\nauthor={Yuchen Qiao and Kenjiro Taura},\nyear={2019},\nurl={https://openreview.net/forum?id=SkxXwo0qYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkxXwo0qYm", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;5", "wc_review": "838;232;359", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 476.3333333333333, 260.9397546476112 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15354022096230078545&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SkxYOiCqKX", "title": "Pixel Chem: A Representation for Predicting Material Properties with Neural Network", "track": "main", "status": "Reject", "tldr": "Proposed a unified, physics based representation of material structures to predict various properties with neural network.", "abstract": "In this work we developed a new representation of the chemical information for the machine learning models, with benefits from both the real space (R-space) and energy space (K-space). Different from the previous symmetric matrix presentations, the charge transfer channel based on Pauling\u2019s electronegativity is derived from the dependence on real space distance and orbitals for the hetero atomic structures. This representation can work for the bulk materials as well as the low dimensional nano materials, and can map the R-space and K-space into the pixel space (P-space) by training and testing 130k structures. P-space can well reproduce the R-space quantities within error 0.53. 
This new asymmetric matrix representation doubles the information storage compared to the previous symmetric representations. This work provides a new dimension for computational chemistry toward machine learning architectures. ", "keywords": "material property prediction;neural network;material structure representation;chemistry", "primary_area": "", "supplementary_material": "", "author": "Shuqian Ye;Yanheng Xu;Jiechun Liang;Hao Xu;Shuhong Cai;Shixin Liu;Xi Zhu", "authorids": "115010269@link.cuhk.edu.cn;115010252@link.cuhk.edu.cn;116010125@link.cuhk.edu.cn;115010250@link.cuhk.edu.cn;115010111@link.cuhk.edu.cn;115010194@link.cuhk.edu.cn;zhuxi@cuhk.edu.cn", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nye2019pixel,\ntitle={Pixel Chem: A Representation for Predicting Material Properties with Neural Network},\nauthor={Shuqian Ye and Yanheng Xu and Jiechun Liang and Hao Xu and Shuhong Cai and Shixin Liu and Xi Zhu},\nyear={2019},\nurl={https://openreview.net/forum?id=SkxYOiCqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkxYOiCqKX", "pdf_size": 0, "rating": "1;3;3", "confidence": "5;5;3", "wc_review": "438;437;200", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "126;128;57", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 2.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 358.3333333333333, 111.95931800832341 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 103.66666666666667, 33.00841643513902 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Q_kU7cbyd_cJ:scholar.google.com/&scioq=Pixel+Chem:+A+Representation+for+Predicting+Material+Properties+with+Neural+Network&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkxZFoAqtQ", "title": "Improving Composition of Sentence Embeddings through the Lens of Statistical Relational Learning", "track": "main", "status": "Reject", "tldr": "We apply ideas from Statistical Relational Learning to compose sentence embeddings with more expressivity", "abstract": "Various NLP problems -- such as the prediction of sentence similarity, entailment, and discourse relations -- are all instances of the same general task: the modeling of semantic relations between a pair of textual elements. We call them textual relational problems. A popular model for textual relational problems is to embed sentences into fixed size vectors and use composition functions (e.g. difference or concatenation) of those vectors as features for the prediction. Meanwhile, composition of embeddings has been a main focus within the field of Statistical Relational Learning (SRL) whose goal is to predict relations between entities (typically from knowledge base triples). In this work, we show that textual relational models implicitly use compositions from baseline SRL models. We show that such compositions are not expressive enough for several tasks (e.g. natural language inference). 
We build on recent SRL models to address textual relational problems, showing that they are more expressive, and can alleviate issues from simpler compositions. The resulting models significantly improve the state of the art in both transferable sentence representation learning and relation prediction.", "keywords": "Statistical Relational Learning;Sentence Embedding;Composition functions;Natural Language Inference;InferSent;SentEval;ComplEx", "primary_area": "", "supplementary_material": "", "author": "Damien Sileo;Tim Van de Cruys;Camille Pradel;Philippe Muller", "authorids": "damien.sileo@synapse-fr.com;tim.van-de-cruys@irit.fr;camille.pradel@synapse-fr.com;philippe.muller@irit.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsileo2019improving,\ntitle={Improving Composition of Sentence Embeddings through the Lens of Statistical Relational Learning},\nauthor={Damien Sileo and Tim Van de Cruys and Camille Pradel and Philippe Muller},\nyear={2019},\nurl={https://openreview.net/forum?id=SkxZFoAqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkxZFoAqtQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;4", "wc_review": "239;547;771", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 519.0, 218.0886669835612 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3826511587403832930&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SkxbDsR9Ym", "title": "RelWalk -- A Latent Variable Model Approach to Knowledge Graph Embedding", "track": "main", "status": "Reject", "tldr": "We present a theoretically proven generative model of knowledge graph embedding. ", "abstract": "Knowledge Graph Embedding (KGE) is the task of jointly learning entity and relation embeddings for a given knowledge graph. Existing methods for learning KGEs can be seen as a two-stage process where (a) entities and relations in the knowledge graph are represented using some linear algebraic structures (embeddings), and (b) a scoring function is defined that evaluates the strength of a relation that holds between two entities using the corresponding relation and entity embeddings. Unfortunately, prior proposals for the scoring functions in the first step have been heuristically motivated, and it is unclear as to how the scoring functions in KGEs relate to the generation process of the underlying knowledge graph. To address this issue, we propose a generative account of the KGE learning task. Specifically, given a knowledge graph represented by a set of relational triples (h, R, t), where the semantic relation R holds between the two entities h (head) and t (tail), we extend the random walk model (Arora et al., 2016a) of word embeddings to KGE. We derive a theoretical relationship between the joint probability p(h, R, t) and the embeddings of h, R and t. 
Moreover, we show that marginal loss minimisation, a popular objective used by much prior work in KGE, follows naturally from the log-likelihood ratio maximisation under the probabilities estimated from the KGEs according to our theoretical relationship. We propose a learning objective motivated by the theoretical analysis to learn KGEs from a given knowledge graph. The KGEs learnt by our proposed method obtain state-of-the-art performance on FB15K237 and WN18RR benchmark datasets, providing empirical evidence in support of the theory.\n", "keywords": "relation representations;natural language processing;theoretical analysis;knowledge graphs", "primary_area": "", "supplementary_material": "", "author": "Danushka Bollegala;Huda Hakami;Yuichi Yoshida;Ken-ichi Kawarabayashi", "authorids": "danushka@liverpool.ac.uk;h.a.hakami@liverpool.ac.uk;yyoshida@nii.ac.jp;k_keniti@nii.ac.jp", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbollegala2019relwalk,\ntitle={RelWalk -- A Latent Variable Model Approach to Knowledge Graph Embedding},\nauthor={Danushka Bollegala and Huda Hakami and Yuichi Yoshida and Ken-ichi Kawarabayashi},\nyear={2019},\nurl={https://openreview.net/forum?id=SkxbDsR9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SkxbDsR9Ym", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;4", "wc_review": "280;237;314", "wc_reply_reviewers": "0;0;83", "wc_reply_authors": "197;493;299", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 277.0, 31.506613062445584 ], "wc_reply_reviewers_avg": [ 27.666666666666668, 39.12657522565563 ], "wc_reply_authors_avg": [ 329.6666666666667, 122.77169416803243 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2743542978786764388&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "Skxqni09KX", "title": "Online Bellman Residue Minimization via Saddle Point Optimization", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We study the problem of Bellman residual minimization with nonlinear function approximation in general. \n Based on a nonconvex saddle point formulation of Bellman residual minimization via Fenchel duality, we propose an online first-order algorithm with two-timescale learning rates. Using tools from stochastic approximation, we establish the convergence of our problem by approximating the dynamics of the iterates using two ordinary differential equations. Moreover, as a byproduct, we establish a finite-time convergence result under the assumption that the dual problem can be solved up to some error. 
Finally, numerical experiments are provided to back up our theory.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zhuoran Yang;Cheng Zhou;Tong Zhang;Han Liu", "authorids": "zy6@princeton.edu;mikechzhou@tencent.com;tongzhang@tongzhang-ml.org;hanliu.cmu@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Skxqni09KX", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "wc_review": "259;902;445", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "239;399;666", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 535.3333333333334, 270.1633250872928 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 434.6666666666667, 176.13694167386413 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ywXlPUej_tUJ:scholar.google.com/&scioq=Online+Bellman+Residue+Minimization+via+Saddle+Point+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkxxIs0qY7", "title": "CoT: Cooperative Training for Generative Modeling of Discrete Data", "track": "main", "status": "Reject", "tldr": "We proposed Cooperative Training, a novel training algorithm for generative modeling of discrete data.", "abstract": "We propose Cooperative Training (CoT) for training generative models that measure a tractable density for discrete data. CoT coordinately trains a generator G and an auxiliary predictive mediator M. The training target of M is to estimate a mixture density of the learned distribution G and the target distribution P, and that of G is to minimize the Jensen-Shannon divergence estimated through M. CoT achieves independent success without the necessity of pre-training via Maximum Likelihood Estimation or involving high-variance algorithms like REINFORCE. This low-variance algorithm is theoretically proved to be superior for both sample generation and likelihood prediction. 
We also theoretically and empirically show the superiority of CoT over most previous algorithms in terms of generative quality and diversity, predictive generalization ability and computational cost.", "keywords": "Generative Models;Sequence Modeling;Text Generation", "primary_area": "", "supplementary_material": "", "author": "Sidi Lu;Lantao Yu;Siyuan Feng;Yaoming Zhu;Weinan Zhang;Yong Yu", "authorids": "steve_lu@apex.sjtu.edu.cn;yulantao@apex.sjtu.edu.cn;siyuanfeng@apex.sjtu.edu;ymzhu@apex.sjtu.edu.cn;wnzhang@apex.sjtu.edu.cn;yyu@apex.sjtu.edu.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nlu2019cot,\ntitle={CoT: Cooperative Training for Generative Modeling of Discrete Data},\nauthor={Sidi Lu and Lantao Yu and Siyuan Feng and Yaoming Zhu and Weinan Zhang and Yong Yu},\nyear={2019},\nurl={https://openreview.net/forum?id=SkxxIs0qY7},\n}", "github": "[![github](/images/github_icon.svg) desire2020/Cooperative-Training](https://github.com/desire2020/Cooperative-Training) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SkxxIs0qY7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SkxxIs0qY7", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;2;2", "wc_review": "202;512;151", "wc_reply_reviewers": "127;0;5", "wc_reply_authors": "272;136;98", "reply_reviewers": "1;0;1", "reply_authors": "2;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 288.3333333333333, 159.52081020634546 ], "wc_reply_reviewers_avg": [ 44.0, 58.72534943843814 ], "wc_reply_authors_avg": [ 168.66666666666666, 74.6964226422896 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 34, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4231322493080735140&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "Skz-3j05tm", "title": "Graph Convolutional Network with Sequential Attention For Goal-Oriented Dialogue Systems", "track": "main", "status": "Reject", "tldr": "We propose a Graph Convolutional Network based encoder-decoder model with sequential attention for goal-oriented dialogue systems.", "abstract": "Domain specific goal-oriented dialogue systems typically require modeling three types of inputs, viz., (i) the knowledge-base associated with the domain, (ii) the history of the conversation, which is a sequence of utterances and (iii) the current utterance for which the response needs to be generated. While modeling these inputs, current state-of-the-art models such as Mem2Seq typically ignore the rich structure inherent in the knowledge graph and the sentences in the conversation context. Inspired by the recent success of structure-aware Graph Convolutional Networks (GCNs) for various NLP tasks such as machine translation, semantic role labeling and document dating, we propose a memory augmented GCN for goal-oriented dialogues. Our model exploits (i) the entity relation graph in a knowledge-base and (ii) the dependency graph associated with an utterance to compute richer representations for words and entities. 
Further, we take cognizance of the fact that in certain situations, such as, when the conversation is in a code-mixed language, dependency parsers may not be available. We show that in such situations we could use the global word co-occurrence graph and use it to enrich the representations of utterances. We experiment with the modified DSTC2 dataset and its recently released code-mixed versions in four languages and show that our method outperforms existing state-of-the-art methods, using a wide range of evaluation metrics.", "keywords": "Goal-oriented Dialogue Systems;Graph Convolutional Networks", "primary_area": "", "supplementary_material": "", "author": "Suman Banerjee;Mitesh M. Khapra", "authorids": "suman@cse.iitm.ac.in;miteshk@cse.iitm.ac.in", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbanerjee2019graph,\ntitle={Graph Convolutional Network with Sequential Attention For Goal-Oriented Dialogue Systems},\nauthor={Suman Banerjee and Mitesh M. Khapra},\nyear={2019},\nurl={https://openreview.net/forum?id=Skz-3j05tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Skz-3j05tm", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;4;2", "wc_review": "502;441;262", "wc_reply_reviewers": "0;0;25", "wc_reply_authors": "790;399;529", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 401.6666666666667, 101.85065319159989 ], "wc_reply_reviewers_avg": [ 8.333333333333334, 11.785113019775793 ], "wc_reply_authors_avg": [ 572.6666666666666, 162.58399538563307 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10326130881445805797&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "Skz3Q2CcFX", "title": "Visualizing and Understanding the Semantics of Embedding Spaces via Algebraic Formulae", "track": "main", "status": "Reject", "tldr": "We propose to use explicit vector algebraic formulae projection as an alternative way to visualize embedding spaces specifically tailored for goal-oriented analysis tasks and it outperforms t-SNE in our user study.", "abstract": "Embeddings are a fundamental component of many modern machine learning and natural language processing models.\nUnderstanding them and visualizing them is essential for gathering insights about the information they capture and the behavior of the models.\nState of the art in analyzing embeddings consists in projecting them in two-dimensional planes without any interpretable semantics associated to the axes of the projection, which makes detailed analyses and comparison among multiple sets of embeddings challenging.\nIn this work, we propose to use explicit axes defined as algebraic formulae over embeddings to project them into a lower dimensional, but semantically meaningful subspace, as a simple yet effective analysis and visualization methodology.\nThis methodology assigns an interpretable semantics to the measures of variability and the axes of visualizations, allowing for both comparisons among different sets of embeddings and fine-grained inspection of the embedding spaces.\nWe 
demonstrate the power of the proposed methodology through a series of case studies that make use of visualizations constructed around the underlying methodology and through a user study. The results show how the methodology is effective at providing more profound insights than classical projection methods and how it is widely applicable to many other use cases.", "keywords": "visualization;embeddings;representations;t-sne;natural;language;processing;machine;learning;algebra", "primary_area": "", "supplementary_material": "", "author": "Piero Molino;Yang Wang;Jiawei Zhang", "authorids": "piero@uber.com;gnavvy@uber.com;rivulet.zhang@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmolino2019visualizing,\ntitle={Visualizing and Understanding the Semantics of Embedding Spaces via Algebraic Formulae},\nauthor={Piero Molino and Yang Wang and Jiawei Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=Skz3Q2CcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Skz3Q2CcFX", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;4;3", "wc_review": "268;147;169", "wc_reply_reviewers": "0;83;0", "wc_reply_authors": "1036;856;387", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 194.66666666666666, 52.626567012826854 ], "wc_reply_reviewers_avg": [ 27.666666666666668, 39.12657522565563 ], "wc_reply_authors_avg": [ 759.6666666666666, 273.56941024577696 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uNnbA1HXxV4J:scholar.google.com/&scioq=Visualizing+and+Understanding+the+Semantics+of+Embedding+Spaces+via+Algebraic+Formulae&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SkzK4iC5Ym", "title": "Diminishing Batch Normalization", "track": "main", "status": "Reject", "tldr": "We propose a extension of the batch normalization, show a first-of-its-kind convergence analysis for this extension and show in numerical experiments that it has better performance than the original batch normalizatin.", "abstract": "In this paper, we propose a generalization of the BN algorithm, diminishing batch normalization (DBN), where we update the BN parameters in a diminishing moving average way. Batch normalization (BN) is very effective in accelerating the convergence of a neural network training phase that it has become a common practice. \nOur proposed DBN algorithm remains the overall structure of the original BN algorithm while introduces a weighted averaging update to some trainable parameters. \nWe provide an analysis of the convergence of the DBN algorithm that converges to a stationary point with respect to trainable parameters. Our analysis can be easily generalized for original BN algorithm by setting some parameters to constant. To the best knowledge of authors, this analysis is the first of its kind for convergence with Batch Normalization introduced. We analyze a two-layer model with arbitrary activation function. 
\nThe primary challenge of the analysis is the fact that some parameters are updated by gradient while others are not. \nThe convergence analysis applies to any activation function that satisfies our common assumptions.\nFor the analysis, we also show the sufficient and necessary conditions for the stepsizes and diminishing weights to ensure the convergence. \nIn the numerical experiments, we use more complex models with more layers and ReLU activation. We observe that DBN outperforms the original BN algorithm on Imagenet, MNIST, NI and CIFAR-10 datasets with reasonable complex FNN and CNN models.", "keywords": "deep learning;learning theory;convergence analysis;batch normalization", "primary_area": "", "supplementary_material": "", "author": "Yintai Ma;Diego Klabjan", "authorids": "yintaima2020@u.northwestern.edu;d-klabjan@northwestern.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nma2019diminishing,\ntitle={Diminishing Batch Normalization},\nauthor={Yintai Ma and Diego Klabjan},\nyear={2019},\nurl={https://openreview.net/forum?id=SkzK4iC5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SkzK4iC5Ym", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;5;4", "wc_review": "373;432;363", "wc_reply_reviewers": "31;0;0", "wc_reply_authors": "174;279;238", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 389.3333333333333, 30.44484995674784 ], "wc_reply_reviewers_avg": [ 10.333333333333334, 14.613540144521982 ], "wc_reply_authors_avg": [ 230.33333333333334, 43.2075096353503 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10068619178616430856&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "SkzeJ3A9F7", "title": "Beyond Games: Bringing Exploration to Robots in Real-world", "track": "main", "status": "Reject", "tldr": "", "abstract": "Exploration has been a long standing problem in both model-based and model-free learning methods for sensorimotor control. While there has been major advances over the years, most of these successes have been demonstrated in either video games or simulation environments. This is primarily because the rewards (even the intrinsic ones) are non-differentiable since they are function of the environment (which is a black-box). In this paper, we focus on the policy optimization aspect of the intrinsic reward function. Specifically, by using a local approximation, we formulate intrinsic reward as a differentiable function so as to perform policy optimization using likelihood maximization -- much like supervised learning instead of reinforcement learning. This leads to a significantly sample efficient exploration policy. Our experiments clearly show that our approach outperforms both on-policy and off-policy optimization approaches like REINFORCE and DQN respectively. But most importantly, we are able to implement an exploration policy on a robot which learns to interact with objects completely from scratch just using data collected via the differentiable exploration module. 
See project videos at https://doubleblindICLR.github.io/robot-exploration/", "keywords": "Exploration;curiosity;manipulation", "primary_area": "", "supplementary_material": "", "author": "Deepak Pathak;Dhiraj Gandhi;Abhinav Gupta", "authorids": "pathak@berkeley.edu;dgandhi@andrew.cmu.edu;abhinavg@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npathak2019beyond,\ntitle={Beyond Games: Bringing Exploration to Robots in Real-world},\nauthor={Deepak Pathak and Dhiraj Gandhi and Abhinav Gupta},\nyear={2019},\nurl={https://openreview.net/forum?id=SkzeJ3A9F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SkzeJ3A9F7", "pdf_size": 0, "rating": "3;3;5", "confidence": "4;5;3", "wc_review": "913;595;931", "wc_reply_reviewers": "496;0;346", "wc_reply_authors": "1661;669;872", "reply_reviewers": "2;0;1", "reply_authors": "4;1;2", "rating_avg": [ 3.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 813.0, 154.3243337908834 ], "wc_reply_reviewers_avg": [ 280.6666666666667, 207.6942196809745 ], "wc_reply_authors_avg": [ 1067.3333333333333, 427.8880954434491 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11043793608095336013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "Sy4G8sC9KX", "title": "N/A", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "N/A", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "N/A", "authorids": "vladymyrov@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Sy4G8sC9KX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "wc_review": "406;270;597", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 424.3333333333333, 134.12514885566972 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "Sy4lojC9tm", "title": "Dataset Distillation", "track": "main", "status": "Reject", "tldr": "We propose to distill a large dataset into a small set of synthetic data , so networks can achieve close to original performance when trained on these data.", "abstract": "Model distillation aims to distill the knowledge of a complex model into a simpler one. In this paper, we consider an alternative formulation called {\\em dataset distillation}: we keep the model fixed and instead attempt to distill the knowledge from a large training dataset into a small one. 
The idea is to {\\em synthesize} a small number of data points that do not need to come from the correct data distribution, but will, when given to the learning algorithm as training data, approximate the model trained on the original data. For example, we show that it is possible to compress $60,000$ MNIST training images into just $10$ synthetic {\\em distilled images} (one per class) and achieve close to original performance with only a few steps of gradient descent, given a particular fixed network initialization. We evaluate our method in a wide range of initialization settings and with different learning objectives. Experiments on multiple datasets show the advantage of our approach compared to alternative methods in most settings. ", "keywords": "knowledge distillation;deep learning;few-shot learning;adversarial attack", "primary_area": "", "supplementary_material": "", "author": "Tongzhou Wang;Jun-Yan Zhu;Antonio Torralba;Alexei A. Efros", "authorids": "tongzhou.wang.1994@gmail.com;junyanz@mit.edu;torralba@mit.edu;efros@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwang2019dataset,\ntitle={Dataset Distillation},\nauthor={Tongzhou Wang and Jun-Yan Zhu and Antonio Torralba and Alexei A. Efros},\nyear={2019},\nurl={https://openreview.net/forum?id=Sy4lojC9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Sy4lojC9tm", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;4", "wc_review": "398;158;844", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "711;204;455", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 466.6666666666667, 284.23620849958974 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 456.6666666666667, 206.9852383357057 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 804, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15986122155784801061&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "SyEGUi05Km", "title": "CrystalGAN: Learning to Discover Crystallographic Structures with Generative Adversarial Networks", "track": "main", "status": "Withdraw", "tldr": "\"Generating new chemical materials using novel cross-domain GANs.\"", "abstract": "Our main motivation is to propose an efficient approach to generate novel multi-element stable chemical compounds that can be used in real world applications. This task can be formulated as a combinatorial problem, and it takes many hours of human experts to construct, and to evaluate new data. Unsupervised learning methods such as Generative Adversarial Networks (GANs) can be efficiently used to produce new data. Cross-domain Generative Adversarial Networks were reported to achieve exciting results in image processing applications. However, in the domain of materials science, there is a need to synthesize data with higher order complexity compared to observed samples, and the state-of-the-art cross-domain GANs can not be adapted directly. 
\n\nIn this contribution, we propose a novel GAN called CrystalGAN which generates new chemically stable crystallographic structures with increased domain complexity. We introduce an original architecture, we provide the corresponding loss functions, and we show that the CrystalGAN generates very reasonable data. We illustrate the efficiency of the proposed method on a real original problem of novel hydrides discovery that can be further used in development of hydrogen storage materials.", "keywords": "Generative Adversarial Nets;Cross-Domain Learning;Materials Science;Higher-order Complexity", "primary_area": "", "supplementary_material": "", "author": "Asma Nouira;Nataliya Sokolovska;Jean-Claude Crivello", "authorids": "asma.nouira.91@gmail.com;nataliya.sokolovska@upmc.fr;jccrivello@icmpe.cnrs.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SyEGUi05Km", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;2;2", "wc_review": "443;339;232", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 1.699673171197595 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 338.0, 86.14329147801742 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.6933752452815365, "gs_citation": 151, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=156415901944925105&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "SyG1QnRqF7", "title": "Towards Resisting Large Data Variations via Introspective Learning", "track": "main", "status": "Withdraw", "tldr": "We propose a principled approach that endows classifiers with the ability to resist larger variations between training and testing data in an intelligent and efficient manner.", "abstract": "Learning deep networks which can resist large variations between training and testing data is essential to build accurate and robust image classifiers. Towards this end, a typical strategy is to apply data augmentation to enlarge the training set. However, standard data augmentation is essentially a brute-force strategy which is inefficient, as it performs all the pre-defined transformations to every training sample. In this paper, we propose a principled approach to train networks with significantly improved resistance to large variations between training and testing data. This is achieved by embedding a learnable transformation module into the introspective networks (Jin et al., 2017; Lazarow et al., 2017; Lee et al., 2018), which is a convolutional neural network (CNN) classifier empowered with generative capabilities. Our approach alternatively synthesizes pseudo-negative samples with learned transformations and enhances the classifier by retraining it with synthesized samples.
Experimental results verify that our approach significantly improves the ability of deep networks to resist large variations between training and testing data and achieves classification accuracy improvements on several benchmark datasets, including MNIST, affNIST, SVHN and CIFAR-10.", "keywords": "Introspective learning;Large variations resistance;Image classification;Generative models", "primary_area": "", "supplementary_material": "", "author": "Yunhan Zhao;Ye Tian;Wei Shen;Alan Yuille", "authorids": "yzhao83@jhu.edu;tytian@outlook.com;shenwei1231@gmail.com;alan.l.yuille@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyG1QnRqF7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "wc_review": "294;777;159", "wc_reply_reviewers": "0;125;0", "wc_reply_authors": "843;1489;410", "reply_reviewers": "0;1;0", "reply_authors": "2;3;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 410.0, 265.2960610337063 ], "wc_reply_reviewers_avg": [ 41.666666666666664, 58.92556509887896 ], "wc_reply_authors_avg": [ 914.0, 443.3516286951776 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qhRbAgB7ZbsJ:scholar.google.com/&scioq=Towards+Resisting+Large+Data+Variations+via+Introspective+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SyG4RiR5Ym", "title": "Neural Distribution Learning for generalized time-to-event prediction", "track": "main", "status": "Reject", "tldr": "We present a general solution to event prediction that has been there all along; Discrete Time Parametric Survival Analysis.", "abstract": "Predicting the time to the next event is an important task in various domains. \nHowever, due to censoring and irregularly sampled sequences, time-to-event prediction has resulted in limited success only for particular tasks, architectures and data. Using recent advances in probabilistic programming and density networks, we make the case for a generalized parametric survival approach, sequentially predicting a distribution over the time to the next event. \nUnlike previous work, the proposed method can use asynchronously sampled features for censored, discrete, and multivariate data. \nFurthermore, it achieves good performance and near perfect calibration for probabilistic predictions without using rigid network-architectures, multitask approaches, complex learning schemes or non-trivial adaptations of cox-models.
\nWe firmly establish that this can be achieved in the standard neural network framework by simply switching out the output layer and loss function.", "keywords": "Deep Learning;Survival Analysis;Event prediction;Time Series;Probabilistic Programming;Density Networks", "primary_area": "", "supplementary_material": "", "author": "Egil Martinsson;Adrian Kim;Jaesung Huh;Jaegul Choo;Jung-Woo Ha", "authorids": "egil.martinsson@gmail.com;adrian.kim@navercorp.com;jaesung.huh@navercorp.com;jchoo@korea.ac.kr;jungwoo.ha@navercorp.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmartinsson2019neural,\ntitle={Neural Distribution Learning for generalized time-to-event prediction},\nauthor={Egil Martinsson and Adrian Kim and Jaesung Huh and Jaegul Choo and Jung-Woo Ha},\nyear={2019},\nurl={https://openreview.net/forum?id=SyG4RiR5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SyG4RiR5Ym", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;4;5", "wc_review": "286;250;402", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1458;887;1504", "reply_reviewers": "0;0;0", "reply_authors": "2;2;3", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 312.6666666666667, 64.85539470408165 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1283.0, 280.6433086083947 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PZdKuq2Qo5IJ:scholar.google.com/&scioq=Neural+Distribution+Learning+for+generalized+time-to-event+prediction&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SyGjQ30qFX", "title": "TopicGAN: Unsupervised Text Generation from Explainable Latent Topics", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning discrete representations of data and then generating data from the discovered representations have been increasingly studied because the obtained discrete representations can benefit unsupervised learning. However, the performance of learning discrete representations of textual data with deep generative models has not been widely explored. In addition, although generative adversarial networks(GAN) have shown impressing results in many areas such as image generation, for text generation, it is notorious for extremely difficult to train. In this work, we propose TopicGAN, a two-step text generative model, which is able to solve those two important problems simultaneously. In the first step, it discovers the latent topics and produced bag-of-words according to the latent topics. In the second step, it generates text from the produced bag-of-words. 
In our experiments, we show our model can discover meaningful discrete latent topics of texts in an unsupervised fashion and generate high quality natural language from the discovered latent topics.", "keywords": "unsupervised learning;topic model;text generation", "primary_area": "", "supplementary_material": "", "author": "Yau-Shian Wang;Yun-Nung Chen;Hung-Yi Lee", "authorids": "king6101@gmail.com;y.v.chen@ieee.org;tlkagkb93901106@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2019topicgan,\ntitle={Topic{GAN}: Unsupervised Text Generation from Explainable Latent Topics},\nauthor={Yau-Shian Wang and Yun-Nung Chen and Hung-Yi Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=SyGjQ30qFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SyGjQ30qFX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;2;4", "wc_review": "447;504;248", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "513;305;220", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 399.6666666666667, 109.74009699689941 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 346.0, 123.07991983531134 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4840383335794097172&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Stable Opponent Shaping in Differentiable Games", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/642", "id": "SyGjjsC5tQ", "author_site": "Alistair Letcher, Jakob Foerster, David Balduzzi, Tim Rocktaeschel, Shimon Whiteson", "tldr": "Opponent shaping is a powerful approach to multi-agent learning but can prevent convergence; our SOS algorithm fixes this with strong guarantees in all differentiable games.", "abstract": "A growing number of learning methods are actually differentiable games whose players optimise multiple, interdependent objectives in parallel \u2013 from GANs and intrinsic curiosity to multi-agent RL. Opponent shaping is a powerful approach to improve learning dynamics in these games, accounting for player influence on others\u2019 updates. Learning with Opponent-Learning Awareness (LOLA) is a recent algorithm that exploits this response and leads to cooperation in settings like the Iterated Prisoner\u2019s Dilemma. Although experimentally successful, we show that LOLA agents can exhibit \u2018arrogant\u2019 behaviour directly at odds with convergence. In fact, remarkably few algorithms have theoretical guarantees applying across all (n-player, non-convex) games. In this paper we present Stable Opponent Shaping (SOS), a new method that interpolates between LOLA and a stable variant named LookAhead. We prove that LookAhead converges locally to equilibria and avoids strict saddles in all differentiable games. 
SOS inherits these essential guarantees, while also shaping the learning of opponents and consistently either matching or outperforming LOLA experimentally.", "keywords": "multi-agent learning;multiple interacting losses;opponent shaping;exploitation;convergence", "primary_area": "", "supplementary_material": "", "author": "Alistair Letcher;Jakob Foerster;David Balduzzi;Tim Rockt\u00e4schel;Shimon Whiteson", "authorids": "ahp.letcher@gmail.com;jakobfoerster@gmail.com;dbalduzzi@google.com;tim.rocktaeschel@gmail.com;shimon.whiteson@cs.ox.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nletcher2018stable,\ntitle={Stable Opponent Shaping in Differentiable Games},\nauthor={Alistair Letcher and Jakob Foerster and David Balduzzi and Tim Rockt\u00e4schel and Shimon Whiteson},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyGjjsC5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;8", "confidence": "1;2;4", "wc_review": "119;242;762", "wc_reply_reviewers": "0;0;216", "wc_reply_authors": "207;19;773", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 2.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 374.3333333333333, 278.6830138745852 ], "wc_reply_reviewers_avg": [ 72.0, 101.82337649086284 ], "wc_reply_authors_avg": [ 333.0, 320.4538448305257 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.9449111825230679, "gs_citation": 131, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9514174304819895562&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SyGjjsC5tQ", "pdf": "https://openreview.net/pdf?id=SyGjjsC5tQ", "email": ";;;;", "author_num": 5 }, { "title": "A Mean Field Theory of Batch Normalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/802", "id": "SyMDXnCcF7", "author_site": "Greg Yang, Jeffrey Pennington, Vinay Rao, Jascha Sohl-Dickstein, Samuel Schoenholz", "tldr": "Batch normalization causes exploding gradients in vanilla feedforward networks.", "abstract": "We develop a mean field theory for batch normalization in fully-connected feedforward neural networks. In so doing, we provide a precise characterization of signal propagation and gradient backpropagation in wide batch-normalized networks at initialization. Our theory shows that gradient signals grow exponentially in depth and that these exploding gradients cannot be eliminated by tuning the initial weight variances or by adjusting the nonlinear activation function. Indeed, batch normalization itself is the cause of gradient explosion. As a result, vanilla batch-normalized networks without skip connections are not trainable at large depths for common initialization schemes, a prediction that we verify with a variety of empirical simulations. While gradient explosion cannot be eliminated, it can be reduced by tuning the network close to the linear regime, which improves the trainability of deep batch-normalized networks without residual connections. 
Finally, we investigate the learning dynamics of batch-normalized networks and observe that after a single step of optimization the networks achieve a relatively stable equilibrium in which gradients have dramatically smaller dynamic range. Our theory leverages Laplace, Fourier, and Gegenbauer transforms and we derive new identities that may be of independent interest.", "keywords": "theory;batch normalization;mean field theory;trainability", "primary_area": "", "supplementary_material": "", "author": "Greg Yang;Jeffrey Pennington;Vinay Rao;Jascha Sohl-Dickstein;Samuel S. Schoenholz", "authorids": "gregyang@microsoft.com;jpennin@google.com;vinaysrao@google.com;jaschasd@google.com;schsam@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyang2018a,\ntitle={A Mean Field Theory of Batch Normalization},\nauthor={Greg Yang and Jeffrey Pennington and Vinay Rao and Jascha Sohl-Dickstein and Samuel S. Schoenholz},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyMDXnCcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "1;3;3", "wc_review": "159;154;208", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "222;282;570", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 173.66666666666666, 24.36299561949547 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 358.0, 151.8947003683802 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 217, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17665814041669020021&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SyMDXnCcF7", "pdf": "https://openreview.net/pdf?id=SyMDXnCcF7", "email": ";;;;", "author_num": 5 }, { "title": "Learning Exploration Policies for Navigation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/883", "id": "SyMWn05F7", "author_site": "Tao Chen, Saurabh Gupta, Abhinav Gupta", "tldr": "", "abstract": "Numerous past works have tackled the problem of task-driven navigation. But, how to effectively explore a new environment to enable a variety of down-stream tasks has received much less attention. In this work, we study how agents can autonomously explore realistic and complex 3D environments without the context of task-rewards. We propose a learning-based approach and investigate different policy architectures, reward functions, and training paradigms. We find that use of policies with spatial memory that are bootstrapped with imitation learning and finally finetuned with coverage rewards derived purely from on-board sensors can be effective at exploring novel environments. We show that our learned exploration policies can explore better than classical approaches based on geometry alone and generic learning-based exploration techniques. Finally, we also show how such task-agnostic exploration can be used for down-stream tasks. 
Videos are available at https://sites.google.com/view/exploration-for-nav/.", "keywords": "Exploration;navigation;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Tao Chen;Saurabh Gupta;Abhinav Gupta", "authorids": "taoc1@andrew.cmu.edu;sgupta@eecs.berkeley.edu;abhinavg@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nchen2018learning,\ntitle={Learning Exploration Policies for Navigation},\nauthor={Tao Chen and Saurabh Gupta and Abhinav Gupta},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyMWn05F7},\n}", "github": "[![github](/images/github_icon.svg) taochenshh/exp4nav](https://github.com/taochenshh/exp4nav) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SyMWn05F7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "3;7;7", "confidence": "5;4;5", "wc_review": "1079;176;595", "wc_reply_reviewers": "0;160;0", "wc_reply_authors": "1085;520;409", "reply_reviewers": "0;2;0", "reply_authors": "2;2;1", "rating_avg": [ 5.666666666666667, 1.8856180831641267 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 616.6666666666666, 368.9664242478199 ], "wc_reply_reviewers_avg": [ 53.333333333333336, 75.42472332656506 ], "wc_reply_authors_avg": [ 671.3333333333334, 295.99587084207025 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 288, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1526633576375251578&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SyMWn05F7", "pdf": "https://openreview.net/pdf?id=SyMWn05F7", "email": ";;", "author_num": 3 }, { "title": "Distribution-Interpolation Trade off in Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/865", "id": "SyMhLo0qKQ", "author_site": "Damian Le\u015bniak, Igor Sieradzki, Igor Podolak", "tldr": "We theoretically prove that linear interpolations are unsuitable for analysis of trained implicit generative models. ", "abstract": "We investigate the properties of multidimensional probability distributions in the context of latent space prior distributions of implicit generative models. Our work revolves around the phenomena arising while decoding linear interpolations between two random latent vectors -- regions of latent space in close proximity to the origin of the space are oversampled, which restricts the usability of linear interpolations as a tool to analyse the latent space. We show that the distribution mismatch can be eliminated completely by a proper choice of the latent probability distribution or using non-linear interpolations. We prove that there is a trade off between the interpolation being linear, and the latent distribution having even the most basic properties required for stable training, such as finite mean. 
We use the multidimensional Cauchy distribution as an example of the prior distribution, and also provide a general method of creating non-linear interpolations, that is easily applicable to a large family of commonly used latent distributions.", "keywords": "generative models;latent distribution;Cauchy distribution;interpolations", "primary_area": "", "supplementary_material": "", "author": "Damian Le\u015bniak;Igor Sieradzki;Igor Podolak", "authorids": "damian.lesniak@doctoral.uj.edu.pl;igor.sieradzki@doctoral.uj.edu.pl;igor.podolak@uj.edu.pl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nle\u015bniak2018distributioninterpolation,\ntitle={Distribution-Interpolation Trade off in Generative Models},\nauthor={Damian Le\u015bniak and Igor Sieradzki and Igor Podolak},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyMhLo0qKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;3", "wc_review": "573;548;534", "wc_reply_reviewers": "135;87;147", "wc_reply_authors": "944;590;597", "reply_reviewers": "1;1;1", "reply_authors": "2;2;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 551.6666666666666, 16.131404843417148 ], "wc_reply_reviewers_avg": [ 123.0, 25.92296279363144 ], "wc_reply_authors_avg": [ 710.3333333333334, 165.25199612174802 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17432671925239359715&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SyMhLo0qKQ", "pdf": "https://openreview.net/pdf?id=SyMhLo0qKQ", "email": ";;", "author_num": 3 }, { "id": "SyMras0cFQ", "title": "An adaptive homeostatic algorithm for the unsupervised learning of visual features", "track": "main", "status": "Reject", "tldr": "Unsupervised learning is hard and depends on normalisation heuristics. Can we find a simpler approach?", "abstract": "The formation of structure in the brain, that is, of the connections between cells within neural populations, is by large an unsupervised learning process: the emergence of this architecture is mostly self-organized. In the primary visual cortex of mammals, for example, one may observe during development the formation of cells selective to localized, oriented features. This leads to the development of a rough representation of contours of the retinal image in area V1. We modeled these mechanisms using sparse Hebbian learning algorithms. These algorithms alternate a coding step to encode the information with a learning step to find the proper encoder. A major difficulty faced by these algorithms is to deduce a good representation while knowing immature encoders, and to learn good encoders with a non-optimal representation. To address this problem, we propose to introduce a new regulation process between learning and coding, called homeostasis. Our homeostasis is compatible with a neuro-mimetic architecture and allows for the fast emergence of localized filters sensitive to orientation. 
The key to this algorithm lies in a simple adaptation mechanism based on non-linear functions that reconciles the antagonistic processes that occur at the coding and learning time scales. We tested this unsupervised algorithm with this homeostasis rule for a range of existing unsupervised learning algorithms coupled with different neural coding algorithms. In addition, we propose a simplification of this optimal homeostasis rule by implementing a simple heuristic on the probability of activation of neurons. Compared to the optimal homeostasis rule, we show that this heuristic allows to implement a more rapid unsupervised learning algorithm while keeping a large part of its effectiveness. These results demonstrate the potential application of such a strategy in machine learning and we illustrate this with one result in a convolutional neural network.", "keywords": "Sparse Coding;Unsupervised Learning;Natural Scene Statistics;Biologically Plausible Deep Networks;Visual Perception;Computer Vision", "primary_area": "", "supplementary_material": "", "author": "Victor Boutin;Angelo Franciosini;Laurent Perrinet", "authorids": "victor.boutin@univ-amu.fr;angelo.franciosini@univ-amu.fr;laurent.perrinet@univ-amu.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nboutin2019an,\ntitle={An adaptive homeostatic algorithm for the unsupervised learning of visual features},\nauthor={Victor Boutin and Angelo Franciosini and Laurent Perrinet},\nyear={2019},\nurl={https://openreview.net/forum?id=SyMras0cFQ},\n}", "github": "[![github](/images/github_icon.svg) bicv/SHL_scripts](https://github.com/bicv/SHL_scripts)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=SyMras0cFQ", "pdf_size": 0, "rating": "4;5;9", "confidence": "4;5;4", "wc_review": "391;268;135", "wc_reply_reviewers": "0;333;0", "wc_reply_authors": "505;807;209", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 2.160246899469287 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 264.6666666666667, 104.53813764469989 ], "wc_reply_reviewers_avg": [ 111.0, 156.97770542341354 ], "wc_reply_authors_avg": [ 507.0, 244.1365737997211 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.3273268353539885, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11837941361744939701&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "title": "Learning to Describe Scenes with Programs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/769", "id": "SyNPk2R9K7", "author_site": "Yunchao Liu, Zheng Wu, Daniel Ritchie, William Freeman, Joshua B Tenenbaum, Jiajun Wu", "tldr": "We present scene programs, a structured scene representation that captures both low-level object appearance and high-level regularity in the scene.", "abstract": "Human scene perception goes beyond recognizing a collection of objects and their pairwise relations. We understand higher-level, abstract regularities within the scene such as symmetry and repetition. Current vision recognition modules and scene representations fall short in this dimension. 
In this paper, we present scene programs, representing a scene via a symbolic program for its objects, attributes, and their relations. We also propose a model that infers such scene programs by exploiting a hierarchical, object-based scene representation. Experiments demonstrate that our model works well on synthetic data and transfers to real images with such compositional structure. The use of scene programs has enabled a number of applications, such as complex visual analogy-making and scene extrapolation.", "keywords": "Structured scene representations;program synthesis", "primary_area": "", "supplementary_material": "", "author": "Yunchao Liu;Zheng Wu;Daniel Ritchie;William T. Freeman;Joshua B. Tenenbaum;Jiajun Wu", "authorids": "georgeycliu@gmail.com;14wuzheng@sjtu.edu.cn;daniel_ritchie@brown.edu;billf@mit.edu;jbt@mit.edu;jiajunwu@mit.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nliu2018learning,\ntitle={Learning to Describe Scenes with Programs},\nauthor={Yunchao Liu and Jiajun Wu and Zheng Wu and Daniel Ritchie and William T. Freeman and Joshua B. Tenenbaum},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyNPk2R9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;4;3", "wc_review": "382;473;508", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "787;552;525", "reply_reviewers": "0;0;0", "reply_authors": "3;3;3", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 454.3333333333333, 53.1057644914582 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 621.3333333333334, 117.66147297319637 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17114631705849077181&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SyNPk2R9K7", "pdf": "https://openreview.net/pdf?id=SyNPk2R9K7", "email": ";;;;;", "author_num": 6 }, { "id": "SyNbRj09Y7", "title": "Visual Imitation Learning with Recurrent Siamese Networks", "track": "main", "status": "Reject", "tldr": "Learning a vision-based recurrent distance function to allow agents to imitate behaviours from noisy video data.", "abstract": "People are incredibly skilled at imitating others by simply observing them. They achieve this even in the presence of significant morphological differences and capabilities. Further, people are able to do this from raw perceptions of the actions of others, without direct access to the abstracted demonstration actions and with only partial state information. 
People therefore solve a difficult problem of understanding the salient features of both observations of others and the relationship to their own state when learning to imitate specific tasks.\nHowever, we can attempt to reproduce a similar demonstration via trail and error and through this gain more understanding of the task space.\nTo reproduce this ability an agent would need to both learn how to recognize the differences between itself and some demonstration and at the same time learn to minimize the distance between its own performance and that of the demonstration.\nIn this paper we propose an approach using only visual information to learn a distance metric between agent behaviour and a given video demonstration.\nWe train an RNN-based siamese model to compute distances in space and time between motion clips while training an RL policy to minimize this distance.\nFurthermore, we examine a particularly challenging form of this problem where the agent must learn an imitation based task given a single demonstration.\nWe demonstrate our approach in the setting of deep learning based control for physical simulation of humanoid walking in both 2D with $10$ degrees of freedom (DoF) and 3D with $38$ DoF.", "keywords": "Reinforcement Learning;Imitation Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Glen Berseth;Christopher J. Pal", "authorids": "gberseth@gmail.com;christopher.pal@polymtl.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nberseth2019visual,\ntitle={Visual Imitation Learning with Recurrent Siamese Networks},\nauthor={Glen Berseth and Christopher J. Pal},\nyear={2019},\nurl={https://openreview.net/forum?id=SyNbRj09Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SyNbRj09Y7", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;4;4", "wc_review": "650;435;703", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "74;0;85", "reply_reviewers": "0;0;0", "reply_authors": "1;0;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 596.0, 115.88212401689341 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 53.0, 37.74475681027322 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5502979714818153205&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Visceral Machines: Risk-Aversion in Reinforcement Learning with Intrinsic Physiological Rewards", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/911", "id": "SyNvti09KQ", "author_site": "Daniel McDuff, Ashish Kapoor", "tldr": "We present a novel approach to reinforcement learning that leverages a task-independent intrinsic reward function trained on peripheral pulse measurements that are correlated with human autonomic nervous system responses. 
", "abstract": " As people learn to navigate the world, autonomic nervous system (e.g., ``fight or flight) responses provide intrinsic feedback about the potential consequence of action choices (e.g., becoming nervous when close to a cliff edge or driving fast around a bend.) Physiological changes are correlated with these biological preparations to protect one-self from danger. We present a novel approach to reinforcement learning that leverages a task-independent intrinsic reward function trained on peripheral pulse measurements that are correlated with human autonomic nervous system responses. Our hypothesis is that such reward functions can circumvent the challenges associated with sparse and skewed rewards in reinforcement learning settings and can help improve sample efficiency. We test this in a simulated driving environment and show that it can increase the speed of learning and reduce the number of collisions during the learning stage.", "keywords": "Reinforcement Learning;Simulation;Affective Computing", "primary_area": "", "supplementary_material": "", "author": "Daniel McDuff;Ashish Kapoor", "authorids": "damcduff@microsoft.com;akapoor@microsoft.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nmcduff2018visceral,\ntitle={Visceral Machines: Reinforcement Learning with Intrinsic Physiological Rewards},\nauthor={Daniel McDuff and Ashish Kapoor},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyNvti09KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;5", "wc_review": "351;540;226", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "359;446;177", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 372.3333333333333, 129.07448323437993 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 327.3333333333333, 112.07834561393007 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13666297261471235343&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SyNvti09KQ", "pdf": "https://openreview.net/pdf?id=SyNvti09KQ", "email": ";", "author_num": 2 }, { "title": "Deep Frank-Wolfe For Neural Network Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/975", "id": "SyVU6s05K7", "author_site": "Leonard Berrada, Andrew Zisserman, M. Pawan Kumar", "tldr": "We train neural networks by locally linearizing them and using a linear SVM solver (Frank-Wolfe) at each iteration.", "abstract": "Learning a deep neural network requires solving a challenging optimization problem: it is a high-dimensional, non-convex and non-smooth minimization problem with a large number of terms. The current practice in neural network optimization is to rely on the stochastic gradient descent (SGD) algorithm or its adaptive variants. However, SGD requires a hand-designed schedule for the learning rate. 
In addition, its adaptive variants tend to produce solutions that generalize less well on unseen data than SGD with a hand-designed schedule. We present an optimization method that offers empirically the best of both worlds: our algorithm yields good generalization performance while requiring only one hyper-parameter. Our approach is based on a composite proximal framework, which exploits the compositional nature of deep neural networks and can leverage powerful convex optimization algorithms by design. Specifically, we employ the Frank-Wolfe (FW) algorithm for SVM, which computes an optimal step-size in closed-form at each time-step. We further show that the descent direction is given by a simple backward pass in the network, yielding the same computational cost per iteration as SGD. We present experiments on the CIFAR and SNLI data sets, where we demonstrate the significant superiority of our method over Adam, Adagrad, as well as the recently proposed BPGrad and AMSGrad. Furthermore, we compare our algorithm to SGD with a hand-designed learning rate schedule, and show that it provides similar generalization while often converging faster. The code is publicly available at https://github.com/oval-group/dfw.", "keywords": "optimization;conditional gradient;Frank-Wolfe;SVM", "primary_area": "", "supplementary_material": "", "author": "Leonard Berrada;Andrew Zisserman;M. Pawan Kumar", "authorids": "lberrada@robots.ox.ac.uk;az@robots.ox.ac.uk;pawan@robots.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nberrada2018deep,\ntitle={Deep Frank-Wolfe For Neural Network Optimization},\nauthor={Leonard Berrada and Andrew Zisserman and M. 
Pawan Kumar},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyVU6s05K7},\n}", "github": "[![github](/images/github_icon.svg) oval-group/dfw](https://github.com/oval-group/dfw)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;5;4", "wc_review": "146;456;169", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "395;403;167", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 257.0, 141.0271841407417 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 321.6666666666667, 109.41460394095277 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17584931574409094808&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=SyVU6s05K7", "pdf": "https://openreview.net/pdf?id=SyVU6s05K7", "email": ";;", "author_num": 3 }, { "id": "SyVhg20cK7", "title": "Inducing Cooperation via Learning to reshape rewards in semi-cooperative multi-agent reinforcement learning", "track": "main", "status": "Reject", "tldr": "We use an peer evaluation mechanism to make semi-cooperative agents learn collaborative strategies in multiagent reinforcement learning settings", "abstract": "We propose a deep reinforcement learning algorithm for semi-cooperative multi-agent tasks, where agents are equipped with their separate reward functions, yet with willingness to cooperate. Under these semi-cooperative scenarios, popular methods of centralized training with decentralized execution for inducing cooperation and removing the non-stationarity problem do not work well due to lack of a common shared reward as well as inscalability in centralized training. Our algorithm, called Peer-Evaluation based Dual DQN (PED-DQN), proposes to give peer evaluation signals to observed agents, which quantifies how they feel about a certain transition. This exchange of peer evaluation over time turns out to render agents to gradually reshape their reward functions so that their action choices from the myopic best-response tend to result in the good joint action with high cooperation. This evaluation-based method also allows flexible and scalable training by not assuming knowledge of the number of other agents and their observation and action spaces. We provide the performance evaluation of PED-DQN for the scenarios ranging from a simple two-person prisoner's dilemma to more complex semi-cooperative multi-agent tasks. 
In special cases where agents share a common reward function as in the centralized training methods, we show that inter-agent\nevaluation leads to better performance\n", "keywords": "multi-agent reinforcement learning;deep reinforcement learning;multi-agent systems", "primary_area": "", "supplementary_material": "", "author": "David Earl Hostallero;Daewoo Kim;Kyunghwan Son;Yung Yi", "authorids": "ddhostallero@kaist.ac.kr;kdw2139@gmail.com;khson@lanada.kaist.ac.kr;yiyung@kaist.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhostallero2019inducing,\ntitle={Inducing Cooperation via Learning to reshape rewards in semi-cooperative multi-agent reinforcement learning},\nauthor={David Earl Hostallero and Daewoo Kim and Kyunghwan Son and Yung Yi},\nyear={2019},\nurl={https://openreview.net/forum?id=SyVhg20cK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyVhg20cK7", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;3", "wc_review": "385;596;744", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "452;472;462", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 575.0, 147.3114614232941 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 462.0, 8.16496580927726 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:H5x3R5q178UJ:scholar.google.com/&scioq=Inducing+Cooperation+via+Learning+to+reshape+rewards+in+semi-cooperative+multi-agent+reinforcement+learning&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "SyVpB2RqFX", "title": "INFORMATION MAXIMIZATION AUTO-ENCODING", "track": "main", "status": "Reject", "tldr": "Information theoretical approach for unsupervised learning of unsupervised learning of a hybrid of discrete and continuous representations, ", "abstract": "We propose the Information Maximization Autoencoder (IMAE), an information theoretic approach to simultaneously learn continuous and discrete representations in an unsupervised setting. Unlike the Variational Autoencoder framework, IMAE starts from a stochastic encoder that seeks to map each input data to a hybrid discrete and continuous representation with the objective of maximizing the mutual information between the data and their representations. A decoder is included to approximate the posterior distribution of the data given their representations, where a high fidelity approximation can be achieved by leveraging the informative representations. \nWe show that the proposed objective is theoretically valid and provides a principled framework for understanding the tradeoffs regarding informativeness of each representation factor, disentanglement of representations, and decoding quality. 
", "keywords": "Information maximization;unsupervised learning of hybrid of discrete and continuous representations", "primary_area": "", "supplementary_material": "", "author": "Dejiao Zhang;Tianchen Zhao;Laura Balzano", "authorids": "dejiao@umich.edu;ericolon@umich.edu;girasole@umich.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2019information,\ntitle={{INFORMATION} {MAXIMIZATION} {AUTO}-{ENCODING}},\nauthor={Dejiao Zhang and Tianchen Zhao and Laura Balzano},\nyear={2019},\nurl={https://openreview.net/forum?id=SyVpB2RqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyVpB2RqFX", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;4", "wc_review": "266;745;177", "wc_reply_reviewers": "247;144;0", "wc_reply_authors": "1297;1579;714", "reply_reviewers": "2;1;0", "reply_authors": "4;3;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 396.0, 249.44070771761906 ], "wc_reply_reviewers_avg": [ 130.33333333333334, 101.29933640234549 ], "wc_reply_authors_avg": [ 1196.6666666666667, 360.19099871534206 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:VCyeGW0-d-UJ:scholar.google.com/&scioq=INFORMATION+MAXIMIZATION+AUTO-ENCODING&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "title": "LEARNING TO PROPAGATE LABELS: TRANSDUCTIVE PROPAGATION NETWORK FOR FEW-SHOT LEARNING", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/976", "id": "SyVuRiC5K7", "author_site": "Yanbin Liu, Juho Lee, Minseop Park, Saehoon Kim, Eunho Yang, Sung Ju Hwang, Yi Yang", "tldr": "We propose a novel meta-learning framework for transductive inference that classifies the entire test set at once to alleviate the low-data problem.", "abstract": "The goal of few-shot learning is to learn a classifier that generalizes well even when trained with a limited number of training instances per class. The recently introduced meta-learning approaches tackle this problem by learning a generic classifier across a large number of multiclass classification tasks and generalizing the model to a new task. Yet, even with such meta-learning, the low-data problem in the novel classification task still remains. In this paper, we propose Transductive Propagation Network (TPN), a novel meta-learning framework for transductive inference that classifies the entire test set at once to alleviate the low-data problem. Specifically, we propose to learn to propagate labels from labeled instances to unlabeled test instances, by learning a graph construction module that exploits the manifold structure in the data. TPN jointly learns both the parameters of feature embedding and the graph construction in an end-to-end manner. We validate TPN on multiple benchmark datasets, on which it largely outperforms existing few-shot learning approaches and achieves the state-of-the-art results. 
", "keywords": "few-shot learning;meta-learning;label propagation;manifold learning", "primary_area": "", "supplementary_material": "", "author": "Yanbin Liu;Juho Lee;Minseop Park;Saehoon Kim;Eunho Yang;Sung Ju Hwang;Yi Yang", "authorids": "csyanbin@gmail.com;juho.lee@stats.ox.ac.uk;mike_seop@aitrics.com;shkim@aitrics.com;eunhoy@kaist.ac.kr;sjhwang82@kaist.ac.kr;yi.yang@uts.edu.au", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nliu2018learning,\ntitle={{LEARNING} {TO} {PROPAGATE} {LABELS}: {TRANSDUCTIVE} {PROPAGATION} {NETWORK} {FOR} {FEW}-{SHOT} {LEARNING}},\nauthor={Yanbin Liu and Juho Lee and Minseop Park and Saehoon Kim and Eunho Yang and Sungju Hwang and Yi Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyVuRiC5K7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SyVuRiC5K7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "3;3;4", "wc_review": "192;149;127", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "586;378;412", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 156.0, 26.993826454703797 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 458.6666666666667, 91.10189655301122 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 990, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6189607533521090437&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=SyVuRiC5K7", "pdf": "https://openreview.net/pdf?id=SyVuRiC5K7", "email": ";;;;;;", "author_num": 7 }, { "id": "Sye2doC9tX", "title": "Exploration by Uncertainty in Reward Space", "track": "main", "status": "Reject", "tldr": "Exploration by Uncertainty in Reward Space", "abstract": "Efficient exploration plays a key role in reinforcement learning tasks. Commonly used dithering strategies, such as\u000f-greedy, try to explore the action-state space randomly; this can lead to large demand for samples. In this paper, We propose an exploration method based on the uncertainty in reward space. There are two policies in this approach, the exploration policy is used for exploratory sampling in the environment, then the benchmark policy try to update by the data proven by the exploration policy. Benchmark policy is used to provide the uncertainty in reward space, e.g. td-error, which guides the exploration policy updating. We apply our method on two grid-world environments and four Atari games. 
Experiment results show that our method improves learning speed and have a better performance than baseline policies", "keywords": "Policy Exploration;Uncertainty in Reward Space", "primary_area": "", "supplementary_material": "", "author": "Wei-Yang Qu;Yang Yu;Tang-Jie Lv;Ying-Feng Chen;Chang-Jie Fan", "authorids": "nju_qwy@163.com;yuy@nju.edu.cn;hzlvtangjie@corp.netease.com;chenyingfeng1@corp.netease.com;fanchangjie@corp.netease.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nqu2019exploration,\ntitle={Exploration by Uncertainty in Reward Space},\nauthor={Wei-Yang Qu and Yang Yu and Tang-Jie Lv and Ying-Feng Chen and Chang-Jie Fan},\nyear={2019},\nurl={https://openreview.net/forum?id=Sye2doC9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Sye2doC9tX", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;2;3", "wc_review": "431;302;79", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 270.6666666666667, 145.40136022594683 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9449111825230678, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JUBjTyQp0a4J:scholar.google.com/&scioq=Exploration+by+Uncertainty+in+Reward+Space&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "Sye7qoC5FQ", "title": "Adversarial Attacks on Node Embeddings", "track": "main", "status": "Reject", "tldr": "Adversarial attacks on unsupervised node embeddings based on eigenvalue perturbation theory.", "abstract": "The goal of network representation learning is to learn low-dimensional node embeddings that capture the graph structure and are useful for solving downstream tasks. However, despite the proliferation of such methods there is currently no study of their robustness to adversarial attacks. We provide the first adversarial vulnerability analysis on the widely used family of methods based on random walks. We derive efficient adversarial perturbations that poison the network structure and have a negative effect on both the quality of the embeddings and the downstream tasks. 
We further show that our attacks are transferable since they generalize to many models, and are successful even when the attacker is restricted.", "keywords": "node embeddings;adversarial attacks", "primary_area": "", "supplementary_material": "", "author": "Aleksandar Bojchevski;Stephan G\u00fcnnemann", "authorids": "a.bojchevski@in.tum.de;guennemann@in.tum.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbojchevski2019adversarial,\ntitle={Adversarial Attacks on Node Embeddings},\nauthor={Aleksandar Bojchevski and Stephan G\u00fcnnemann},\nyear={2019},\nurl={https://openreview.net/forum?id=Sye7qoC5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=Sye7qoC5FQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;4;3", "wc_review": "86;336;266", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "56;272;167", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 229.33333333333334, 105.30379332620878 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 165.0, 88.19297024139736 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "Sye8S209KX", "title": "Learning Robust, Transferable Sentence Representations for Text Classification", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Although deep recurrent neural networks (RNNs) demonstrate strong performance in text classification, training RNN models is often expensive and requires an extensive collection of annotated data which may not be available. To overcome the data limitation issue, existing approaches leverage either pre-trained word embeddings or sentence representations to lift the burden of training RNNs from scratch. In this paper, we show that jointly learning sentence representations from multiple text classification tasks and combining them with pre-trained word-level and sentence-level encoders results in robust sentence representations that are useful for transfer learning.
Extensive experiments and analyses using a wide range of transfer and linguistic tasks endorse the effectiveness of our approach.", "keywords": "sentence representations learning;multi-task learning;transfer learning", "primary_area": "", "supplementary_material": "", "author": "Wasi Uddin Ahmad;Xueying Bai;Nanyun Peng;Kai-Wei Chang", "authorids": "wasiahmad@cs.ucla.edu;xubai@cs.stonybrook.edu;npeng@isi.edu;kwchang@cs.ucla.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Sye8S209KX", "pdf_size": 0, "rating": "3;4;4", "confidence": "2;4;4", "wc_review": "151;321;236", "wc_reply_reviewers": "0;117;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;1;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 236.0, 69.40220937885671 ], "wc_reply_reviewers_avg": [ 39.0, 55.154328932550705 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10157740190719815279&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "SyeBqsRctm", "title": "Step-wise Sensitivity Analysis: Identifying Partially Distributed Representations for Interpretable Deep Learning", "track": "main", "status": "Reject", "tldr": "We find dependency graphs between learned representations as a first step towards building decision trees to interpret the representation manifold.", "abstract": " In this paper, we introduce a novel method, called step-wise sensitivity analysis, which makes three contributions towards increasing the interpretability of Deep Neural Networks (DNNs). First, we are the first to suggest a methodology that aggregates results across input stimuli to gain model-centric results. Second, we linearly approximate the neuron activation and propose to use the outlier weights to identify distributed code. Third, our method constructs a dependency graph of the relevant neurons across the network to gain fine-grained understanding of the nature and interactions of DNN's internal features. The dependency graph illustrates shared subgraphs that generalise across 10 classes and can be clustered into semantically related groups. 
This is the first step towards building decision trees as an interpretation of learned representations.", "keywords": "Interpretability;Interpretable Deep Learning;XAI;dependency graph;sensitivity analysis;outlier detection;instance-specific;model-centric", "primary_area": "", "supplementary_material": "", "author": "Botty Dimanov;Mateja Jamnik", "authorids": "botty.dimanov@cl.cam.ac.uk;mateja.jamnik@cl.cam.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndimanov2019stepwise,\ntitle={Step-wise Sensitivity Analysis: Identifying Partially Distributed Representations for Interpretable Deep Learning},\nauthor={Botty Dimanov and Mateja Jamnik},\nyear={2019},\nurl={https://openreview.net/forum?id=SyeBqsRctm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyeBqsRctm", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;4", "wc_review": "639;566;296", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 500.3333333333333, 147.52702200689276 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10518872150459679906&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "SyeKf30cFQ", "title": "A theoretical framework for deep and locally connected ReLU network", "track": "main", "status": "Reject", "tldr": "This paper presents a theoretical framework that models data distribution explicitly for deep and locally connected ReLU network", "abstract": "Understanding theoretical properties of deep and locally connected nonlinear network, such as deep convolutional neural network (DCNN), is still a hard problem despite its empirical success. In this paper, we propose a novel theoretical framework for such networks with ReLU nonlinearity. The framework bridges data distribution with gradient descent rules, favors disentangled representations and is compatible with common regularization techniques such as Batch Norm, after a novel discovery of its projection nature. The framework is built upon teacher-student setting, by projecting the student's forward/backward pass onto the teacher's computational graph. We do not impose unrealistic assumptions (e.g., Gaussian inputs, independence of activation, etc). Our framework could help facilitate theoretical analysis of many practical issues, e.g. disentangled representations in deep networks. 
", "keywords": "theoretical analysis;deep network;optimization;disentangled representation", "primary_area": "", "supplementary_material": "", "author": "Yuandong Tian", "authorids": "yuandong@fb.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ntian2019a,\ntitle={A theoretical framework for deep and locally connected Re{LU} network},\nauthor={Yuandong Tian},\nyear={2019},\nurl={https://openreview.net/forum?id=SyeKf30cFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyeKf30cFQ", "pdf_size": 0, "rating": "3;5;7", "confidence": "4;3;4", "wc_review": "314;642;175", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 1.632993161855452 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 377.0, 195.78729955404836 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18032244598698536454&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SyeLno09Fm", "title": "Few-Shot Intent Inference via Meta-Inverse Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "The applicability of inverse reinforcement learning is often hampered by the expense of collecting expert demonstrations; this paper seeks to broaden its applicability by incorporating prior task information through meta-learning.", "abstract": "A significant challenge for the practical application of reinforcement learning toreal world problems is the need to specify an oracle reward function that correctly defines a task. Inverse reinforcement learning (IRL) seeks to avoid this challenge by instead inferring a reward function from expert behavior. While appealing, it can be impractically expensive to collect datasets of demonstrations that cover the variation common in the real world (e.g. opening any type of door). Thus in practice, IRL must commonly be performed with only a limited set of demonstrations where it can be exceedingly difficult to unambiguously recover a reward function. In this work, we exploit the insight that demonstrations from other tasks can be used to constrain the set of possible reward functions by learning a \"prior\" that is specifically optimized for the ability to infer expressive reward functions from limited numbers of demonstrations. 
We demonstrate that our method can efficiently recover rewards from images for novel tasks and provide intuition as to how our approach is analogous to learning a prior.", "keywords": "Inverse Reinforcement Learning;Meta-Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Kelvin Xu;Ellis Ratner;Anca Dragan;Sergey Levine;Chelsea Finn", "authorids": "kelvinxu@eecs.berkeley.edu;eratner@berkeley.edu;anca@berkeley.edu;svlevine@eecs.berkeley.edu;cbfinn@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nxu2019fewshot,\ntitle={Few-Shot Intent Inference via Meta-Inverse Reinforcement Learning},\nauthor={Kelvin Xu and Ellis Ratner and Anca Dragan and Sergey Levine and Chelsea Finn},\nyear={2019},\nurl={https://openreview.net/forum?id=SyeLno09Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyeLno09Fm", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;3", "wc_review": "846;373;601", "wc_reply_reviewers": "67;0;117", "wc_reply_authors": "969;415;586", "reply_reviewers": "1;0;1", "reply_authors": "3;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 606.6666666666666, 193.14300976794948 ], "wc_reply_reviewers_avg": [ 61.333333333333336, 47.93282336307299 ], "wc_reply_authors_avg": [ 656.6666666666666, 231.62373702959627 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1057799504784879774&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SyeQFiCcF7", "title": "Siamese Capsule Networks", "track": "main", "status": "Reject", "tldr": "A variant of capsule networks that can be used for pairwise learning tasks. Results shows that Siamese Capsule Networks work well in the few shot learning setting.", "abstract": "Capsule Networks have shown encouraging results on defacto benchmark computer vision datasets such as MNIST, CIFAR and smallNORB. Although, they are yet to be tested on tasks where (1) the entities detected inherently have more complex internal representations and (2) there are very few instances per class to learn from and (3) where point-wise classification is not suitable. Hence, this paper carries out experiments on face verification in both controlled and uncontrolled settings that together address these points. In doing so we introduce Siamese Capsule Networks, a new variant that can be used for pairwise learning tasks. The model is trained using contrastive loss with l2-normalized capsule encoded pose features. 
We find that Siamese Capsule Networks perform well against strong baselines on both pairwise learning datasets, yielding best results in the few-shot learning setting where image pairs in the test set contain unseen subjects.", "keywords": "capsule networks;pairwise learning;few-shot learning;face verification", "primary_area": "", "supplementary_material": "", "author": "James O' Neill", "authorids": "james.o-neill@liverpool.ac.uk", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nneill2019siamese,\ntitle={Siamese Capsule Networks},\nauthor={James O' Neill},\nyear={2019},\nurl={https://openreview.net/forum?id=SyeQFiCcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SyeQFiCcF7", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "wc_review": "568;365;163", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;188;27", "reply_reviewers": "0;0;0", "reply_authors": "0;1;1", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 365.3333333333333, 165.3407256411909 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 71.66666666666667, 82.99531445944538 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8110348816317015117&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "Syeben09FQ", "title": "Evaluating GANs via Duality", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative Adversarial Networks (GANs) have shown great results in accurately modeling complex distributions, but their training is known to be difficult due to instabilities caused by a challenging minimax optimization problem. This is especially troublesome given the lack of an evaluation metric that can reliably detect non-convergent behaviors. We leverage the notion of duality gap from game theory in order to propose a novel convergence metric for GANs that has low computational cost. We verify the validity of the proposed metric for various test scenarios commonly used in the literature. 
", "keywords": "Generative Adversarial Networks;GANs;game theory", "primary_area": "", "supplementary_material": "", "author": "Paulina Grnarova;Kfir Y Levy;Aurelien Lucchi;Nathanael Perraudin;Thomas Hofmann;Andreas Krause", "authorids": "paulina.grnarova@inf.ethz.ch;yehuda.levy@inf.ethz.ch;aurelien.lucchi@inf.ethz.ch;nathanael.perraudin@sdsc.ethz.ch;thomas.hofmann@inf.ethz.ch;krausea@ethz.ch", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ngrnarova2019evaluating,\ntitle={Evaluating {GAN}s via Duality},\nauthor={Paulina Grnarova and Kfir Y Levy and Aurelien Lucchi and Nathanael Perraudin and Thomas Hofmann and Andreas Krause},\nyear={2019},\nurl={https://openreview.net/forum?id=Syeben09FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Syeben09FQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;3", "wc_review": "232;492;190", "wc_reply_reviewers": "0;346;0", "wc_reply_authors": "724;1318;915", "reply_reviewers": "0;1;0", "reply_authors": "1;3;2", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 304.6666666666667, 133.56979033033215 ], "wc_reply_reviewers_avg": [ 115.33333333333333, 163.10596419369696 ], "wc_reply_authors_avg": [ 985.6666666666666, 247.5942020502275 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14511328782066389538&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SyehMhC9Y7", "title": "Deep Imitative Models for Flexible Inference, Planning, and Control", "track": "main", "status": "Reject", "tldr": "Hybrid Vision-Driven Imitation Learning and Model-Based Reinforcement Learning for Planning, Forecasting, and Control", "abstract": "Imitation learning provides an appealing framework for autonomous control: in many tasks, demonstrations of preferred behavior can be readily obtained from human experts, removing the need for costly and potentially dangerous online data collection in the real world. However, policies learned with imitation learning have limited flexibility to accommodate varied goals at test time. Model-based reinforcement learning (MBRL) offers considerably more flexibility, since a predictive model learned from data can be used to achieve various goals at test time. However, MBRL suffers from two shortcomings. First, the model does not help to choose desired or safe outcomes -- its dynamics estimate only what is possible, not what is preferred. Second, MBRL typically requires additional online data collection to ensure that the model is accurate in those situations that are actually encountered when attempting to achieve test time goals. Collecting this data with a partially trained model can be dangerous and time-consuming. In this paper, we aim to combine the benefits of imitation learning and MBRL, and propose imitative models: probabilistic predictive models able to plan expert-like trajectories to achieve arbitrary goals. 
We find this method substantially outperforms both direct imitation and MBRL in a simulated autonomous driving task, and can be learned efficiently from a fixed set of expert demonstrations without additional online data collection. We also show our model can flexibly incorporate user-supplied costs at test-time, can plan to sequences of goals, and can even perform well with imprecise goals, including goals on the wrong side of the road.", "keywords": "imitation learning;forecasting;computer vision", "primary_area": "", "supplementary_material": "", "author": "Nicholas Rhinehart;Rowan McAllister;Sergey Levine", "authorids": "nrhineha@cs.cmu.edu;rmcallister@berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nrhinehart2019deep,\ntitle={Deep Imitative Models for Flexible Inference, Planning, and Control},\nauthor={Nicholas Rhinehart and Rowan McAllister and Sergey Levine},\nyear={2019},\nurl={https://openreview.net/forum?id=SyehMhC9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SyehMhC9Y7", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;1;3", "wc_review": "293;125;868", "wc_reply_reviewers": "0;0;362", "wc_reply_authors": "384;109;1200", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 1.632993161855452 ], "wc_review_avg": [ 428.6666666666667, 318.13658841587034 ], "wc_reply_reviewers_avg": [ 120.66666666666667, 170.64843652635346 ], "wc_reply_authors_avg": [ 564.3333333333334, 463.29280398277524 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 170, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=599185864570432210&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "Syeil309tX", "title": "Optimized Gated Deep Learning Architectures for Sensor Fusion", "track": "main", "status": "Reject", "tldr": "Optimized gated deep learning architectures for sensor fusion are proposed.", "abstract": "Sensor fusion is a key technology that integrates various sensory inputs to allow for robust decision making in many applications such as autonomous driving and robot control. Deep neural networks have been adopted for sensor fusion in a body of recent studies. Among these, the so-called netgated architecture was proposed, which has demonstrated improved performance over conventional convolutional neural networks (CNNs). In this paper, we address several limitations of the baseline netgated architecture by proposing two further optimized architectures: a coarser-grained gated architecture employing (feature) group-level fusion weights and a two-stage gated architecture leveraging both the group-level and feature-level fusion weights.
Using driving mode prediction and human activity recognition datasets, we demonstrate the significant performance improvements brought by the proposed gated architectures and also their robustness in the presence of sensor noise and failures.\n", "keywords": "deep learning;convolutional neural network;sensor fusion;activity recognition", "primary_area": "", "supplementary_material": "", "author": "Myung Seok Shim;Peng Li", "authorids": "mrshim1101@tamu.edu;pli@tamu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nshim2019optimized,\ntitle={Optimized Gated Deep Learning Architectures for Sensor Fusion},\nauthor={Myung Seok Shim and Peng Li},\nyear={2019},\nurl={https://openreview.net/forum?id=Syeil309tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=Syeil309tX", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;5", "wc_review": "276;398;779", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 484.3333333333333, 214.2309242123762 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16405820088921327667&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "SyerAiCqt7", "title": "Hierarchical Bayesian Modeling for Clustering Sparse Sequences in the Context of Group Profiling", "track": "main", "status": "Reject", "tldr": "Hierarchical Bayesian Modeling for Clustering Sparse Sequences; user group modeling using behavioral data", "abstract": "This paper proposes a hierarchical Bayesian model for clustering sparse sequences. This is a mixture model and does not need the data to be represented by a Gaussian mixture, which gives significant modelling freedom. It also generates a very interpretable profile for the discovered latent groups. The data that was used for this work was contributed by a restaurant loyalty program company. The data is a collection of sparse sequences where each entry of each sequence is the number of user visits to some restaurant in one week.
This algorithm successfully clustered the data and calculated the expected user affiliation in each cluster.", "keywords": "Hierarchical Bayesian Modeling;Sparse sequence clustering;Group profiling;User group modeling", "primary_area": "", "supplementary_material": "", "author": "Ishani Chakraborty", "authorids": "ishani.chakrab@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nchakraborty2019hierarchical,\ntitle={Hierarchical Bayesian Modeling for Clustering Sparse Sequences in the Context of Group Profiling},\nauthor={Ishani Chakraborty},\nyear={2019},\nurl={https://openreview.net/forum?id=SyerAiCqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2;AnonReviewer1;AnonReviewer5", "site": "https://openreview.net/forum?id=SyerAiCqt7", "pdf_size": 0, "rating": "1;2;2;2;3", "confidence": "5;4;5;5;4", "wc_review": "176;75;42;110;191", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "0;0;0;0;0", "reply_reviewers": "0;0;0;0;0", "reply_authors": "0;0;0;0;0", "rating_avg": [ 2.0, 0.6324555320336759 ], "confidence_avg": [ 4.6, 0.4898979485566356 ], "wc_review_avg": [ 118.8, 57.234255476943176 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.6454972243679028, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:INEoz9eZZhwJ:scholar.google.com/&scioq=Hierarchical+Bayesian+Modeling+for+Clustering+Sparse+Sequences+in+the+Context+of+Group+Profiling&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "Syez3j0cKX", "title": "Dissecting an Adversarial framework for Information Retrieval", "track": "main", "status": "Reject", "tldr": "Points out problems in loss function used in IRGAN, a recently proposed GAN framework for Information Retrieval. Further, a model motivated by co-training is proposed, which achieves better performance.", "abstract": "Recent advances in Generative Adversarial Networks facilitated by improvements to the framework and successful application to various problems has resulted in extensions to multiple domains. IRGAN attempts to leverage the framework for Information-Retrieval (IR), a task that can be described as modeling the correct conditional probability distribution p(d|q) over the documents (d), given the query (q). The work that proposes IRGAN claims that optimizing their minimax loss function will result in a generator which can learn the distribution, but their setup and baseline term steer the model away from an exact adversarial formulation, and this work attempts to point out certain inaccuracies in their formulation. 
Analyzing their loss curves gives insight into possible mistakes in the loss functions and better performance can be obtained by using the co-training like setup we propose, where two models are trained in a co-operative rather than an adversarial fashion.", "keywords": "GAN;Deep Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Ameet Deshpande;Mitesh M.Khapra", "authorids": "cs15b001@cse.iitm.ac.in;miteshk@cse.iitm.ac.in", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndeshpande2019dissecting,\ntitle={Dissecting an Adversarial framework for Information Retrieval},\nauthor={Ameet Deshpande and Mitesh M.Khapra},\nyear={2019},\nurl={https://openreview.net/forum?id=Syez3j0cKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=Syez3j0cKX", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;4", "wc_review": "674;262;869", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "779;246;357", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 601.6666666666666, 253.03008165477524 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 460.6666666666667, 229.6117496025749 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4097233475304787946&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "id": "SyezvsC5tX", "title": "The loss landscape of overparameterized neural networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We explore some mathematical features of the loss landscape of overparameterized neural networks. A priori one might imagine that the loss function looks like a typical function from $\\mathbb{R}^n$ to $\\mathbb{R}$ - in particular, nonconvex, with discrete global minima. In this paper, we prove that in at least one important way, the loss function of an overparameterized neural network does not look like a typical function. If a neural net has $n$ parameters and is trained on $d$ data points, with $n>d$, we show that the locus $M$ of global minima of $L$ is usually not discrete, but rather an $n-d$ dimensional submanifold of $\\mathbb{R}^n$. In practice, neural nets commonly have orders of magnitude more parameters than data points, so this observation implies that $M$ is typically a very high-dimensional subset of $\\mathbb{R}^n$. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Y. Cooper", "authorids": "yaim@math.ias.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ncooper2019the,\ntitle={The loss landscape of overparameterized neural networks},\nauthor={Y. 
Cooper},\nyear={2019},\nurl={https://openreview.net/forum?id=SyezvsC5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SyezvsC5tX", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;3", "wc_review": "281;536;246", "wc_reply_reviewers": "337;221;0", "wc_reply_authors": "652;818;120", "reply_reviewers": "2;2;0", "reply_authors": "3;4;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 354.3333333333333, 129.24997313560866 ], "wc_reply_reviewers_avg": [ 186.0, 139.78793462479751 ], "wc_reply_authors_avg": [ 530.0, 297.72918343129663 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.9428090415820634 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17308884340236852365&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "Syf9Q209YQ", "title": "Manifold regularization with GANs for semi-supervised learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative Adversarial Networks are powerful generative models that can model the manifold of natural images. We leverage this property to perform manifold regularization by approximating a variant of the Laplacian norm using a Monte Carlo approximation that is easily computed with the GAN. When incorporated into the semi-supervised feature-matching GAN we achieve state-of-the-art results for semi-supervised learning on CIFAR-10 benchmarks when few labels are used, with a method that is significantly easier to implement than competing methods. 
We find that manifold regularization improves the quality of generated images, and is affected by the quality of the GAN used to approximate the regularizer.", "keywords": "semi-supervised learning;generative adversarial networks;manifold regularization", "primary_area": "", "supplementary_material": "", "author": "Bruno Lecouat;Chuan-Sheng Foo;Houssam Zenati;Vijay Chandrasekhar", "authorids": "bruno_lecouat@i2r.a-star.edu.sg;foo_chuan_sheng@i2r.a-star.edu.sg;houssam.zenati@student.ecp.fr;vijay@i2r.a-star.edu.sg", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlecouat2019manifold,\ntitle={Manifold regularization with {GAN}s for semi-supervised learning},\nauthor={Bruno Lecouat and Chuan-Sheng Foo and Houssam Zenati and Vijay Chandrasekhar},\nyear={2019},\nurl={https://openreview.net/forum?id=Syf9Q209YQ},\n}", "github": "[![github](/images/github_icon.svg) bruno-31/gan-manifold-reg](https://github.com/bruno-31/gan-manifold-reg)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Syf9Q209YQ", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "wc_review": "403;414;309", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "404;713;320", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 375.3333333333333, 47.11923410054775 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 479.0, 168.9792886717186 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15828936131520593648&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Improving the Generalization of Adversarial Training with Domain Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/675", "id": "SyfIfnC5Ym", "author_site": "Chuanbiao Song, Kun He, Liwei Wang, John E Hopcroft", "tldr": "We propose a novel adversarial training with domain adaptation method that significantly improves the generalization ability on adversarial examples from different attacks.", "abstract": "By injecting adversarial examples into training data, adversarial training is promising for improving the robustness of deep learning models. However, most existing adversarial training approaches are based on a specific type of adversarial attack. It may not provide sufficiently representative samples from the adversarial domain, leading to a weak generalization ability on adversarial examples from other attacks. Moreover, during the adversarial training, adversarial perturbations on inputs are usually crafted by fast single-step adversaries so as to scale to large datasets. This work is mainly focused on the adversarial training yet efficient FGSM adversary. In this scenario, it is difficult to train a model with great generalization due to the lack of representative adversarial samples, aka the samples are unable to accurately reflect the adversarial domain. To alleviate this problem, we propose a novel Adversarial Training with Domain Adaptation (ATDA) method. 
Our intuition is to regard the adversarial training on the FGSM adversary as a domain adaptation task with a limited number of target domain samples. The main idea is to learn a representation that is semantically meaningful and domain invariant on the clean domain as well as the adversarial domain. Empirical evaluations on Fashion-MNIST, SVHN, CIFAR-10 and CIFAR-100 demonstrate that ATDA can greatly improve the generalization of adversarial training and the smoothness of the learned models, and outperforms state-of-the-art methods on standard benchmark datasets. To show the transfer ability of our method, we also extend ATDA to the adversarial training on iterative attacks such as PGD-Adversarial Training (PAT), and the defense performance is improved considerably.", "keywords": "adversarial training;domain adaptation;adversarial example;deep learning", "primary_area": "", "supplementary_material": "", "author": "Chuanbiao Song;Kun He;Liwei Wang;John E. Hopcroft", "authorids": "cbsong@hust.edu.cn;brooklet60@hust.edu.cn;wanglw@pku.edu.cn;jeh@cs.cornell.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsong2018improving,\ntitle={Improving the Generalization of Adversarial Training with Domain Adaptation},\nauthor={Chuanbiao Song and Kun He and Liwei Wang and John E. Hopcroft},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyfIfnC5Ym},\n}", "github": "[![github](/images/github_icon.svg) JHL-HUST/ATDA](https://github.com/JHL-HUST/ATDA) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SyfIfnC5Ym)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;6", "confidence": "2;4;3", "wc_review": "172;134;272", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "228;530;402", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 192.66666666666666, 58.20271089524572 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 386.6666666666667, 123.7668058173201 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 170, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12534049630846932475&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=SyfIfnC5Ym", "pdf": "https://openreview.net/pdf?id=SyfIfnC5Ym", "email": ";;;", "author_num": 4 }, { "id": "SyfXKoRqFQ", "title": "Ada-Boundary: Accelerating the DNN Training via Adaptive Boundary Batch Selection", "track": "main", "status": "Reject", "tldr": "We suggest a smart batch selection technique called Ada-Boundary.", "abstract": "Neural networks can converge faster with help from a smarter batch selection strategy. In this regard, we propose Ada-Boundary, a novel adaptive-batch selection algorithm that constructs an effective mini-batch according to the learning progress of the model. Our key idea is to present confusing samples, for which the model is uncertain what the true label is. Thus, the samples near the current decision boundary are considered the most effective to expedite convergence.
Taking advantage of our design, Ada-Boundary maintains its dominance in various degrees of training difficulty. We demonstrate the advantage of Ada-Boundary by extensive experiments using two convolutional neural networks for three benchmark data sets. The experiment results show that Ada-Boundary improves the training time by up to 31.7% compared with the state-of-the-art strategy and by up to 33.5% compared with the baseline strategy.", "keywords": "acceleration;batch selection;convergence;decision boundary", "primary_area": "", "supplementary_material": "", "author": "Hwanjun Song;Sundong Kim;Minseok Kim;Jae-Gil Lee", "authorids": "songhwanjun@kaist.ac.kr;sundong.kim@kaist.ac.kr;minseokkim@kaist.ac.kr;jaegil@kaist.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsong2019adaboundary,\ntitle={Ada-Boundary: Accelerating the {DNN} Training via Adaptive Boundary Batch Selection},\nauthor={Hwanjun Song and Sundong Kim and Minseok Kim and Jae-Gil Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=SyfXKoRqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SyfXKoRqFQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;4", "wc_review": "245;263;441", "wc_reply_reviewers": "97;0;0", "wc_reply_authors": "430;313;473", "reply_reviewers": "1;0;0", "reply_authors": "2;2;2", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 316.3333333333333, 88.45840202540904 ], "wc_reply_reviewers_avg": [ 32.333333333333336, 45.726238516730064 ], "wc_reply_authors_avg": [ 405.3333333333333, 67.60834925428138 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16818993236650349659&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9 }, { "id": "SyfdsjA9FX", "title": "Live Face De-Identification in Video", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We propose a method for face de-identification that enables fully automatic video modification at high frame rates. The goal is to maximally decorrelate the identity, while having the perception (pose, illumination and expression) fixed. We achieve this by a novel feed forward encoder-decoder network architecture that is conditioned on the high-level representation of a person's facial image. The network is global, in the sense that it does not need to be retrained for a given video or for a given identity, and it creates natural-looking image sequences with little distortion in time. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Oran Gafni;Lior Wolf;Yaniv Taigman", "authorids": "oran@fb.com;wolf@fb.com;yaniv@fb.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=SyfdsjA9FX", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "wc_review": "282;529;442", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "893;369;418", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 417.6666666666667, 102.29478101165387 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 560.0, 236.31476184670873 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 174, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7195075464231903164&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "Syfe2iR5FQ", "title": "Parametrizing Fully Convolutional Nets with a Single High-Order Tensor", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent findings indicate that over-parametrization, while crucial to the success of deep learning, also introduces large amounts of redundancy. Tensor methods have the potential to parametrize over-complete representations in a compact manner by leveraging this redundancy. In this paper, we propose fully parametrizing Convolutional Neural Networks (CNNs) with a single, low-rank tensor. Previous works on network tensorization haved focused on parametrizing individual layers (convolutional or fully connected) only, and perform the tensorization layer-by-layer disjointly. In contrast, we propose to jointly capture the full structure of a CNN by parametrizing it with a single, high-order tensor, the modes of which represent each of the architectural design parameters of the CNN (e.g. number of convolutional blocks, depth, number of stacks, input features, etc). This parametrization allows to regularize the whole network and drastically reduce the number of parameters by imposing a low-rank structure on that tensor. Further, our network is end-to-end trainable from scratch, which has been shown to be challenging in prior work. We study the case of networks with rich structure, namely Fully Convolutional CNNs, which we propose to parametrize them with a single 8-dimensional tensor. 
We show that our approach can achieve superior performance with small compression rates, and attain high compression rates with negligible drop in accuracy for the challenging task of human pose estimation.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jean Kossaifi;Adrian Bulat;Georgios Tzimiropoulos;Maja Pantic", "authorids": "jean.kossaifi@gmail.com;bulat.adrian@gmail.com;yorgos.tzimiropoulos@nottingham.ac.uk;maja.pantic@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=Syfe2iR5FQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;4", "wc_review": "175;494;265", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "98;331;294", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 311.3333333333333, 134.2890745949035 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 241.0, 102.23828376232979 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=402650315925726334&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "id": "Syfz6sC9tQ", "title": "Generative Feature Matching Networks", "track": "main", "status": "Reject", "tldr": "A new non-adversarial feature matching-based approach to train generative models that achieves state-of-the-art results.", "abstract": "We propose a non-adversarial feature matching-based approach to train generative models. Our approach, Generative Feature Matching Networks (GFMN), leverages pretrained neural networks such as autoencoders and ConvNet classifiers to perform feature extraction. We perform an extensive number of experiments with different challenging datasets, including ImageNet. 
Our experimental results demonstrate that, due to the expressiveness of the features from pretrained ImageNet classifiers, even by just matching first order statistics, our approach can achieve state-of-the-art results for challenging benchmarks such as CIFAR10 and STL10.", "keywords": "Generative Deep Neural Networks;Feature Matching;Maximum Mean Discrepancy;Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Cicero Nogueira dos Santos;Inkit Padhi;Pierre Dognin;Youssef Mroueh", "authorids": "cicerons@us.ibm.com;inkit.padhi@ibm.com;pdognin@us.ibm.com;mroueh@us.ibm.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsantos2019generative,\ntitle={Generative Feature Matching Networks},\nauthor={Cicero Nogueira dos Santos and Inkit Padhi and Pierre Dognin and Youssef Mroueh},\nyear={2019},\nurl={https://openreview.net/forum?id=Syfz6sC9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=Syfz6sC9tQ", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "3;3;3;4", "wc_review": "132;483;213;1284", "wc_reply_reviewers": "18;0;0;51", "wc_reply_authors": "325;2070;159;2331", "reply_reviewers": "1;0;0;2", "reply_authors": "3;4;1;5", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "wc_review_avg": [ 528.0, 455.412450422691 ], "wc_reply_reviewers_avg": [ 17.25, 20.825165065372232 ], "wc_reply_authors_avg": [ 1221.25, 985.3376007744756 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 3.25, 1.479019945774904 ], "replies_avg": [ 28, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16052784612670354416&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Dimensionality Reduction for Representing the Knowledge of Probabilistic Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1013", "id": "SygD-hCcF7", "author_site": "Marc T Law, Jake Snell, Amir-massoud Farahmand, Raquel Urtasun, Richard Zemel", "tldr": "dimensionality reduction for cases where examples can be represented as soft probability distributions", "abstract": "Most deep learning models rely on expressive high-dimensional representations to achieve good performance on tasks such as classification. However, the high dimensionality of these representations makes them difficult to interpret and prone to over-fitting. We propose a simple, intuitive and scalable dimension reduction framework that takes into account the soft probabilistic interpretation of standard deep models for classification. When applying our framework to visualization, our representations more accurately reflect inter-class distances than standard visualization techniques such as t-SNE. We show experimentally that our framework improves generalization performance to unseen categories in zero-shot learning. 
We also provide a finite sample error upper bound guarantee for the method.", "keywords": "metric learning;distance learning;dimensionality reduction;bound guarantees", "primary_area": "", "supplementary_material": "", "author": "Marc T Law;Jake Snell;Amir-massoud Farahmand;Raquel Urtasun;Richard S Zemel", "authorids": "law@cs.toronto.edu;jsnell@cs.toronto.edu;farahmand@vectorinstitute.ai;urtasun@cs.toronto.edu;zemel@cs.toronto.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nlaw2018dimensionality,\ntitle={Dimensionality Reduction for Representing the Knowledge of Probabilistic Models},\nauthor={Marc T Law and Jake Snell and Amir-massoud Farahmand and Raquel Urtasun and Richard S Zemel},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SygD-hCcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;9", "confidence": "1;4;3", "wc_review": "212;565;378", "wc_reply_reviewers": "0;104;0", "wc_reply_authors": "703;780;368", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 2.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 385.0, 144.1966250182946 ], "wc_reply_reviewers_avg": [ 34.666666666666664, 49.026070162267295 ], "wc_reply_authors_avg": [ 617.0, 178.85375776501502 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.49999999999999994, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9583060097070031668&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SygD-hCcF7", "pdf": "https://openreview.net/pdf?id=SygD-hCcF7", "email": ";;;;", "author_num": 5 }, { "id": "SygHGnRqK7", "title": "Probabilistic Federated Neural Matching", "track": "main", "status": "Reject", "tldr": "We propose a Bayesian nonparametric model for federated learning with neural networks.", "abstract": "In federated learning problems, data is scattered across different servers and exchanging or pooling it is often impractical or prohibited. We develop a Bayesian nonparametric framework for federated learning with neural networks. Each data server is assumed to train local neural network weights, which are modeled through our framework. We then develop an inference approach that allows us to synthesize a more expressive global network without additional supervision or data pooling. 
We then demonstrate the efficacy of our approach on federated learning problems simulated from two popular image classification datasets.", "keywords": "Bayesian nonparametrics;Indian Buffet Process;Federated Learning", "primary_area": "", "supplementary_material": "", "author": "Mikhail Yurochkin;Mayank Agarwal;Soumya Ghosh;Kristjan Greenewald;Nghia Hoang;Yasaman Khazaeni", "authorids": "mikhail.yurochkin@ibm.com;mayank.agarwal@ibm.com;ghoshso@us.ibm.com;kristjan.h.greenewald@ibm.com;nghiaht@ibm.com;yasaman.khazaeni@us.ibm.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nyurochkin2019probabilistic,\ntitle={Probabilistic Federated Neural Matching},\nauthor={Mikhail Yurochkin and Mayank Agarwal and Soumya Ghosh and Kristjan Greenewald and Nghia Hoang and Yasaman Khazaeni},\nyear={2019},\nurl={https://openreview.net/forum?id=SygHGnRqK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SygHGnRqK7", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;4;4", "wc_review": "60;416;397", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "247;527;542", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 291.0, 163.52573701612437 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 438.6666666666667, 135.66707616645814 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4046375992662159387&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "SygInj05Fm", "title": "Physiological Signal Embeddings (PHASE) via Interpretable Stacked Models", "track": "main", "status": "Reject", "tldr": "Physiological signal embeddings for prediction performance and hospital transference with a general Shapley value interpretability method for stacked models.", "abstract": "In health, machine learning is increasingly common, yet neural network embedding (representation) learning is arguably under-utilized for physiological signals. This inadequacy stands out in stark contrast to more traditional computer science domains, such as computer vision (CV), and natural language processing (NLP). For physiological signals, learning feature embeddings is a natural solution to data insufficiency caused by patient privacy concerns -- rather than share data, researchers may share informative embedding models (i.e., representation models), which map patient data to an output embedding. Here, we present the PHASE (PHysiologicAl Signal Embeddings) framework, which consists of three components: i) learning neural network embeddings of physiological signals, ii) predicting outcomes based on the learned embedding, and iii) interpreting the prediction results by estimating feature attributions in the \"stacked\" models (i.e., feature embedding model followed by prediction model). PHASE is novel in three ways: 1) To our knowledge, PHASE is the first instance of transferal of neural networks to create physiological signal embeddings. 2) We present a tractable method to obtain feature attributions through stacked models. 
We prove that our stacked model attributions can approximate Shapley values -- attributions known to have desirable properties -- for arbitrary sets of models. 3) PHASE was extensively tested in a cross-hospital setting including publicly available data. In our experiments, we show that PHASE significantly outperforms alternative embeddings -- such as raw, exponential moving average/variance, and autoencoder -- currently in use. Furthermore, we provide evidence that transferring neural network embedding/representation learners between distinct hospitals still yields performant embeddings and offer recommendations when transference is ineffective.", "keywords": "Representation learning;transfer learning;health;machine learning;physiological signals;interpretation;feature attributions;shapley values;univariate embeddings;LSTMs;XGB;neural networks;stacked models;model pipelines;interpretable stacked models", "primary_area": "", "supplementary_material": "", "author": "Hugh Chen;Scott Lundberg;Gabe Erion;Su-In Lee", "authorids": "hughchen@cs.washington.edu;slund1@cs.washington.edu;erion@cs.washington.edu;suinlee@cs.washington.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchen2019physiological,\ntitle={Physiological Signal Embeddings ({PHASE}) via Interpretable Stacked Models},\nauthor={Hugh Chen and Scott Lundberg and Gabe Erion and Su-In Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=SygInj05Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SygInj05Fm", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;5", "wc_review": "425;693;367", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1607;1977;1067", "reply_reviewers": "0;0;0", "reply_authors": "3;4;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 495.0, 141.99530508670583 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1550.3333333333333, 373.6605708691007 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sLtzp0yjTzsJ:scholar.google.com/&scioq=Physiological+Signal+Embeddings+(PHASE)+via+Interpretable+Stacked+Models&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SygJSiA5YQ", "title": "Weak contraction mapping and optimization", "track": "main", "status": "Reject", "tldr": "A gradient-free method is proposed for non-convex optimization problems", "abstract": "The weak contraction mapping is a self-mapping whose range is always a subset of the domain and which admits a unique fixed-point. The iteration of a weak contraction mapping is a Cauchy sequence that yields the unique fixed-point. A gradient-free optimization method, as an application of the weak contraction mapping, is proposed to achieve global minimum convergence.
The optimization method is robust to local minima and initial point position.", "keywords": "Weak contraction mapping;fixed-point theorem;non-convex optimization", "primary_area": "", "supplementary_material": "", "author": "Siwei Luo", "authorids": "siuluosiwei@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nluo2019weak,\ntitle={Weak contraction mapping and optimization},\nauthor={Siwei Luo},\nyear={2019},\nurl={https://openreview.net/forum?id=SygJSiA5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=SygJSiA5YQ", "pdf_size": 0, "rating": "1;3;4", "confidence": "5;2;5", "wc_review": "433;346;172", "wc_reply_reviewers": "226;0;0", "wc_reply_authors": "417;0;38", "reply_reviewers": "1;0;0", "reply_authors": "2;0;1", "rating_avg": [ 2.6666666666666665, 1.247219128924647 ], "confidence_avg": [ 4.0, 1.4142135623730951 ], "wc_review_avg": [ 317.0, 108.5080642164443 ], "wc_reply_reviewers_avg": [ 75.33333333333333, 106.53742169877317 ], "wc_reply_authors_avg": [ 151.66666666666666, 188.25927747538913 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.816496580927726 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:QqIVdtg5jjsJ:scholar.google.com/&scioq=Weak+contraction+mapping+and+optimization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SygK6sA5tX", "title": "Graph Classification with Geometric Scattering", "track": "main", "status": "Reject", "tldr": "We present a new feed forward graph ConvNet based on generalizing the wavelet scattering transform of Mallat, and demonstrate its utility in graph classification and data exploration tasks.", "abstract": "One of the most notable contributions of deep learning is the application of convolutional neural networks (ConvNets) to structured signal classification, and in particular image classification. Beyond their impressive performances in supervised learning, the structure of such networks inspired the development of deep filter banks referred to as scattering transforms. These transforms apply a cascade of wavelet transforms and complex modulus operators to extract features that are invariant to group operations and stable to deformations. Furthermore, ConvNets inspired recent advances in geometric deep learning, which aim to generalize these networks to graph data by applying notions from graph signal processing to learn deep graph filter cascades. We further advance these lines of research by proposing a geometric scattering transform using graph wavelets defined in terms of random walks on the graph. We demonstrate the utility of features extracted with this designed deep filter bank in graph classification of biochemistry and social network data (incl. 
state of the art results in the latter case), and in data exploration, where they enable inference of EC exchange preferences in enzyme evolution.", "keywords": "geometric deep learning;graph neural network;graph classification;scattering", "primary_area": "", "supplementary_material": "", "author": "Feng Gao;Guy Wolf;Matthew Hirn", "authorids": "gaofeng2@msu.edu;guy.wolf@yale.edu;mhirn@msu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngao2019graph,\ntitle={Graph Classification with Geometric Scattering},\nauthor={Feng Gao and Guy Wolf and Matthew Hirn},\nyear={2019},\nurl={https://openreview.net/forum?id=SygK6sA5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SygK6sA5tX", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;3", "wc_review": "1002;126;358", "wc_reply_reviewers": "142;0;0", "wc_reply_authors": "1228;947;682", "reply_reviewers": "1;0;0", "reply_authors": "2;2;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 495.3333333333333, 370.57552836395206 ], "wc_reply_reviewers_avg": [ 47.333333333333336, 66.9394419523265 ], "wc_reply_authors_avg": [ 952.3333333333334, 222.93546649697134 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18077836856386222524&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Learning protein sequence embeddings using information from structure", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1101", "id": "SygLehCqtm", "author_site": "Tristan Bepler, Bonnie Berger", "tldr": "We present a method for learning protein sequence embedding models using structural information in the form of global structural similarity between proteins and within protein residue-residue contacts.", "abstract": "Inferring the structural properties of a protein from its amino acid sequence is a challenging yet important problem in biology. Structures are not known for the vast majority of protein sequences, but structure is critical for understanding function. Existing approaches for detecting structural similarity between proteins from sequence are unable to recognize and exploit structural patterns when sequences have diverged too far, limiting our ability to transfer knowledge between structurally related proteins. We newly approach this problem through the lens of representation learning. We introduce a framework that maps any protein sequence to a sequence of vector embeddings --- one per amino acid position --- that encode structural information. We train bidirectional long short-term memory (LSTM) models on protein sequences with a two-part feedback mechanism that incorporates information from (i) global structural similarity between proteins and (ii) pairwise residue contact maps for individual proteins. To enable learning from structural similarity information, we define a novel similarity measure between arbitrary-length sequences of vector embeddings based on a soft symmetric alignment (SSA) between them. 
Our method is able to learn useful position-specific embeddings despite lacking direct observations of position-level correspondence between sequences. We show empirically that our multi-task framework outperforms other sequence-based methods and even a top-performing structure-based alignment method when predicting structural similarity, our goal. Finally, we demonstrate that our learned embeddings can be transferred to other protein sequence problems, improving the state-of-the-art in transmembrane domain prediction.", "keywords": "sequence embedding;sequence alignment;RNN;LSTM;protein structure;amino acid sequence;contextual embeddings;transmembrane prediction", "primary_area": "", "supplementary_material": "", "author": "Tristan Bepler;Bonnie Berger", "authorids": "tbepler@mit.edu;bab@mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nbepler2018learning,\ntitle={Learning protein sequence embeddings using information from structure},\nauthor={Tristan Bepler and Bonnie Berger},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SygLehCqtm},\n}", "github": "[![github](/images/github_icon.svg) tbepler/protein-sequence-embedding-iclr2019](https://github.com/tbepler/protein-sequence-embedding-iclr2019)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;4;4", "wc_review": "545;394;681", "wc_reply_reviewers": "0;0;174", "wc_reply_authors": "744;264;1849", "reply_reviewers": "0;0;1", "reply_authors": "1;1;4", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 540.0, 117.22058977273005 ], "wc_reply_reviewers_avg": [ 58.0, 82.02438661763951 ], "wc_reply_authors_avg": [ 952.3333333333334, 663.6305866636615 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 382, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15164585032422536283&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SygLehCqtm", "pdf": "https://openreview.net/pdf?id=SygLehCqtm", "email": ";", "author_num": 2 }, { "id": "SygNooCqY7", "title": "Noise-Tempered Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present a novel method to stabilize the training of generative adversarial networks. The training stability is often undermined by the limited and low-dimensional support of the probability density function of the data samples. To address this problem we propose to simultaneously train the generative adversarial networks against different additive noise models, including the noise-free case. The benefits of this approach are that: 1) The case with noise added to both real and generated samples extends the support of the probability density function of the data, while not compromising the exact matching of the original data distribution, and 2) The noise-free case allows the exact matching of the original data distribution. We demonstrate our approach with both fixed additive noise and with learned noise models. 
We show that our approach results in a stable and well-behaved training of even the original minimax GAN formulation. Moreover, our technique can be incorporated in most modern GAN formulations and leads to a consistent improvement on several common datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Simon Jenni;Paolo Favaro", "authorids": "jenni@inf.unibe.ch;paolo.favaro@inf.unibe.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SygNooCqY7", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;4", "wc_review": "569;696;561", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "348;170;318", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 608.6666666666666, 61.8402961039339 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 278.6666666666667, 77.80888266915431 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:qLrBb07rMksJ:scholar.google.com/&scioq=Noise-Tempered+Generative+Adversarial+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SygONjRqKm", "title": "Amortized Context Vector Inference for Sequence-to-Sequence Networks", "track": "main", "status": "Reject", "tldr": "A generalisation of context representation in neural attention under the variational inference rationale.", "abstract": "Neural attention (NA) has become a key component of sequence-to-sequence models that yield state-of-the-art performance in as hard tasks as abstractive document summarization (ADS), machine translation (MT), and video captioning (VC). NA mechanisms perform inference of context vectors; these constitute weighted sums of deterministic input sequence encodings, adaptively sourced over long temporal horizons. Inspired from recent work in the field of amortized variational inference (AVI), in this work we consider treating the context vectors generated by soft-attention (SA) models as latent variables, with approximate finite mixture model posteriors inferred via AVI. We posit that this formulation may yield stronger generalization capacity, in line with the outcomes of existing applications of AVI to deep networks. To illustrate our method, we implement it and experimentally evaluate it considering challenging ADS, VC, and MT benchmarks. 
This way, we exhibit its improved effectiveness over state-of-the-art alternatives.", "keywords": "neural attention;sequence-to-sequence;variational inference", "primary_area": "", "supplementary_material": "", "author": "Sotirios Chatzis;Kyriacos Tolias;Aristotelis Charalampous", "authorids": "sotirios.chatzis@cut.ac.cy;k.v.tolias@edu.cut.ac.cy;aristotelis.charalampous@edu.cut.ac.cy", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchatzis2019amortized,\ntitle={Amortized Context Vector Inference for Sequence-to-Sequence Networks},\nauthor={Sotirios Chatzis and Kyriacos Tolias and Aristotelis Charalampous},\nyear={2019},\nurl={https://openreview.net/forum?id=SygONjRqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SygONjRqKm", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;3;3", "wc_review": "347;156;190", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "774;480;535", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 231.0, 83.19054433447751 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 596.3333333333334, 127.62009594452157 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dX3--gjBtsYJ:scholar.google.com/&scioq=Amortized+Context+Vector+Inference+for+Sequence-to-Sequence+Networks&hl=en&as_sdt=0,33", "gs_version_total": 3 }, { "title": "Variational Smoothing in Recurrent Neural Network Language Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/677", "id": "SygQvs0cFQ", "author_site": "Lingpeng Kong, G\u00e1bor Melis, Wang Ling, Lei Yu, Dani Yogatama", "tldr": "", "abstract": "We present a new theoretical perspective of data noising in recurrent neural network language models (Xie et al., 2017). We show that each variant of data noising is an instance of Bayesian recurrent neural networks with a particular variational distribution (i.e., a mixture of Gaussians whose weights depend on statistics derived from the corpus such as the unigram distribution). We use this insight to propose a more principled method to apply at prediction time and propose natural extensions to data noising under the variational framework. In particular, we propose variational smoothing with tied input and output embedding matrices and an element-wise variational smoothing method. 
We empirically verify our analysis on two benchmark language modeling datasets and demonstrate performance improvements over existing data noising methods.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Lingpeng Kong;Gabor Melis;Wang Ling;Lei Yu;Dani Yogatama", "authorids": "lingpenk@cs.cmu.edu;melisgl@google.com;lingwang@google.com;leiyu@google.com;dyogatama@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nkong2018variational,\ntitle={Variational Smoothing in Recurrent Neural Network Language Models},\nauthor={Lingpeng Kong and Gabor Melis and Wang Ling and Lei Yu and Dani Yogatama},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SygQvs0cFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "2;6;7", "confidence": "5;4;4", "wc_review": "189;301;198", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "287;276;72", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 2.160246899469287 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 229.33333333333334, 50.80901057445968 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 211.66666666666666, 98.86129452700665 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9819805060619656, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10055945762623652654&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SygQvs0cFQ", "pdf": "https://openreview.net/pdf?id=SygQvs0cFQ", "email": ";;;;", "author_num": 5 }, { "id": "SygeznA9YX", "title": "Data Interpretation and Reasoning Over Scientific Plots", "track": "main", "status": "Withdraw", "tldr": "We created a new dataset for data interpretation over plots and also propose a baseline for the same.", "abstract": "Data Interpretation is an important part of Quantitative Aptitude exams and requires an individual to answer questions grounded in plots such as bar charts, line graphs, scatter plots, \textit{etc}. Recently, there has been an increasing interest in building models which can perform this task by learning from datasets containing triplets of the form \{plot, question, answer\}. Two such datasets have been proposed in the recent past which contain plots generated from synthetic data with limited (i) $x-y$ axes variables (ii) question templates and (iii) answer vocabulary and hence do not adequately capture the challenges posed by this task. To overcome these limitations of existing datasets, we introduce a new dataset containing $9.7$ million question-answer pairs grounded over $270,000$ plots with three main differentiators. First, the plots in our dataset contain a wide variety of realistic $x$-$y$ variables such as CO2 emission, fertility rate, \textit{etc.} extracted from real world data sources such as World Bank, government sites, \textit{etc}. Second, the questions in our dataset are more complex as they are based on templates extracted from interesting questions asked by a crowd of workers using a fraction of these plots.
Lastly, the answers in our dataset are not restricted to a small vocabulary and a large fraction of the answers seen at test time are not present in the training vocabulary. As a result, existing models for Visual Question Answering which largely use end-to-end models in a multi-class classification framework cannot be used for this task. We establish initial results on this dataset and emphasize the complexity of the task using a multi-staged modular pipeline with various sub-components to (i) extract relevant data from the plot and convert it to a semi-structured table (ii) combine the question with this table and use compositional semantic parsing to arrive at a logical form from which the answer can be derived. We believe that such a modular framework is the best way to go forward as it would enable the research community to independently make progress on all the sub-tasks involved in plot question answering.", "keywords": "VQA;Data Interpretation;Parsing;Object Detection", "primary_area": "", "supplementary_material": "", "author": "Pritha Ganguly;Nitesh Methani;Mitesh M. Khapra", "authorids": ";;", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=SygeznA9YX", "pdf_size": 0, "rating": "3;6;6", "confidence": "4;4;4", "wc_review": "618;240;360", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 406.0, 157.70859202973057 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:L67yLMtEuGUJ:scholar.google.com/&scioq=Data+Interpretation+and+Reasoning+Over+Scientific+Plots&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SygjB3AcYX", "title": "Generalized Label Propagation Methods for Semi-Supervised Learning", "track": "main", "status": "Withdraw", "tldr": "We extend the classical label propation methods to jointly model graph and feature information from a graph filtering perspective, and show connections to the graph convlutional networks.", "abstract": "The key challenge in semi-supervised learning is how to effectively leverage unlabeled data to improve learning performance. The classical label propagation method, despite its popularity, has limited modeling capability in that it only exploits graph information for making predictions. In this paper, we consider label propagation from a graph signal processing perspective and decompose it into three components: signal, filter, and classifier. By extending the three components, we propose a simple generalized label propagation (GLP) framework for semi-supervised learning. GLP naturally integrates graph and data feature information, and offers the flexibility of selecting appropriate filters and domain-specific classifiers for different applications. Interestingly, GLP also provides new insight into the popular graph convolutional network and elucidates its working mechanisms. 
Extensive experiments on three citation networks, one knowledge graph, and one image dataset demonstrate the efficiency and effectiveness of GLP.", "keywords": "semi-supervised learning;label propagation;graph convolutional networks", "primary_area": "", "supplementary_material": "", "author": "Qimai Li;Xiao-Ming Wu;Zhichao Guan.", "authorids": "csqmli@comp.polyu.edu.hk;xiao-ming.wu@polyu.edu.hk;zcguan@zju.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SygjB3AcYX", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;5", "wc_review": "509;394;144", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "163;252;16", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 349.0, 152.3701633085253 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 143.66666666666666, 97.31164141846315 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5878211725204086416&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Biologically-Plausible Learning Algorithms Can Scale to Large Datasets", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/662", "id": "SygvZ209F7", "author_site": "Wu Xiao, HONGLIN CHEN, Qianli Liao, Tomaso Poggio", "tldr": "Biologically plausible learning algorithms, particularly sign-symmetry, work well on ImageNet", "abstract": "The backpropagation (BP) algorithm is often thought to be biologically implausible in the brain. One of the main reasons is that BP requires symmetric weight matrices in the feedforward and feedback pathways. To address this \u201cweight transport problem\u201d (Grossberg, 1987), two biologically-plausible algorithms, proposed by Liao et al. (2016) and Lillicrap et al. (2016), relax BP\u2019s weight symmetry requirements and demonstrate comparable learning capabilities to that of BP on small datasets. However, a recent study by Bartunov et al. (2018) finds that although feedback alignment (FA) and some variants of target-propagation (TP) perform well on MNIST and CIFAR, they perform significantly worse than BP on ImageNet. Here, we additionally evaluate the sign-symmetry (SS) algorithm (Liao et al., 2016), which differs from both BP and FA in that the feedback and feedforward weights do not share magnitudes but share signs. We examined the performance of sign-symmetry and feedback alignment on ImageNet and MS COCO datasets using different network architectures (ResNet-18 and AlexNet for ImageNet; RetinaNet for MS COCO). Surprisingly, networks trained with sign-symmetry can attain classification performance approaching that of BP-trained networks. These results complement the study by Bartunov et al. 
(2018) and establish a new benchmark for future biologically-plausible learning algorithms on more difficult datasets and more complex architectures.", "keywords": "biologically plausible learning algorithm;ImageNet;sign-symmetry;feedback alignment", "primary_area": "", "supplementary_material": "", "author": "Will Xiao;Honglin Chen;Qianli Liao;Tomaso Poggio", "authorids": "xiaow@fas.harvard.edu;chenhonglin@g.ucla.edu;lql@mit.edu;tp@csail.mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nxiao2018biologicallyplausible,\ntitle={Biologically-Plausible Learning Algorithms Can Scale to Large Datasets},\nauthor={Will Xiao and Honglin Chen and Qianli Liao and Tomaso Poggio},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SygvZ209F7},\n}", "github": "[![github](/images/github_icon.svg) willwx/sign-symmetry](https://github.com/willwx/sign-symmetry) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SygvZ209F7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;9;9", "confidence": "4;4;5", "wc_review": "1015;231;455", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "788;239;168", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 7.333333333333333, 2.357022603955158 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 567.0, 329.7190723429063 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 398.3333333333333, 277.056352069795 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10952740218459903429&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "openreview": "https://openreview.net/forum?id=SygvZ209F7", "pdf": "https://openreview.net/pdf?id=SygvZ209F7", "email": ";;;", "author_num": 4 }, { "id": "Sygx4305KQ", "title": "Small steps and giant leaps: Minimal Newton solvers for Deep Learning", "track": "main", "status": "Reject", "tldr": "A fast second-order solver for deep learning that works on ImageNet-scale problems with no hyper-parameter tuning", "abstract": "We propose a fast second-order method that can be used as a drop-in replacement for current deep learning solvers. Compared to stochastic gradient descent (SGD), it only requires two additional forward-mode automatic differentiation operations per iteration, which has a computational cost comparable to two standard forward passes and is easy to implement. Our method addresses long-standing issues with current second-order solvers, which invert an approximate Hessian matrix every iteration exactly or by conjugate-gradient methods, procedures that are much slower than a SGD step. Instead, we propose to keep a single estimate of the gradient projected by the inverse Hessian matrix, and update it once per iteration with just two passes over the network. This estimate has the same size and is similar to the momentum variable that is commonly used in SGD. 
No estimate of the Hessian is maintained.\nWe first validate our method, called CurveBall, on small problems with known solutions (noisy Rosenbrock function and degenerate 2-layer linear networks), where current deep learning solvers struggle. We then train several large models on CIFAR and ImageNet, including ResNet and VGG-f networks, where we demonstrate faster convergence with no hyperparameter tuning. We also show our optimiser's generality by testing on a large set of randomly-generated architectures.", "keywords": "deep learning", "primary_area": "", "supplementary_material": "", "author": "Joao Henriques;Sebastien Ehrhardt;Samuel Albanie;Andrea Vedaldi", "authorids": "joao@robots.ox.ac.uk;hyenal@robots.ox.ac.uk;albanie@robots.ox.ac.uk;vedali@robots.ox.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhenriques2019small,\ntitle={Small steps and giant leaps: Minimal Newton solvers for Deep Learning},\nauthor={Joao Henriques and Sebastien Ehrhardt and Samuel Albanie and Andrea Vedaldi},\nyear={2019},\nurl={https://openreview.net/forum?id=Sygx4305KQ},\n}", "github": "[![github](/images/github_icon.svg) jotaf98/curveball](https://github.com/jotaf98/curveball) + [![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=Sygx4305KQ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Sygx4305KQ", "pdf_size": 0, "rating": "3;7;7", "confidence": "5;4;5", "wc_review": "1331;638;223", "wc_reply_reviewers": "522;0;0", "wc_reply_authors": "2795;795;409", "reply_reviewers": "2;0;0", "reply_authors": "5;1;1", "rating_avg": [ 5.666666666666667, 1.8856180831641267 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 730.6666666666666, 457.0604141929404 ], "wc_reply_reviewers_avg": [ 174.0, 246.07315985291854 ], "wc_reply_authors_avg": [ 1333.0, 1045.7316418023636 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16786428600369055645&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 13 }, { "id": "SygxYoC5FX", "title": "BIGSAGE: unsupervised inductive representation learning of graph via bi-attended sampling and global-biased aggregating", "track": "main", "status": "Reject", "tldr": "For unsupervised and inductive network embedding, we propose a novel approach to explore the most relevant neighbors and preserve previously learnt knowledge of nodes by utilizing a bi-attention architecture and introducing a global bias, respectively", "abstract": "Different kinds of representation learning techniques on graphs have shown significant effects in downstream machine learning tasks. Recently, in order to inductively learn representations for graph structures that are unobservable during training, a general framework with sampling and aggregating (GraphSAGE) was proposed by Hamilton and Ying and has been proved more efficient than transductive methods in fields like transfer learning or evolving datasets. However, GraphSAGE is incapable of selective neighbor sampling and lacks memory of known nodes that have been trained.
To address these problems, we present an unsupervised method that samples neighborhood information attended by co-occurring structures and optimizes a trainable global bias as a representation expectation for each node in the given graph. Experiments show that our approach outperforms the state-of-the-art inductive and unsupervised methods for representation learning on graphs.", "keywords": "network embedding;unsupervised learning;inductive learning", "primary_area": "", "supplementary_material": "", "author": "Xin Luo;Hankz Hankui Zhuo", "authorids": "luox35@mail2.sysu.edu.cn;zhuohank@mail.sysu.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nluo2019bigsage,\ntitle={{BIGSAGE}: unsupervised inductive representation learning of graph via bi-attended sampling and global-biased aggregating},\nauthor={Xin Luo and Hankz Hankui Zhuo},\nyear={2019},\nurl={https://openreview.net/forum?id=SygxYoC5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SygxYoC5FX", "pdf_size": 0, "rating": "2;4;4", "confidence": "4;4;3", "wc_review": "251;181;245", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 225.66666666666666, 31.678944988044595 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13449738012719297998&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Syl6tjAqKX", "title": "BEHAVIOR MODULE IN NEURAL NETWORKS", "track": "main", "status": "Reject", "tldr": "Extendable Modular Architecture is proposed for developing a variety of Agent Behaviors in DQN.", "abstract": "The prefrontal cortex (PFC) is a part of the brain which is responsible for the behavior repertoire. Inspired by PFC functionality and connectivity, as well as the human behavior formation process, we propose a novel modular architecture of neural networks with a Behavioral Module (BM) and a corresponding end-to-end training strategy. This approach allows the efficient learning of behavior and preference representations. This property is particularly useful for user modeling (as for dialog agents) and recommendation tasks, as it allows learning personalized representations of different user states. In the experiment with video game playing, the results show that the proposed method allows separation of the main task\u2019s objectives and behaviors between different BMs. The experiments also show network extendability through independent learning of new behavior patterns. Moreover, we demonstrate a strategy for an efficient transfer of newly learned BMs to unseen tasks.", "keywords": "Modular Networks;Reinforcement Learning;Task Separation;Representation Learning;Transfer Learning;Adversarial Transfer", "primary_area": "", "supplementary_material": "", "author": "Andrey Sakryukin;Yongkang Wong;Mohan S.
Kankanhalli", "authorids": "asakryukin@u.nus.edu;yongkang.wong@nus.edu.sg;mohan@comp.nus.edu.sg", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsakryukin2019behavior,\ntitle={{BEHAVIOR} {MODULE} {IN} {NEURAL} {NETWORKS}},\nauthor={Andrey Sakryukin and Yongkang Wong and Mohan S. Kankanhalli},\nyear={2019},\nurl={https://openreview.net/forum?id=Syl6tjAqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Syl6tjAqKX", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;5", "wc_review": "283;324;576", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 394.3333333333333, 129.54364343940446 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Coarse-grain Fine-grain Coattention Network for Multi-evidence Question Answering", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/757", "id": "Syl7OsRqY7", "author_site": "Victor Zhong, Caiming Xiong, Nitish Shirish Keskar, richard socher", "tldr": "A new state-of-the-art model for multi-evidence question answering using coarse-grain fine-grain hierarchical attention.", "abstract": "End-to-end neural models have made significant progress in question answering, however recent studies show that these models implicitly assume that the answer and evidence appear close together in a single document. In this work, we propose the Coarse-grain Fine-grain Coattention Network (CFC), a new question answering model that combines information from evidence across multiple documents. The CFC consists of a coarse-grain module that interprets documents with respect to the query then finds a relevant answer, and a fine-grain module which scores each candidate answer by comparing its occurrences across all of the documents with the query. We design these modules using hierarchies of coattention and self-attention, which learn to emphasize different parts of the input. 
On the Qangaroo WikiHop multi-evidence question answering task, the CFC obtains a new state-of-the-art result of 70.6% on the blind test set, outperforming the previous best by 3% accuracy despite not using pretrained contextual encoders.", "keywords": "question answering;reading comprehension;nlp;natural language processing;attention;representation learning", "primary_area": "", "supplementary_material": "", "author": "Victor Zhong;Caiming Xiong;Nitish Shirish Keskar;Richard Socher", "authorids": "victor@victorzhong.com;cxiong@salesforce.com;nkeskar@salesforce.com;richard@socher.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzhong2018coarsegrain,\ntitle={Coarse-grain Fine-grain Coattention Network for Multi-evidence Question Answering},\nauthor={Victor Zhong and Caiming Xiong and Nitish Keskar and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Syl7OsRqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;7;7", "confidence": "3;5;4", "wc_review": "306;429;273", "wc_reply_reviewers": "285;16;0", "wc_reply_authors": "1120;462;152", "reply_reviewers": "2;1;0", "reply_authors": "4;1;1", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 336.0, 67.12674578735364 ], "wc_reply_reviewers_avg": [ 100.33333333333333, 130.74232503499223 ], "wc_reply_authors_avg": [ 578.0, 403.60706964406194 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844387, "gs_citation": 78, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6052706847759570603&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Syl7OsRqY7", "pdf": "https://openreview.net/pdf?id=Syl7OsRqY7", "email": ";;;", "author_num": 4 }, { "title": "Learning a Meta-Solver for Syntax-Guided Program Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/801", "id": "Syl8Sn0cK7", "author_site": "Xujie Si, Yuan Yang, Hanjun Dai, Mayur Naik, Le Song", "tldr": "We propose a meta-learning framework that learns a transferable policy from only weak supervision to solve synthesis tasks with different logical specifications and grammars.", "abstract": "We study a general formulation of program synthesis called syntax-guided synthesis(SyGuS) that concerns synthesizing a program that follows a given grammar and satisfies a given logical specification. Both the logical specification and the grammar have complex structures and can vary from task to task, posing significant challenges for learning across different tasks. Furthermore, training data is often unavailable for domain specific synthesis tasks. To address these challenges, we propose a meta-learning framework that learns a transferable policy from only weak supervision. 
Our framework consists of three components: 1) an encoder, which embeds both the logical specification and grammar at the same time using a graph neural network; 2) a grammar adaptive policy network which enables learning a transferable policy; and 3) a reinforcement learning algorithm that jointly trains the embedding and adaptive policy. We evaluate the framework on 214 cryptographic circuit synthesis tasks. It solves 141 of them in the out-of-box solver setting, significantly outperforming a similar search-based approach but without learning, which solves only 31. The result is comparable to two state-of-the-art classical synthesis engines, which solve 129 and 153 respectively. In the meta-solver setting, the framework can efficiently adapt to unseen tasks and achieves speedup ranging from 2x up to 100x.", "keywords": "Syntax-guided Synthesis;Context Free Grammar;Logical Specification;Representation Learning;Meta Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Xujie Si;Yuan Yang;Hanjun Dai;Mayur Naik;Le Song", "authorids": "xsi@cis.upenn.edu;yyang754@gatech.edu;hanjundai@gatech.edu;mhnaik@cis.upenn.edu;lsong@cc.gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nsi2018learning,\ntitle={Learning a Meta-Solver for Syntax-Guided Program Synthesis},\nauthor={Xujie Si and Yuan Yang and Hanjun Dai and Mayur Naik and Le Song},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Syl8Sn0cK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "2;5;4", "wc_review": "234;700;782", "wc_reply_reviewers": "0;0;321", "wc_reply_authors": "371;775;1331", "reply_reviewers": "0;0;3", "reply_authors": "1;1;3", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 572.0, 241.3351749469328 ], "wc_reply_reviewers_avg": [ 107.0, 151.32085117392117 ], "wc_reply_authors_avg": [ 825.6666666666666, 393.55248132308293 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14131454637714712316&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=Syl8Sn0cK7", "pdf": "https://openreview.net/pdf?id=Syl8Sn0cK7", "email": ";;;;", "author_num": 5 }, { "title": "Towards Robust, Locally Linear Deep Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/964", "id": "SylCrnCcFX", "author_site": "Guang-He Lee, David Alvarez-Melis, Tommi Jaakkola", "tldr": "A scalable algorithm to establish robust derivatives of deep networks w.r.t. the inputs.", "abstract": "Deep networks realize complex mappings that are often understood by their locally linear behavior at or around points of interest. For example, we use the derivative of the mapping with respect to its inputs for sensitivity analysis, or to explain (obtain coordinate relevance for) a prediction. One key challenge is that such derivatives are themselves inherently unstable. 
In this paper, we propose a new learning problem to encourage deep networks to have stable derivatives over larger regions. While the problem is challenging in general, we focus on networks with piecewise linear activation functions. Our algorithm consists of an inference step that identifies a region around a point where linear approximation is provably stable, and an optimization step to expand such regions. We propose a novel relaxation to scale the algorithm to realistic models. We illustrate our method with residual and recurrent networks on image and sequence datasets.", "keywords": "robust derivatives;transparency;interpretability", "primary_area": "", "supplementary_material": "", "author": "Guang-He Lee;David Alvarez-Melis;Tommi S. Jaakkola", "authorids": "guanghe@csail.mit.edu;davidam@csail.mit.edu;tommi@csail.mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlee2018towards,\ntitle={Towards Robust, Locally Linear Deep Networks},\nauthor={Guang-He Lee and David Alvarez-Melis and Tommi S. Jaakkola},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SylCrnCcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;3;4", "wc_review": "266;421;196", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "23;665;173", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 294.3333333333333, 94.0153651744697 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 287.0, 274.2115971289325 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12036498269952329745&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SylCrnCcFX", "pdf": "https://openreview.net/pdf?id=SylCrnCcFX", "email": ";;", "author_num": 3 }, { "title": "How Important is a Neuron", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/927", "id": "SylKoo0cKm", "author_site": "Kedar Dhamdhere, Mukund Sundararajan, Qiqi Yan", "tldr": "", "abstract": "The problem of attributing a deep network\u2019s prediction to its input/base features is\nwell-studied (cf. Simonyan et al. (2013)). We introduce the notion of conductance\nto extend the notion of attribution to understanding the importance of hidden units.\nInformally, the conductance of a hidden unit of a deep network is the flow of attribution\nvia this hidden unit. We can use conductance to understand the importance of\na hidden unit to the prediction for a specific input, or over a set of inputs. We justify\nconductance in multiple ways via a qualitative comparison with other methods,\nvia some axiomatic results, and via an empirical evaluation based on a feature\nselection task. The empirical evaluations are done using the Inception network\nover ImageNet data, and a convolutional network over text data.
In both cases, we\ndemonstrate the effectiveness of conductance in identifying interesting insights\nabout the internal workings of these networks.", "keywords": "attribution;saliency;influence", "primary_area": "", "supplementary_material": "", "author": "Kedar Dhamdhere;Mukund Sundararajan;Qiqi Yan", "authorids": "kedar@google.com;mukunds@google.com;qiqiyan@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ndhamdhere2018how,\ntitle={How Important is a Neuron},\nauthor={Kedar Dhamdhere and Mukund Sundararajan and Qiqi Yan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SylKoo0cKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "5;2;4", "wc_review": "246;233;1079", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "77;150;66", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 519.3333333333334, 395.7796805743766 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 97.66666666666667, 37.276742823851386 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 179, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9400986673874150699&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SylKoo0cKm", "pdf": "https://openreview.net/pdf?id=SylKoo0cKm", "email": ";;", "author_num": 3 }, { "title": "Learning to Make Analogies by Contrasting Abstract Relational Structure", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/943", "id": "SylLYsCcFm", "author_site": "Felix Hill, Adam Santoro, David Barrett, Ari Morcos, Timothy Lillicrap", "tldr": "The most robust capacity for analogical reasoning is induced when networks learn analogies by contrasting abstract relational structures in their input domains.", "abstract": "Analogical reasoning has been a principal focus of various waves of AI research. Analogy is particularly challenging for machines because it requires relational structures to be represented such that they can be flexibly applied across diverse domains of experience. Here, we study how analogical reasoning can be induced in neural networks that learn to perceive and reason about raw visual data. We find that the critical factor for inducing such a capacity is not an elaborate architecture, but rather, careful attention to the choice of data and the manner in which it is presented to the model. The most robust capacity for analogical reasoning is induced when networks learn analogies by contrasting abstract relational structures in their input domains, a training method that uses only the input data to force models to learn about important abstract features. 
Using this technique we demonstrate capacities for complex, visual and symbolic analogy making and generalisation in even the simplest neural network architectures.", "keywords": "cognitive science;analogy;psychology;cognitive theory;cognition;abstraction;generalization", "primary_area": "", "supplementary_material": "", "author": "Felix Hill;Adam Santoro;David Barrett;Ari Morcos;Timothy Lillicrap", "authorids": "felixhill@google.com;adamsantoro@google.com;barrettdavid@google.com;arimorcos@google.com;countzero@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nhill2018learning,\ntitle={Learning to Make Analogies by Contrasting Abstract Relational Structure},\nauthor={Felix Hill and Adam Santoro and David Barrett and Ari Morcos and Timothy Lillicrap},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SylLYsCcFm},\n}", "github": "[![github](/images/github_icon.svg) deepmind/abstract-reasoning-matrices](https://github.com/deepmind/abstract-reasoning-matrices) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SylLYsCcFm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;5;3", "wc_review": "286;372;267", "wc_reply_reviewers": "93;215;49", "wc_reply_authors": "846;669;862", "reply_reviewers": "1;2;1", "reply_authors": "2;2;2", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 308.3333333333333, 45.68247901426639 ], "wc_reply_reviewers_avg": [ 119.0, 70.21870595978444 ], "wc_reply_authors_avg": [ 792.3333333333334, 87.45411495305537 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 102, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15521573039503233138&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=SylLYsCcFm", "pdf": "https://openreview.net/pdf?id=SylLYsCcFm", "email": ";;;;", "author_num": 5 }, { "title": "Learning what you can do before doing anything", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/651", "id": "SylPMnR9Ym", "author_site": "Oleh Rybkin, Karl Pertsch, Kosta Derpanis, Kostas Daniilidis, Andrew Jaegle", "tldr": "We learn a representation of an agent's action space from pure visual observations. We use a recurrent latent variable approach with a novel composability loss.", "abstract": "Intelligent agents can learn to represent the action spaces of other agents simply by observing them act. Such representations help agents quickly learn to predict the effects of their own actions on the environment and to plan complex action sequences. In this work, we address the problem of learning an agent\u2019s action space purely from visual observation. We use stochastic video prediction to learn a latent variable that captures the scene's dynamics while being minimally sensitive to the scene's static content. 
We introduce a loss term that encourages the network to capture the composability of visual sequences and show that it leads to representations that disentangle the structure of actions. We call the full model with composable action representations Composable Learned Action Space Predictor (CLASP). We show the applicability of our method to synthetic settings and its potential to capture action spaces in complex, realistic visual settings. When used in a semi-supervised setting, our learned representations perform comparably to existing fully supervised methods on tasks such as action-conditioned video prediction and planning in the learned action space, while requiring orders of magnitude fewer action labels. Project website: https://daniilidis-group.github.io/learned_action_spaces", "keywords": "unsupervised learning;vision;motion;action space;video prediction;variational models", "primary_area": "", "supplementary_material": "", "author": "Oleh Rybkin;Karl Pertsch;Konstantinos G. Derpanis;Kostas Daniilidis;Andrew Jaegle", "authorids": "oleh@seas.upenn.edu;pertsch@usc.edu;kosta@ryerson.ca;kostas@seas.upenn.edu;ajaegle@upenn.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nrybkin2018learning,\ntitle={Learning what you can do before doing anything},\nauthor={Oleh Rybkin and Karl Pertsch and Andrew Jaegle and Konstantinos G. Derpanis and Kostas Daniilidis},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SylPMnR9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;4", "wc_review": "370;440;606", "wc_reply_reviewers": "0;0;131", "wc_reply_authors": "483;732;793", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 472.0, 98.96800829897845 ], "wc_reply_reviewers_avg": [ 43.666666666666664, 61.753992223625154 ], "wc_reply_authors_avg": [ 669.3333333333334, 134.09035096613857 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6991177284121513943&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SylPMnR9Ym", "pdf": "https://openreview.net/pdf?id=SylPMnR9Ym", "email": ";;;;", "author_num": 5 }, { "id": "SylU3jC5Y7", "title": "ADAPTIVE NETWORK SPARSIFICATION VIA DEPENDENT VARIATIONAL BETA-BERNOULLI DROPOUT", "track": "main", "status": "Reject", "tldr": "We propose a novel Bayesian network sparsification method that adaptively prunes networks according to inputs.", "abstract": "While variational dropout approaches have been shown to be effective for network sparsification, they are still suboptimal in the sense that they set the dropout rate for each neuron without consideration of the input data. With such input-independent dropout, each neuron is evolved to be generic across inputs, which makes it difficult to sparsify networks without accuracy loss. 
To overcome this limitation, we propose adaptive variational dropout whose probabilities are drawn from a sparsity-inducing beta-Bernoulli prior. It allows each neuron to be evolved either to be generic or specific for certain inputs, or dropped altogether. Such input-adaptive sparsity-inducing dropout allows the resulting network to tolerate a larger degree of sparsity without losing its expressive power by removing redundancies among features. We validate our dependent variational beta-Bernoulli dropout on multiple public datasets, on which it obtains significantly more compact networks than baseline methods, with consistent accuracy improvements over the base networks.", "keywords": "Bayesian deep learning;network pruning", "primary_area": "", "supplementary_material": "", "author": "Juho Lee;Saehoon Kim;Jaehong Yoon;Hae Beom Lee;Eunho Yang;Sung Ju Hwang", "authorids": "juho.lee@stats.ox.ac.uk;shkim@aitrics.com;jaehong.yoon@kaist.ac.kr;haebeom.lee@kaist.ac.kr;eunhoy@kaist.ac.kr;sjhwang82@kaist.ac.kr", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nlee2019adaptive,\ntitle={{ADAPTIVE} {NETWORK} {SPARSIFICATION} {VIA} {DEPENDENT} {VARIATIONAL} {BETA}-{BERNOULLI} {DROPOUT}},\nauthor={Juho Lee and Saehoon Kim and Jaehong Yoon and Hae Beom Lee and Eunho Yang and Sung Ju Hwang},\nyear={2019},\nurl={https://openreview.net/forum?id=SylU3jC5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SylU3jC5Y7", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "wc_review": "759;171;646", "wc_reply_reviewers": "321;0;0", "wc_reply_authors": "1056;385;155", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 525.3333333333334, 254.76307075834643 ], "wc_reply_reviewers_avg": [ 107.0, 151.32085117392117 ], "wc_reply_authors_avg": [ 532.0, 382.23640154577987 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1629123053934656258&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SylWNnA5FQ", "title": "Program Synthesis with Learned Code Idioms", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Program synthesis of general-purpose source code from natural language specifications is challenging due to the need to reason about high-level patterns in the\ntarget program and low-level implementation details at the same time. In this work,\nwe present PATOIS, the first system that allows a neural program synthesizer to\nexplicitly interleave high-level and low-level reasoning at every generation step.
It\naccomplishes this by automatically mining common code idioms from a given corpus and then incorporating them into the underlying language for neural synthesis.\nWe evaluate PATOIS on a challenging program synthesis dataset NAPS and show\nthat using learned code idioms improves the synthesizer\u2019s accuracy.", "keywords": "program synthesis;semantic parsing;code idioms;domain-specific languages", "primary_area": "", "supplementary_material": "", "author": "Richard Shin;Marc Brockschmidt;Miltiadis Allamanis;Oleksandr Polozov", "authorids": "ricshin@cs.berkeley.edu;mabrocks@microsoft.com;miallama@microsoft.com;polozov@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=SylWNnA5FQ", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6944328686798507025&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "Sylw7nCqFQ", "title": "IMAGE DEFORMATION META-NETWORK FOR ONE-SHOT LEARNING", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Humans can robustly learn novel visual concepts even when images undergo various deformations and lose certain information. Incorporating this ability to synthesize deformed instances of new concepts might help visual recognition systems perform better one-shot learning, i.e., learning concepts from one or few examples. Our key insight is that, while the deformed images might not be visually realistic, they still maintain critical semantic information and contribute significantly in formulating classifier decision boundaries. Inspired by the recent progress on meta-learning, we combine a meta-learner with an image deformation network that produces additional training examples, and optimize both models in an end-to-end manner. The deformation network learns to synthesize images by fusing a pair of images\u2014a probe image that keeps the visual content and a gallery image that diversifies the deformations.
We demonstrate results on the widely used one-shot learning benchmarks (miniImageNet and ImageNet 1K challenge datasets), which significantly outperform the previous state-of-the-art approaches.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Zitian Chen;Yanwei Fu;Yu-Xiong Wang;Lin Ma;Wei Liu;Martial Hebert", "authorids": "tankche2@gmail.com;yanweifu@fudan.edu.cn;yuxiongw@cs.cmu.edu;forest.linma@gmail.com;wl2223@columbia.edu;hebert@ri.cmu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Sylw7nCqFQ", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;1", "wc_review": "619;551;267", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "378;395;231", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "wc_review_avg": [ 479.0, 152.45545797598282 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 334.6666666666667, 73.631213188128 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 303, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8511386292870768575&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "title": "Learning Grid Cells as Vector Representation of Self-Position Coupled with Matrix Representation of Self-Motion", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/702", "id": "Syx0Mh05YQ", "author_site": "Ruiqi Gao, Jianwen Xie, Song-Chun Zhu, Yingnian Wu", "tldr": "", "abstract": "This paper proposes a representational model for grid cells. In this model, the 2D self-position of the agent is represented by a high-dimensional vector, and the 2D self-motion or displacement of the agent is represented by a matrix that transforms the vector. Each component of the vector is a unit or a cell. The model consists of the following three sub-models. (1) Vector-matrix multiplication. The movement from the current position to the next position is modeled by matrix-vector multiplication, i.e., the vector of the next position is obtained by multiplying the matrix of the motion with the vector of the current position. (2) Magnified local isometry. The angle between two nearby vectors equals the Euclidean distance between the two corresponding positions multiplied by a magnifying factor. (3) Global adjacency kernel. The inner product between two vectors measures the adjacency between the two corresponding positions, which is defined by a kernel function of the Euclidean distance between the two positions. Our representational model has explicit algebra and geometry.
It can learn hexagon patterns of grid cells, and it is capable of error correction, path integral and path planning.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruiqi Gao;Jianwen Xie;Song-Chun Zhu;Ying Nian Wu", "authorids": "ruiqigao@ucla.edu;jianwen@ucla.edu;sczhu@stat.ucla.edu;ywu@stat.ucla.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ngao2018learning,\ntitle={Learning Grid Cells as Vector Representation of Self-Position Coupled with Matrix Representation of Self-Motion},\nauthor={Ruiqi Gao and Jianwen Xie and Song-Chun Zhu and Ying Nian Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Syx0Mh05YQ},\n}", "github": "[![github](/images/github_icon.svg) ruiqigao/GridCell](https://github.com/ruiqigao/GridCell)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;5;4", "wc_review": "383;693;231", "wc_reply_reviewers": "24;15;0", "wc_reply_authors": "1035;984;328", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 435.6666666666667, 192.25214924387424 ], "wc_reply_reviewers_avg": [ 13.0, 9.899494936611665 ], "wc_reply_authors_avg": [ 782.3333333333334, 321.93615654177285 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1267366913161335013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Syx0Mh05YQ", "pdf": "https://openreview.net/pdf?id=Syx0Mh05YQ", "email": ";;;", "author_num": 4 }, { "id": "Syx4_iCqKQ", "title": "Polar Prototype Networks", "track": "main", "status": "Reject", "tldr": "This work proposes a class of networks that can jointly perform classification and regression by imposing layout structures in the network output space.", "abstract": "This paper proposes a neural network for classification and regression, without the need to learn layout structures in the output space. Standard solutions such as softmax cross-entropy and mean squared error are effective but parametric, meaning that known inductive structures such as maximum margin separation and simplicity (Occam's Razor) need to be learned for the task at hand. Instead, we propose polar prototype networks, a class of networks that explicitly states the structure, \\ie the layout, of the output. The structure is defined by polar prototypes, points on the hypersphere of the output space. For classification, each class is described by a single polar prototype and they are a priori distributed with maximal separation and equal shares on the hypersphere. Classes are assigned to prototypes randomly or based on semantic priors and training becomes a matter of minimizing angular distances between examples and their class prototypes. For regression, we show that training can be performed as a polar interpolation between two prototypes, arriving at a regression with higher-dimensional outputs. 
From empirical analysis, we find that polar prototype networks benefit from large margin separation and semantic class structure, while only requiring a minimal amount of output dimensions. While the structure is simple, the performance is on par with (classification) or better than (regression) standard network methods. Moreover, we show that we gain the ability to perform regression and classification jointly in the same space, which is disentangled and interpretable by design.", "keywords": "prototype networks;polar prototypes;output structure", "primary_area": "", "supplementary_material": "", "author": "Pascal Mettes;Elise van der Pol;Cees G. M. Snoek", "authorids": "p.s.m.mettes@uva.nl;e.e.vanderpol@uva.nl;cgmsnoek@uva.nl", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmettes2019polar,\ntitle={Polar Prototype Networks},\nauthor={Pascal Mettes and Elise van der Pol and Cees G. M. Snoek},\nyear={2019},\nurl={https://openreview.net/forum?id=Syx4_iCqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=Syx4_iCqKQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;3", "wc_review": "304;454;324", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "718;638;482", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 360.6666666666667, 66.4997911442 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 612.6666666666666, 97.99773240006911 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:R0IePEtN3QEJ:scholar.google.com/&scioq=Polar+Prototype+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Universal Stagewise Learning for Non-Convex Problems with Convergence on Averaged Solutions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/955", "id": "Syx5V2CcFm", "author_site": "Zaiyi Chen, Zhuoning Yuan, Jinfeng Yi, Bowen Zhou, Enhong Chen, Tianbao Yang", "tldr": "", "abstract": "Although stochastic gradient descent (SGD) method and its variants (e.g., stochastic momentum methods, AdaGrad) are algorithms of choice for solving non-convex problems (especially deep learning), big gaps still remain between the theory and the practice with many questions unresolved. For example, there is still a lack of theories of convergence for SGD and its variants that use stagewise step size and return an averaged solution in practice. In addition, theoretical insights of why adaptive step size of AdaGrad could improve non-adaptive step size of SGD is still missing for non-convex optimization. This paper aims to address these questions and fill the gap between theory and practice. We propose a universal stagewise optimization framework for a broad family of non-smooth non-convex problems with the following key features: (i) at each stage any suitable stochastic convex optimization algorithms (e.g., SGD or AdaGrad) that return an averaged solution can be employed for minimizing a regularized convex problem; (ii) the step size is decreased in a stagewise manner; (iii) an averaged solution is returned as the final solution. 
Our theoretical results for stagewise AdaGrad exhibit its adaptive convergence, and therefore shed insight into its faster convergence over stagewise SGD for problems with slowly growing cumulative stochastic gradients. To the best of our knowledge, these new results are the first of their kind for addressing the unresolved issues of existing theories mentioned earlier. Besides theoretical contributions, our empirical studies show that our stagewise variants of SGD and AdaGrad improve the generalization performance of existing variants/implementations of SGD and AdaGrad. ", "keywords": "optimization;sgd;adagrad", "primary_area": "", "supplementary_material": "", "author": "Zaiyi Chen;Zhuoning Yuan;Jinfeng Yi;Bowen Zhou;Enhong Chen;Tianbao Yang", "authorids": "czy6516@hotmail.com;zhuoning-yuan@uiowa.edu;jinfengyi.ustc@gmail.com;bwen@jd.com;cheneh@ustc.edu.cn;tianbao-yang@uiowa.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nchen2018universal,\ntitle={Universal Stagewise Learning for Non-Convex Problems with Convergence on Averaged Solutions},\nauthor={Zaiyi Chen and Zhuoning Yuan and Jinfeng Yi and Bowen Zhou and Enhong Chen and Tianbao Yang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Syx5V2CcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;4;4", "wc_review": "239;206;304", "wc_reply_reviewers": "88;17;0", "wc_reply_authors": "309;539;161", "reply_reviewers": "1;1;0", "reply_authors": "3;1;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 249.66666666666666, 40.71308825863687 ], "wc_reply_reviewers_avg": [ 35.0, 38.113864493995706 ], "wc_reply_authors_avg": [ 336.3333333333333, 155.52348875830802 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13646070350942826341&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=Syx5V2CcFm", "pdf": "https://openreview.net/pdf?id=Syx5V2CcFm", "email": ";;;;;", "author_num": 6 }, { "title": "Invariant and Equivariant Graph Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/764", "id": "Syx72jC9tm", "author_site": "Haggai Maron, Heli Ben-Hamu, Nadav Shamir, Yaron Lipman", "tldr": "The paper provides a full characterization of permutation invariant and equivariant linear layers for graph data.", "abstract": "Invariant and equivariant networks have been successfully used for learning images, sets, point clouds, and graphs. A basic challenge in developing such networks is finding the maximal collection of invariant and equivariant \\emph{linear} layers. Although this question is answered for the first three examples (for popular transformations, at least), a full characterization of invariant and equivariant linear layers for graphs is not known.
\n\nIn this paper we provide a characterization of all permutation invariant and equivariant linear layers for (hyper-)graph data, and show that their dimensions, in the case of edge-value graph data, are $2$ and $15$, respectively. More generally, for graph data defined on $k$-tuples of nodes, the dimensions are the $k$-th and $2k$-th Bell numbers. Orthogonal bases for the layers are computed, including generalization to multi-graph data. The constant number of basis elements and their characteristics allow successfully applying the networks to graphs of different sizes. From the theoretical point of view, our results generalize and unify recent advancements in equivariant deep learning. In particular, we show that our model is capable of approximating any message passing neural network.\n\nApplying these new linear layers in a simple deep neural network framework is shown to achieve comparable results to the state of the art and to have better expressivity than previous invariant and equivariant bases.\n", "keywords": "graph learning;equivariance;deep learning", "primary_area": "", "supplementary_material": "", "author": "Haggai Maron;Heli Ben-Hamu;Nadav Shamir;Yaron Lipman", "authorids": "haggai.maron@weizmann.ac.il;heli.benhamu@weizmann.ac.il;nadav13@gmail.com;yaron.lipman@weizmann.ac.il", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmaron2018invariant,\ntitle={Invariant and Equivariant Graph Networks},\nauthor={Haggai Maron and Heli Ben-Hamu and Nadav Shamir and Yaron Lipman},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Syx72jC9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;8;9", "confidence": "5;5;4", "wc_review": "733;221;601", "wc_reply_reviewers": "381;0;0", "wc_reply_authors": "929;287;212", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 2.160246899469287 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 518.3333333333334, 217.04275052522615 ], "wc_reply_reviewers_avg": [ 127.0, 179.60512242138307 ], "wc_reply_authors_avg": [ 476.0, 321.77942755869276 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.654653670707977, "gs_citation": 599, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17830445355098449552&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=Syx72jC9tm", "pdf": "https://openreview.net/pdf?id=Syx72jC9tm", "email": ";;;", "author_num": 4 }, { "id": "Syx9rnRcYm", "title": "A CASE STUDY ON OPTIMAL DEEP LEARNING MODEL FOR UAVS", "track": "main", "status": "Reject", "tldr": "case study on optimal deep learning model for UAVs", "abstract": "Over time, Unmanned Autonomous Vehicles (UAVs), especially autonomous flying drones, have attracted a lot of attention in Artificial Intelligence. Since electronic technology is getting smaller, cheaper and more efficient, huge advancements in the study of UAVs have been observed recently. From monitoring floods and discerning the spread of algae in water bodies to detecting forest trails, their applications are far and wide.
Our work is mainly focused on autonomous flying drones, for which we establish a case study on efficiency, robustness and accuracy, with results that are well supported through experiments. We provide details of the software and hardware architecture used in the study. We further discuss our implementation algorithms and present experiments that compare three different state-of-the-art algorithms, namely TrailNet, InceptionResnet and MobileNet, in terms of accuracy, robustness, power consumption and inference time. In our study, we show that MobileNet produces better results with much lower computational requirements and power consumption. We also report the challenges we faced during our work, as well as a brief discussion of future work to improve safety features and performance.", "keywords": "Energy Efficiency;Autonomous Flying;Trail Detection", "primary_area": "", "supplementary_material": "", "author": "Chandan Kumar;Subrahmanyam Vaddi;Aishwarya Sarkar", "authorids": "chandan@iastate.edu;svaddi@iastate.edu;asarkar1@iastate.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkumar2019a,\ntitle={A {CASE} {STUDY} {ON} {OPTIMAL} {DEEP} {LEARNING} {MODEL} {FOR} {UAVS}},\nauthor={Chandan Kumar and Subrahmanyam Vaddi and Aishwarya Sarkar},\nyear={2019},\nurl={https://openreview.net/forum?id=Syx9rnRcYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer5", "site": "https://openreview.net/forum?id=Syx9rnRcYm", "pdf_size": 0, "rating": "2;3;3", "confidence": "2;2;3", "wc_review": "541;277;292", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 370.0, 121.07022755409358 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15118569207969131514&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Robustness May Be at Odds with Accuracy", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1032", "id": "SyxAb30cY7", "author_site": "Dimitris Tsipras, Shibani Santurkar, Logan Engstrom, Alexander Turner, Aleksander Madry", "tldr": "We show that adversarial robustness might come at the cost of standard classification performance, but also yields unexpected benefits.", "abstract": "We show that there exists an inherent tension between the goal of adversarial robustness and that of standard generalization. \nSpecifically, training robust models may not only be more resource-consuming, but also lead to a reduction of standard accuracy. We demonstrate that this trade-off between the standard accuracy of a model and its robustness to adversarial perturbations provably exists even in a fairly simple and natural setting. These findings also corroborate a similar phenomenon observed in practice. Further, we argue that this phenomenon is a consequence of robust classifiers learning fundamentally different feature representations than standard classifiers.
These differences, in particular, seem to result in unexpected benefits: the features learned by robust models tend to align better with salient data characteristics and human perception.", "keywords": "adversarial examples;robust machine learning;robust optimization;deep feature representations", "primary_area": "", "supplementary_material": "", "author": "Dimitris Tsipras;Shibani Santurkar;Logan Engstrom;Alexander Turner;Aleksander Madry", "authorids": "tsipras@mit.edu;shibani@mit.edu;engstrom@mit.edu;turneram@mit.edu;madry@mit.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ntsipras2018robustness,\ntitle={Robustness May Be at Odds with Accuracy},\nauthor={Dimitris Tsipras and Shibani Santurkar and Logan Engstrom and Alexander Turner and Aleksander Madry},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxAb30cY7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=SyxAb30cY7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;2;3", "wc_review": "304;167;339", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "141;33;396", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 270.0, 74.22039252568439 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 190.0, 152.190669884852 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 2099, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5850945088404252192&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=SyxAb30cY7", "pdf": "https://openreview.net/pdf?id=SyxAb30cY7", "email": ";;;;", "author_num": 5 }, { "id": "SyxHKjAcYX", "title": "Zero-Resource Multilingual Model Transfer: Learning What to Share", "track": "main", "status": "Reject", "tldr": "A zero-resource multilingual transfer learning model that requires neither target language training data nor cross-lingual resources.", "abstract": "Modern natural language processing and understanding applications have enjoyed a great boost utilizing neural networks models. However, this is not the case for most languages especially low-resource ones with insufficient annotated training data. Cross-lingual transfer learning methods improve the performance on a low-resource target language by leveraging labeled data from other (source) languages, typically with the help of cross-lingual resources such as parallel corpora. In this work, we propose a zero-resource multilingual transfer learning model that can utilize training data in multiple source languages, while not requiring target language training data nor cross-lingual supervision. Unlike most existing methods that only rely on language-invariant features for cross-lingual transfer, our approach utilizes both language-invariant and language-specific features in a coherent way. 
Our model leverages adversarial networks to learn language-invariant features and mixture-of-experts models to dynamically exploit the relation between the target language and each individual source language. This enables our model to learn effectively what to share between various languages in the multilingual setup. It results in significant performance gains over prior art, as shown in an extensive set of experiments over multiple text classification and sequence tagging tasks including a large-scale real-world industry dataset.", "keywords": "cross-lingual transfer learning;multilingual transfer learning;zero-resource model transfer;adversarial training;mixture of experts;multilingual natural language understanding", "primary_area": "", "supplementary_material": "", "author": "Xilun Chen;Ahmed Hassan Awadallah;Hany Hassan;Wei Wang;Claire Cardie", "authorids": "xlchen@cs.cornell.edu;hassanam@microsoft.com;hanyh@microsoft.com;wei.wang@microsoft.com;cardie@cs.cornell.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nchen2019zeroresource,\ntitle={Zero-Resource Multilingual Model Transfer: Learning What to Share},\nauthor={Xilun Chen and Ahmed Hassan Awadallah and Hany Hassan and Wei Wang and Claire Cardie},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxHKjAcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyxHKjAcYX", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;4;4", "wc_review": "256;302;190", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "359;238;280", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 249.33333333333334, 45.966171135835204 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 292.3333333333333, 50.16195991209098 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17647734947501122603&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "SyxMWh09KX", "title": "Attentive Task-Agnostic Meta-Learning for Few-Shot Text Classification", "track": "main", "status": "Reject", "tldr": "Meta-learning task-agnostic representations with attention.", "abstract": "Current deep learning based text classification methods are limited by their ability to achieve fast learning and generalization when the data is scarce. We address this problem by integrating a meta-learning procedure that uses the knowledge learned across many tasks as an inductive bias towards better natural language understanding. Inspired by the Model-Agnostic Meta-Learning framework (MAML), we introduce the Attentive Task-Agnostic Meta-Learning (ATAML) algorithm for text classification. The proposed ATAML is designed to encourage task-agnostic representation learning by way of task-agnostic parameterization and facilitate task-specific adaptation via attention mechanisms. We provide evidence to show that the attention mechanism in ATAML has a synergistic effect on learning performance. 
Our experimental results reveal that, for few-shot text classification tasks, gradient-based meta-learning approaches outperform popular transfer learning methods. In comparisons with models trained from random initialization, pretrained models and meta-trained MAML, our proposed ATAML method generalizes better on single-label and multi-label classification tasks on the miniRCV1 and miniReuters-21578 datasets.", "keywords": "meta-learning;learning to learn;few-shot learning", "primary_area": "", "supplementary_material": "", "author": "Xiang Jiang;Mohammad Havaei;Gabriel Chartrand;Hassan Chouaib;Thomas Vincent;Andrew Jesson;Nicolas Chapados;Stan Matwin", "authorids": "xiang.jiang@dal.ca;mohammad@imagia.com;gabriel@imagia.com;hassan.chouaib@imagia.com;thomas.vincent@imagia.com;andrew.jesson@imagia.com;nic@imagia.com;stan@cs.dal.ca", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\njiang2019attentive,\ntitle={Attentive Task-Agnostic Meta-Learning for Few-Shot Text Classification},\nauthor={Xiang Jiang and Mohammad Havaei and Gabriel Chartrand and Hassan Chouaib and Thomas Vincent and Andrew Jesson and Nicolas Chapados and Stan Matwin},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxMWh09KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyxMWh09KX", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;4;3", "wc_review": "165;409;146", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 240.0, 119.75252258999251 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15704130194915928400&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "SyxXhsAcFQ", "title": "Cohen Welling bases & SO(2)-Equivariant classifiers using Tensor nonlinearity.", "track": "main", "status": "Reject", "tldr": "", "abstract": "In this paper we propose autoencoder architectures for learning a Cohen-Welling\n(CW)-basis for images and their rotations. We use the learned CW-basis to build\na rotation equivariant classifier to classify images. The autoencoder and classifier architectures use only tensor product nonlinearity. The model proposed by\nCohen & Welling (2014) uses ideas from group representation theory, and extracts\na basis exposing irreducible representations for images and their rotations. We\ngive several architectures to learn CW-bases including a novel coupling AE architecture to learn coupled CW-bases for images in different scales simultaneously.\nOur use of tensor product nonlinearity is inspired by recent work of Kondor\n(2018a). Our classifier has very good accuracy and we use fewer parameters.\nEven when the sample complexity to learn a good CW-basis is low, we learn classifiers which perform impressively.
We show that a coupled CW-basis for one scale can be deployed to classify images in a classifier trained and tested on images at a different scale, with only a marginal dip in performance.", "keywords": "group representations;group equivariant networks;tensor product nonlinearity", "primary_area": "", "supplementary_material": "", "author": "Muthuvel Murugan;K Venkata Subrahmanyam", "authorids": "muthu@cmi.ac.in;kv@cmi.ac.in", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmurugan2019cohen,\ntitle={Cohen Welling bases & {SO}(2)-Equivariant classifiers using Tensor nonlinearity.},\nauthor={Muthuvel Murugan and K Venkata Subrahmanyam},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxXhsAcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyxXhsAcFQ", "pdf_size": 0, "rating": "3;6;7", "confidence": "2;2;4", "wc_review": "649;297;673", "wc_reply_reviewers": "0;17;0", "wc_reply_authors": "2238;657;794", "reply_reviewers": "0;1;0", "reply_authors": "4;2;1", "rating_avg": [ 5.333333333333333, 1.699673171197595 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 539.6666666666666, 171.87075247276817 ], "wc_reply_reviewers_avg": [ 5.666666666666667, 8.013876853447538 ], "wc_reply_authors_avg": [ 1229.6666666666667, 715.189640740288 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.6933752452815365, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GLAp4PLCUyoJ:scholar.google.com/&scioq=Cohen+Welling+bases+%26+SO(2)-Equivariant+classifiers+using+Tensor+nonlinearity.&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "SyxYEoA5FX", "title": "Invariance and Inverse Stability under ReLU", "track": "main", "status": "Reject", "tldr": "We analyze the invertibility of deep neural networks by studying preimages of ReLU-layers and the stability of the inverse.", "abstract": "We flip the usual approach to study invariance and robustness of neural networks by considering the non-uniqueness and instability of the inverse mapping. We provide theoretical and numerical results on the inverse of ReLU-layers. First, we derive a necessary and sufficient condition on the existence of invariance that provides a geometric interpretation. Next, we move to robustness via analyzing local effects on the inverse.
To conclude, we show how this reverse point of view not only provides insights into key effects, but also enables us to view adversarial examples from different perspectives.", "keywords": "deep neural networks;invertibility;invariance;robustness;ReLU networks", "primary_area": "", "supplementary_material": "", "author": "Jens Behrmann;S\u00f6ren Dittmer;Pascal Fernsel;Peter Maass", "authorids": "jensb@uni-bremen.de;sdittmer@math.uni-bremen.de;pfernsel@math.uni-bremen.de;pmaass@uni-bremen.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbehrmann2019invariance,\ntitle={Invariance and Inverse Stability under Re{LU}},\nauthor={Jens Behrmann and S\u00f6ren Dittmer and Pascal Fernsel and Peter Maass},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxYEoA5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=SyxYEoA5FX", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;3", "wc_review": "329;613;367", "wc_reply_reviewers": "76;34;30", "wc_reply_authors": "339;540;603", "reply_reviewers": "1;1;1", "reply_authors": "1;1;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 436.3333333333333, 125.88177875380623 ], "wc_reply_reviewers_avg": [ 46.666666666666664, 20.805982045769646 ], "wc_reply_authors_avg": [ 494.0, 112.57886124845996 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:hX-eYfol0NcJ:scholar.google.com/&scioq=Invariance+and+Inverse+Stability+under+ReLU&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Feature Intertwiner for Object Detection", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/698", "id": "SyxZJn05YX", "author_site": "Hongyang Li, Bo Dai, Shaoshuai Shi, Wanli Ouyang, Xiaogang Wang", "tldr": "(Camera-ready version) A feature intertwiner module to leverage features from one accurate set to help the learning of another less reliable set.", "abstract": "A well-trained model should classify objects with a unanimous score for every category. This requires that the high-level semantic features be alike among samples, despite a wide span in resolution, texture, deformation, etc. Previous works focus on re-designing the loss function or proposing new regularization constraints on the loss. In this paper, we address this problem via a new perspective. For each category, it is assumed that there are two sets in the feature space: one with more reliable information and the other with a less reliable source. We argue that the reliable set could guide the feature learning of the less reliable set during training - in the spirit of a student mimicking a teacher\u2019s behavior and thus pushing towards a more compact class centroid in the high-dimensional space. Such a scheme also benefits the reliable set since samples become closer within the same category - implying that it is easier for the classifier to identify them. We refer to this mutual learning process as feature intertwiner and embed the spirit into object detection.
It is well-known that objects of low resolution are more difficult to detect due to the loss of detailed information during network forward pass. We thus regard objects of high resolution as the reliable set and objects of low resolution as the less reliable set. Specifically, an intertwiner is achieved by minimizing the distribution divergence between two sets. We design a historical buffer to represent all previous samples in the reliable set and utilize them to guide the feature learning of the less reliable set. The design of obtaining an effective feature representation for the reliable set is further investigated, where we introduce the optimal transport (OT) algorithm into the framework. Samples in the less reliable set are better aligned with the reliable set with aid of OT metric. Incorporated with such a plug-and-play intertwiner, we achieve an evident improvement over previous state-of-the-arts on the COCO object detection benchmark.", "keywords": "feature learning;computer vision;deep learning", "primary_area": "", "supplementary_material": "", "author": "Hongyang Li;Bo Dai;Shaoshuai Shi;Wanli Ouyang;Xiaogang Wang", "authorids": "yangli@ee.cuhk.edu.hk;db014@ie.cuhk.edu.hk;shaoss@link.cuhk.edu.hk;wanli.ouyang@gmail.com;xgwang@ee.cuhk.edu.hk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nli2018feature,\ntitle={Feature Intertwiners},\nauthor={Hongyang Li and Bo Dai and Shaoshuai Shi and Wanli Ouyang and Xiaogang Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxZJn05YX},\n}", "github": "[![github](/images/github_icon.svg) hli2020/feature_intertwiner](https://github.com/hli2020/feature_intertwiner) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=SyxZJn05YX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;7;9", "confidence": "4;3;4", "wc_review": "109;321;450", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "652;743;672", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 1.632993161855452 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 293.3333333333333, 140.58054235522386 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 689.0, 39.04698025029166 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1331733591833237522&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SyxZJn05YX", "pdf": "https://openreview.net/pdf?id=SyxZJn05YX", "email": ";;;;", "author_num": 5 }, { "id": "SyxZOsA9tX", "title": "Accelerated Value Iteration via Anderson Mixing", "track": "main", "status": "Reject", "tldr": "", "abstract": "Acceleration for reinforcement learning methods is an important and challenging theme. We introduce the Anderson acceleration technique into the value iteration, developing an accelerated value iteration algorithm that we call Anderson Accelerated Value Iteration (A2VI). 
We further apply our method to the Deep Q-learning algorithm, resulting in the Deep Anderson Accelerated Q-learning (DA2Q) algorithm. Our approach can be viewed as an approximation of the policy evaluation by interpolating on historical data. A2VI is more efficient than the modified policy iteration, which is a classical approximate method for policy evaluation. We give a theoretical analysis of our algorithm and conduct experiments on both toy problems and Atari games. Both the theoretical and empirical results show the effectiveness of our algorithm.", "keywords": "Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Yujun Li;Chengzhuo Ni;Guangzeng Xie;Wenhao Yang;Shuchang Zhou;Zhihua Zhang", "authorids": "liyujun145@gmail.com;hzxsncz@pku.edu.cn;smsxgz@pku.edu.cn;yangwenhaosms@pku.edu.cn;zsc@megvii.com;zhzhang@math.pku.edu.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nli2019accelerated,\ntitle={Accelerated Value Iteration via Anderson Mixing},\nauthor={Yujun Li and Chengzhuo Ni and Guangzeng Xie and Wenhao Yang and Shuchang Zhou and Zhihua Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxZOsA9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyxZOsA9tX", "pdf_size": 0, "rating": "4;4;7", "confidence": "3;4;4", "wc_review": "218;772;173", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "786;355;182", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 387.6666666666667, 272.3849400307505 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 441.0, 253.96981447933268 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12908306699818217399&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Adversarial Reprogramming of Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1124", "id": "Syx_Ss05tm", "author_site": "Gamaleldin Elsayed, Ian Goodfellow, Jascha Sohl-Dickstein", "tldr": "We introduce the first instance of adversarial attacks that reprogram the target model to perform a task chosen by the attacker---without the attacker needing to specify or compute the desired output for each test-time input.", "abstract": "Deep neural networks are susceptible to adversarial attacks. In computer vision, well-crafted perturbations to images can cause neural networks to make mistakes such as confusing a cat with a computer. Previous adversarial attacks have been designed to degrade performance of models or cause machine learning models to produce specific outputs chosen ahead of time by the attacker. We introduce attacks that instead reprogram the target model to perform a task chosen by the attacker without the attacker needing to specify or compute the desired output for each test-time input. 
This attack finds a single adversarial perturbation, that can be added to all test-time inputs to a machine learning model in order to cause the model to perform a task chosen by the adversary\u2014even if the model was not trained to do this task. These perturbations can thus be considered a program for the new task. We demonstrate adversarial reprogramming on six ImageNet classification models, repurposing these models to perform a counting task, as well as classification tasks: classification of MNIST and CIFAR-10 examples presented as inputs to the ImageNet model.", "keywords": "Adversarial;Neural Networks;Machine Learning Security", "primary_area": "", "supplementary_material": "", "author": "Gamaleldin F. Elsayed;Ian Goodfellow;Jascha Sohl-Dickstein", "authorids": "gamaleldin.elsayed@gmail.com;goodfellow@google.com;jaschasd@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nelsayed2018adversarial,\ntitle={Adversarial Reprogramming of Neural Networks},\nauthor={Gamaleldin F. Elsayed and Ian Goodfellow and Jascha Sohl-Dickstein},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Syx_Ss05tm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 6 community implementations](https://paperswithcode.com/paper/?openreview=Syx_Ss05tm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;6;8", "confidence": "3;5;4", "wc_review": "88;761;497", "wc_reply_reviewers": "0;0;42", "wc_reply_authors": "255;500;208", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 448.6666666666667, 276.86860100456 ], "wc_reply_reviewers_avg": [ 14.0, 19.79898987322333 ], "wc_reply_authors_avg": [ 321.0, 128.01822786879478 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 227, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14670912168580243999&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=Syx_Ss05tm", "pdf": "https://openreview.net/pdf?id=Syx_Ss05tm", "email": ";;", "author_num": 3 }, { "id": "SyxaYsAqY7", "title": "Second-Order Adversarial Attack and Certifiable Robustness", "track": "main", "status": "Reject", "tldr": "", "abstract": "Adversarial training has been recognized as a strong defense against adversarial attacks. In this paper, we propose a powerful second-order attack method that reduces the accuracy of the defense model by Madry et al. (2017). We demonstrate that adversarial training overfits to the choice of the norm in the sense that it is only robust to the attack used for adversarial training, thus suggesting it has not achieved universal robustness. The effectiveness of our attack method motivates an investigation of provable robustness of a defense model. To this end, we introduce a framework that allows one to obtain a certifiable lower bound on the prediction accuracy against adversarial examples. We conduct experiments to show the effectiveness of our attack method. 
At the same time, our defense model achieves significant improvements compared to previous works under our proposed attack.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bai Li;Changyou Chen;Wenlin Wang;Lawrence Carin", "authorids": "bai.li@duke.edu;cchangyou@gmail.com;wenlin.wang@duke.edu;lcarin@duke.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2019secondorder,\ntitle={Second-Order Adversarial Attack and Certifiable Robustness},\nauthor={Bai Li and Changyou Chen and Wenlin Wang and Lawrence Carin},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxaYsAqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyxaYsAqY7", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;5;3", "wc_review": "1085;825;606", "wc_reply_reviewers": "0;419;0", "wc_reply_authors": "1075;877;531", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 838.6666666666666, 195.78956957804354 ], "wc_reply_reviewers_avg": [ 139.66666666666666, 197.5184942114423 ], "wc_reply_authors_avg": [ 827.6666666666666, 224.8100432711038 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 114, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17602793403018033414&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "SyxdbnR9YQ", "title": "HANDLING CONCEPT DRIFT IN WIFI-BASED INDOOR LOCALIZATION USING REPRESENTATION LEARNING", "track": "main", "status": "Withdraw", "tldr": "We introduce an augmented robust feature space for streaming wifi data that is capable of tackling concept drift for indoor localization", "abstract": "We outline the problem of concept drifts for time series data. In this work, we analyze the temporal inconsistency of streaming wireless signals in the context of device-free passive indoor localization. We show that data obtained from WiFi channel state information (CSI) can be used to train a robust system capable of performing room level localization. One of the most challenging issues for such a system is the movement of input data distribution to an unexplored space over time, which leads to an unwanted shift in the learned boundaries of the output space. In this work, we propose a phase and magnitude augmented feature space along with a standardization technique that is little affected by drifts. We show that this robust representation of the data yields better learning accuracy and requires less number of retraining. 
", "keywords": "concept drift;wifi localization;feature representation.", "primary_area": "", "supplementary_material": "", "author": "Raihan Seraj;Negar Ghourchian;Michel Allegue-Martinez", "authorids": "raihan.seraj@mail.mcgill.ca;negar.gh@aerial.ai;michel.allegue@aerial.ai", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyxdbnR9YQ", "pdf_size": 0, "rating": "2;3;4", "confidence": "1;4;4", "wc_review": "99;158;97", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "wc_review_avg": [ 118.0, 28.296053906272277 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tx97-roi8MAJ:scholar.google.com/&scioq=HANDLING+CONCEPT+DRIFT+IN+WIFI-BASED+INDOOR+LOCALIZATION+USING+REPRESENTATION+LEARNING&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "G-SGD: Optimizing ReLU Neural Networks in its Positively Scale-Invariant Space", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/724", "id": "SyxfEn09Y7", "author_site": "Qi Meng, Shuxin Zheng, Huishuai Zhang, Wei Chen, Qiwei Ye, Zhi-Ming Ma, Nenghai Yu, Tie-Yan Liu", "tldr": "", "abstract": "It is well known that neural networks with rectified linear units (ReLU) activation functions are positively scale-invariant. Conventional algorithms like stochastic gradient descent optimize the neural networks in the vector space of weights, which is, however, not positively scale-invariant. This mismatch may lead to problems during the optimization process. Then, a natural question is: \\emph{can we construct a new vector space that is positively scale-invariant and sufficient to represent ReLU neural networks so as to better facilitate the optimization process }? In this paper, we provide our positive answer to this question. First, we conduct a formal study on the positive scaling operators which forms a transformation group, denoted as $\\mathcal{G}$. We prove that the value of a path (i.e. the product of the weights along the path) in the neural network is invariant to positive scaling and the value vector of all the paths is sufficient to represent the neural networks under mild conditions. Second, we show that one can identify some basis paths out of all the paths and prove that the linear span of their value vectors (denoted as $\\mathcal{G}$-space) is an invariant space with lower dimension under the positive scaling group. Finally, we design stochastic gradient descent algorithm in $\\mathcal{G}$-space (abbreviated as $\\mathcal{G}$-SGD) to optimize the value vector of the basis paths of neural networks with little extra cost by leveraging back-propagation. Our experiments show that $\\mathcal{G}$-SGD significantly outperforms the conventional SGD algorithm in optimizing ReLU networks on benchmark datasets. 
", "keywords": "optimization;neural network;irreducible positively scale-invariant space;deep learning", "primary_area": "", "supplementary_material": "", "author": "Qi Meng;Shuxin Zheng;Huishuai Zhang;Wei Chen;Qiwei Ye;Zhi-Ming Ma;Nenghai Yu;Tie-Yan Liu", "authorids": "meq@microsoft.com;zhengsx@mail.ustc.edu.cn;huzhang@microsoft.com;wche@microsoft.com;qiwye@microsoft.com;mazm@amt.ac.cn;ynh@ustc.edu.cn;tyliu@microsoft.com", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nmeng2018gsgd,\ntitle={G-{SGD}: Optimizing Re{LU} Neural Networks in its Positively Scale-Invariant Space},\nauthor={Qi Meng and Shuxin Zheng and Huishuai Zhang and Wei Chen and Zhi-Ming Ma and Tie-Yan Liu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxfEn09Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;3;2", "wc_review": "252;440;108", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "646;486;55", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 266.6666666666667, 135.9346248099513 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 395.6666666666667, 249.58676959237155 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17842882787808230795&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=SyxfEn09Y7", "pdf": "https://openreview.net/pdf?id=SyxfEn09Y7", "email": ";;;;;;;", "author_num": 8 }, { "id": "Syxgbh05tQ", "title": "Lyapunov-based Safe Policy Optimization", "track": "main", "status": "Reject", "tldr": "Safe Reinforcement Learning Algorithms for Continuous Control", "abstract": "In many reinforcement learning applications, it is crucial that the agent interacts with the environment only through safe policies, i.e.,~policies that do not take the agent to certain undesirable situations. These problems are often formulated as a constrained Markov decision process (CMDP) in which the agent's goal is to optimize its main objective while not violating a number of safety constraints. In this paper, we propose safe policy optimization algorithms that are based on the Lyapunov approach to CMDPs, an approach that has well-established theoretical guarantees in control engineering. We first show how to generate a set of state-dependent Lyapunov constraints from the original CMDP safety constraints. We then propose safe policy gradient algorithms that train a neural network policy using DDPG or PPO, while guaranteeing near-constraint satisfaction at every policy update by projecting either the policy parameter or the action onto the set of feasible solutions induced by the linearized Lyapunov constraints. Unlike the existing (safe) constrained PG algorithms, ours are more data efficient as they are able to utilize both on-policy and off-policy data. 
Furthermore, the action-projection version of our algorithms often leads to less conservative policy updates and allows for natural integration into an end-to-end PG training pipeline. We evaluate our algorithms and compare them with CPO and the Lagrangian method on several high-dimensional continuous state and action simulated robot locomotion tasks, in which the agent must satisfy certain safety constraints while minimizing its expected cumulative cost. ", "keywords": "Reinforcement Learning;Safe Learning;Lyapunov Functions;Constrained Markov Decision Problems", "primary_area": "", "supplementary_material": "", "author": "Yinlam Chow;Ofir Nachum;Mohammad Ghavamzadeh;Edgar Guzman-Duenez", "authorids": "yinlamchow@google.com;ofirnachum@google.com;mohammad.ghavamzadeh@inria.fr;duenez@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchow2019lyapunovbased,\ntitle={Lyapunov-based Safe Policy Optimization},\nauthor={Yinlam Chow and Ofir Nachum and Mohammad Ghavamzadeh and Edgar Guzman-Duenez},\nyear={2019},\nurl={https://openreview.net/forum?id=Syxgbh05tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=Syxgbh05tQ", "pdf_size": 0, "rating": "5;6;6;8", "confidence": "3;2;2;3", "wc_review": "343;189;319;91", "wc_reply_reviewers": "35;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "1;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 6.25, 1.0897247358851685 ], "confidence_avg": [ 2.5, 0.5 ], "wc_review_avg": [ 235.5, 101.94483802527718 ], "wc_reply_reviewers_avg": [ 8.75, 15.155444566227676 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.2294157338705618, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4455469996975046673&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SyxknjC9KQ", "title": "Dense Morphological Network: An Universal Function Approximator", "track": "main", "status": "Reject", "tldr": "Using mophological operation (dilation and erosion) we have defined a class of network which can approximate any continious function. ", "abstract": "Artificial neural networks are built on the basic operation of linear combination and non-linear activation function. Theoretically this structure can approximate any continuous function with three layer architecture. But in practice learning the parameters of such network can be hard. Also the choice of activation function can greatly impact the performance of the network. In this paper we are proposing to replace the basic linear combination operation with non-linear operations that do away with the need of additional non-linear activation function. To this end we are proposing the use of elementary morphological operations (dilation and erosion) as the basic operation in neurons. We show that these networks (Denoted as Morph-Net) with morphological operations can approximate any smooth function requiring less number of parameters than what is necessary for normal neural networks. The results show that our network perform favorably when compared with similar structured network. 
We have carried out our experiments on MNIST, Fashion-MNIST, CIFAR10 and CIFAR100.", "keywords": "Mathematical Morphology;Neural Network;Activation Function;Universal Aproximatimation.", "primary_area": "", "supplementary_material": "", "author": "Ranjan Mondal;Sanchayan Santra;Bhabatosh Chanda", "authorids": "ranjan.rev@gmail.com;sanchayan_r@isical.ac.in;chanda@isical.ac.in", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmondal2019dense,\ntitle={Dense Morphological Network: An Universal Function Approximator},\nauthor={Ranjan Mondal and Sanchayan Santra and Bhabatosh Chanda},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxknjC9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=SyxknjC9KQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;4;3", "wc_review": "400;166;358", "wc_reply_reviewers": "0;93;0", "wc_reply_authors": "998;588;850", "reply_reviewers": "0;1;0", "reply_authors": "2;2;2", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 308.0, 101.86265262597475 ], "wc_reply_reviewers_avg": [ 31.0, 43.840620433565945 ], "wc_reply_authors_avg": [ 812.0, 169.5248261071713 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6178292826311846709&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "SyxnvsAqFm", "title": "Computation-Efficient Quantization Method for Deep Neural Networks", "track": "main", "status": "Reject", "tldr": "A simple computation-efficient quantization training method for CNNs and RNNs.", "abstract": "Deep Neural Networks, being memory and computation intensive, are a challenge to deploy in smaller devices. Numerous quantization techniques have been proposed to reduce the inference latency/memory consumption. However, these techniques impose a large overhead on the training procedure or need to change the training process. We present a non-intrusive quantization technique based on re-training the full precision model, followed by directly optimizing the corresponding binary model. The quantization training process takes no longer than the original training process. We also propose a new loss function to regularize the weights, resulting in reduced quantization error. Combining both help us achieve full precision accuracy on CIFAR dataset using binary quantization. We also achieve full precision accuracy on WikiText-2 using 2 bit quantization. Comparable results are also shown for ImageNet. 
We also present a 1.5 bits hybrid model exceeding the performance of TWN LSTM model for WikiText-2.", "keywords": "quantization;binary;ternary;flat minima;model compression;deep learning", "primary_area": "", "supplementary_material": "", "author": "Parichay Kapoor;Dongsoo Lee;Byeongwook Kim;Saehyung Lee", "authorids": "kparichay@gmail.com;dslee3@gmail.com;guddnr145@gmail.com;halo8218@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkapoor2019computationefficient,\ntitle={Computation-Efficient Quantization Method for Deep Neural Networks},\nauthor={Parichay Kapoor and Dongsoo Lee and Byeongwook Kim and Saehyung Lee},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxnvsAqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=SyxnvsAqFm", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "wc_review": "191;209;302", "wc_reply_reviewers": "24;78;143", "wc_reply_authors": "259;235;654", "reply_reviewers": "1;1;1", "reply_authors": "1;1;3", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 234.0, 48.641546028061235 ], "wc_reply_reviewers_avg": [ 81.66666666666667, 48.65068230650921 ], "wc_reply_authors_avg": [ 382.6666666666667, 192.11165734772288 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3796908623858316493&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "From Hard to Soft: Understanding Deep Network Nonlinearities via Vector Quantization and Statistical Inference", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/991", "id": "Syxt2jC5FX", "author_site": "Randall Balestriero, Richard Baraniuk", "tldr": "Reformulate deep networks nonlinearities from a vector quantization scope and bridge most known nonlinearities together.", "abstract": "Nonlinearity is crucial to the performance of a deep (neural) network (DN).\nTo date there has been little progress understanding the menagerie of available nonlinearities, but recently progress has been made on understanding the r\\^{o}le played by piecewise affine and convex nonlinearities like the ReLU and absolute value activation functions and max-pooling.\nIn particular, DN layers constructed from these operations can be interpreted as {\\em max-affine spline operators} (MASOs) that have an elegant link to vector quantization (VQ) and $K$-means.\nWhile this is good theoretical progress, the entire MASO approach is predicated on the requirement that the nonlinearities be piecewise affine and convex, which precludes important activation functions like the sigmoid, hyperbolic tangent, and softmax.\n{\\em This paper extends the MASO framework to these and an infinitely large class of new nonlinearities by linking deterministic MASOs with probabilistic Gaussian Mixture Models (GMMs).}\nWe show that, under a GMM, piecewise affine, convex nonlinearities like ReLU, absolute value, and max-pooling can be interpreted as solutions to certain natural ``hard'' VQ inference problems, while sigmoid, hyperbolic tangent, and softmax can be interpreted as solutions to corresponding 
``soft'' VQ inference problems.\nWe further extend the framework by hybridizing the hard and soft VQ optimizations to create a $\\beta$-VQ inference that interpolates between hard, soft, and linear VQ inference.\nA prime example of a $\\beta$-VQ DN nonlinearity is the {\\em swish} nonlinearity, which offers state-of-the-art performance in a range of computer vision tasks but was developed ad hoc by experimentation.\nFinally, we validate with experiments an important assertion of our theory, namely that DN performance can be significantly improved by enforcing orthogonality in its linear filters.\n", "keywords": "Spline;Vector Quantization;Inference;Nonlinearities;Deep Network", "primary_area": "", "supplementary_material": "", "author": "Randall Balestriero;Richard Baraniuk", "authorids": "randallbalestriero@gmail.com;richb@rice.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nbalestriero2018from,\ntitle={From Hard to Soft: Understanding Deep Network Nonlinearities via Vector Quantization and Statistical Inference},\nauthor={Randall Balestriero and Richard Baraniuk},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Syxt2jC5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;5", "wc_review": "345;215;186", "wc_reply_reviewers": "0;10;0", "wc_reply_authors": "561;215;141", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 248.66666666666666, 69.13915115346892 ], "wc_reply_reviewers_avg": [ 3.3333333333333335, 4.714045207910316 ], "wc_reply_authors_avg": [ 305.6666666666667, 183.05797503037724 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6884032427867360186&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Syxt2jC5FX", "pdf": "https://openreview.net/pdf?id=Syxt2jC5FX", "email": ";", "author_num": 2 }, { "title": "Aggregated Momentum: Stability Through Passive Damping", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/990", "id": "Syxt5oC5YQ", "author_site": "James Lucas, Shengyang Sun, Richard Zemel, Roger Grosse", "tldr": "We introduce a simple variant of momentum optimization which is able to outperform classical momentum, Nesterov, and Adam on deep learning tasks with minimal hyperparameter tuning.", "abstract": "Momentum is a simple and widely used trick which allows gradient-based optimizers to pick up speed along low curvature directions. Its performance depends crucially on a damping coefficient. Largecamping coefficients can potentially deliver much larger speedups, but are prone to oscillations and instability; hence one typically resorts to small values such as 0.5 or 0.9. We propose Aggregated Momentum (AggMo), a variant of momentum which combines multiple velocity vectors with different damping coefficients. 
AggMo is trivial to implement, but significantly dampens oscillations, enabling it to remain stable even for aggressive damping coefficients such as 0.999. We reinterpret Nesterov's accelerated gradient descent as a special case of AggMo and analyze rates of convergence for quadratic objectives. Empirically, we find that AggMo is a suitable drop-in replacement for other momentum methods, and frequently delivers faster convergence with little to no tuning.", "keywords": "momentum;optimization;deep learning;neural networks", "primary_area": "", "supplementary_material": "", "author": "James Lucas;Shengyang Sun;Richard Zemel;Roger Grosse", "authorids": "jlucas@cs.toronto.edu;ssy@cs.toronto.edu;zemel@cs.toronto.edu;rgrosse@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlucas2018aggregated,\ntitle={Aggregated Momentum: Stability Through Passive Damping},\nauthor={James Lucas and Shengyang Sun and Richard Zemel and Roger Grosse},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=Syxt5oC5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;3", "wc_review": "112;129;141", "wc_reply_reviewers": "37;0;0", "wc_reply_authors": "896;249;125", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 127.33333333333333, 11.897712198383164 ], "wc_reply_reviewers_avg": [ 12.333333333333334, 17.441967269268172 ], "wc_reply_authors_avg": [ 423.3333333333333, 338.0378019623775 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3877232077315711794&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=Syxt5oC5YQ", "pdf": "https://openreview.net/pdf?id=Syxt5oC5YQ", "email": ";;;", "author_num": 4 }, { "title": "Variational Autoencoder with Arbitrary Conditioning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/659", "id": "SyxtJh0qYm", "author_site": "Oleg Ivanov, Mikhail Figurnov, Dmitry P. Vetrov", "tldr": "We propose an extension of conditional variational autoencoder that allows conditioning on an arbitrary subset of the features and sampling the remaining ones.", "abstract": "We propose a single neural probabilistic model based on variational autoencoder that can be conditioned on an arbitrary subset of observed features and then sample the remaining features in \"one shot\". The features may be both real-valued and categorical. Training of the model is performed by stochastic variational Bayes. 
The experimental evaluation on synthetic data, as well as feature imputation and image inpainting problems, shows the effectiveness of the proposed approach and diversity of the generated samples.", "keywords": "unsupervised learning;generative models;conditional variational autoencoder;variational autoencoder;missing features multiple imputation;inpainting", "primary_area": "", "supplementary_material": "", "author": "Oleg Ivanov;Michael Figurnov;Dmitry Vetrov", "authorids": "tigvarts@gmail.com;michael@figurnov.ru;vetrovd@yandex.ru", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nivanov2018variational,\ntitle={Variational Autoencoder with Arbitrary Conditioning},\nauthor={Oleg Ivanov and Michael Figurnov and Dmitry Vetrov},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxtJh0qYm},\n}", "github": "[![github](/images/github_icon.svg) tigvarts/ucm](https://github.com/tigvarts/ucm) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=SyxtJh0qYm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;3", "wc_review": "419;275;147", "wc_reply_reviewers": "339;0;0", "wc_reply_authors": "1067;719;36", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 280.3333333333333, 111.10755549866485 ], "wc_reply_reviewers_avg": [ 113.0, 159.80613254815975 ], "wc_reply_authors_avg": [ 607.3333333333334, 428.24629465867986 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 192, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10062724307854177274&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=SyxtJh0qYm", "pdf": "https://openreview.net/pdf?id=SyxtJh0qYm", "email": ";;", "author_num": 3 }, { "id": "SyxvSiCcFQ", "title": "Neural Network Cost Landscapes as Quantum States", "track": "main", "status": "Reject", "tldr": "We show that NN parameter and hyperparameter cost landscapes can be generated as quantum states using a single quantum circuit and that these can be used for training and meta-training.", "abstract": "Quantum computers promise significant advantages over classical computers for a number of different applications. We show that the complete loss function landscape of a neural network can be represented as the quantum state output by a quantum computer. We demonstrate this explicitly for a binary neural network and, further, show how a quantum computer can train the network by manipulating this state using a well-known algorithm known as quantum amplitude amplification. We further show that with minor adaptation, this method can also represent the meta-loss landscape of a number of neural network architectures simultaneously. We search this meta-loss landscape with the same method to simultaneously train and design a binary neural network. 
", "keywords": "quantum;neural networks;meta-learning", "primary_area": "", "supplementary_material": "", "author": "Abdulah Fawaz;Sebastien Piat;Paul Klein;Peter Mountney;Simone Severini", "authorids": "abdulah.fawaz.14@ucl.ac.uk;sebastien.piat@siemens-healthineers.com;klein.paul@siemens-healthineers.com;peter.mountney@siemens-healthineers.com;s.severini@ucl.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nfawaz2019neural,\ntitle={Neural Network Cost Landscapes as Quantum States},\nauthor={Abdulah Fawaz and Sebastien Piat and Paul Klein and Peter Mountney and Simone Severini},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxvSiCcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyxvSiCcFQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;5", "wc_review": "394;381;555", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1033;645;770", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 443.3333333333333, 79.13841601200323 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 816.0, 161.70549361931606 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Crhe7x1yHroJ:scholar.google.com/&scioq=Neural+Network+Cost+Landscapes+as+Quantum+States&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "SyxwW2A5Km", "title": "Learning Representations of Categorical Feature Combinations via Self-Attention", "track": "main", "status": "Reject", "tldr": "", "abstract": "Self-attention has been widely used to model the sequential data and achieved remarkable results in many applications. Although it can be used to model dependencies without regard to positions of sequences, self-attention is seldom applied to non-sequential data. In this work, we propose to learn representations of multi-field categorical data in prediction tasks via self-attention mechanism, where features are orderless but have intrinsic relations over different fields. In most current DNN based models, feature embeddings are simply concatenated for further processing by networks. Instead, by applying self-attention to transform the embeddings, we are able to relate features in different fields and automatically learn representations of their combinations, which are known as the factors of many prevailing linear models. To further improve the effect of feature combination mining, we modify the original self-attention structure by restricting the similarity weight to have at most k non-zero values, which additionally regularizes the model. We experimentally evaluate the effectiveness of our self-attention model on non-sequential data. Across two click through rate prediction benchmark datasets, i.e., Cretio and Avazu, our model with top-k restricted self-attention achieves the state-of-the-art performance. 
Compared with the vanilla MLP, the gain by adding self-attention is significantly larger than that by modifying the network structures, which most current works focus on.", "keywords": "Learning Representations;Feature Combinations;Self-Attention", "primary_area": "", "supplementary_material": "", "author": "Chen Xu;Chengzhen Fu;Peng Jiang;Wenwu Ou", "authorids": "chaos.xc@alibaba-inc.com;fuchengzhen@pku.edu.cn;jiangpeng.jp@alibaba-inc.com;wenwu.ou@alibaba-inc.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nxu2019learning,\ntitle={Learning Representations of Categorical Feature Combinations via Self-Attention},\nauthor={Chen Xu and Chengzhen Fu and Peng Jiang and Wenwu Ou},\nyear={2019},\nurl={https://openreview.net/forum?id=SyxwW2A5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=SyxwW2A5Km", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;4", "wc_review": "585;258;487", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 443.3333333333333, 137.02149060964447 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6296741431506023468&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Time-Agnostic Prediction: Predicting Predictable Video Frames", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/967", "id": "SyzVb3CcFX", "author_site": "Dinesh Jayaraman, Frederik Ebert, Alexei Efros, Sergey Levine", "tldr": "In visual prediction tasks, letting your predictive model choose which times to predict does two things: (i) improves prediction quality, and (ii) leads to semantically coherent \"bottleneck state\" predictions, which are useful for planning.", "abstract": "Prediction is arguably one of the most basic functions of an intelligent system. In general, the problem of predicting events in the future or between two waypoints is exceedingly difficult. However, most phenomena naturally pass through relatively predictable bottlenecks---while we cannot predict the precise trajectory of a robot arm between being at rest and holding an object up, we can be certain that it must have picked the object up. To exploit this, we decouple visual prediction from a rigid notion of time. While conventional approaches predict frames at regularly spaced temporal intervals, our time-agnostic predictors (TAP) are not tied to specific times so that they may instead discover predictable \"bottleneck\" frames no matter when they occur. We evaluate our approach for future and intermediate frame prediction across three robotic manipulation tasks. 
Our predictions are not only of higher visual quality, but also correspond to coherent semantic subgoals in temporally extended tasks.", "keywords": "visual prediction;subgoal generation;bottleneck states;time-agnostic", "primary_area": "", "supplementary_material": "", "author": "Dinesh Jayaraman;Frederik Ebert;Alexei Efros;Sergey Levine", "authorids": "dineshjayaraman@berkeley.edu;febert@berkeley.edu;efros@eecs.berkeley.edu;svlevine@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\njayaraman2018timeagnostic,\ntitle={Time-Agnostic Prediction: Predicting Predictable Video Frames},\nauthor={Dinesh Jayaraman and Frederik Ebert and Alexei Efros and Sergey Levine},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=SyzVb3CcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;3;4", "wc_review": "330;522;387", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "762;794;534", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 413.0, 80.51086883148137 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 696.6666666666666, 115.76220837945728 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5221092832811225749&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=SyzVb3CcFX", "pdf": "https://openreview.net/pdf?id=SyzVb3CcFX", "email": ";;;", "author_num": 4 }, { "id": "SyzjBiR9t7", "title": "MANIFOLDNET: A DEEP NEURAL NETWORK FOR MANIFOLD-VALUED DATA", "track": "main", "status": "Reject", "tldr": "", "abstract": "Developing deep neural networks (DNNs) for manifold-valued data sets\nhas gained much interest of late in the deep learning research\ncommunity. Examples of manifold-valued data include data from\nomnidirectional cameras on automobiles, drones etc., diffusion\nmagnetic resonance imaging, elastography and others. In this paper, we\npresent a novel theoretical framework for DNNs to cope with\nmanifold-valued data inputs. In doing this generalization, we draw\nparallels to the widely popular convolutional neural networks (CNNs).\nWe call our network the ManifoldNet.\n\nAs in vector spaces where convolutions are equivalent to computing the\nweighted mean of functions, an analogous definition for\nmanifold-valued data can be constructed involving the computation of\nthe weighted Fr\\'{e}chet Mean (wFM). To this end, we present a\nprovably convergent recursive computation of the wFM of the given\ndata, where the weights makeup the convolution mask, to be\nlearned. Further, we prove that the proposed wFM layer achieves a\ncontraction mapping and hence the ManifoldNet does not need the\nadditional non-linear ReLU unit used in standard CNNs. Operations such\nas pooling in traditional CNN are no longer necessary in this setting\nsince wFM is already a pooling type operation. 
Analogous to the\nequivariance of convolution in Euclidean space to translations, we\nprove that the wFM is equivariant to the action of the group of\nisometries admitted by the Riemannian manifold on which the data\nreside. This equivariance property facilitates weight sharing within\nthe network. We present experiments, using the ManifoldNet framework,\nto achieve video classification and image reconstruction using an\nauto-encoder+decoder setting. Experimental results demonstrate the\nefficacy of ManifoldNet in the context of classification and\nreconstruction accuracy.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Rudrasis Chakraborty;Jose Bouza;Jonathan Manton;Baba C. Vemuri", "authorids": "rudrasischa@gmail.com;josebouza@ufl.edu;jonathan.manton@ieee.org;baba.vemuri@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchakraborty2019manifoldnet,\ntitle={{MANIFOLDNET}: A {DEEP} {NEURAL} {NETWORK} {FOR} {MANIFOLD}-{VALUED} {DATA}},\nauthor={Rudrasis Chakraborty and Jose Bouza and Jonathan Manton and Baba C. Vemuri},\nyear={2019},\nurl={https://openreview.net/forum?id=SyzjBiR9t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=SyzjBiR9t7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;3", "wc_review": "645;455;248", "wc_reply_reviewers": "542;0;116", "wc_reply_authors": "2254;752;1029", "reply_reviewers": "3;0;1", "reply_authors": "5;1;2", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 449.3333333333333, 162.1240951315449 ], "wc_reply_reviewers_avg": [ 219.33333333333334, 233.02265030583519 ], "wc_reply_authors_avg": [ 1345.0, 652.632106677772 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.247219128924647 ], "reply_authors_avg": [ 2.6666666666666665, 1.699673171197595 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FNJXe41-2HUJ:scholar.google.com/&scioq=MANIFOLDNET:+A+DEEP+NEURAL+NETWORK+FOR+MANIFOLD-VALUED+DATA&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "Syzn9i05Ym", "title": "Learning Neural Random Fields with Inclusive Auxiliary Generators", "track": "main", "status": "Reject", "tldr": "We develop a new approach to learning neural random fields and show that the new approach obtains state-of-the-art sample generation quality and achieves strong semi-supervised learning results on par with state-of-the-art deep generative models.", "abstract": "Neural random fields (NRFs), which are defined by using neural networks to implement potential functions in undirected models, provide an interesting family of model spaces for machine learning. In this paper we develop a new approach to learning NRFs with inclusive-divergence minimized auxiliary generator - the inclusive-NRF approach, for continuous data (e.g. images), with solid theoretical examination on exploiting gradient information in model sampling. We show that inclusive-NRFs can be flexibly used in unsupervised/supervised image generation and semi-supervised classification, and empirically to the best of our knowledge, represent the best-performed random fields in these tasks. 
Particularly, inclusive-NRFs achieve state-of-the-art sample generation quality on CIFAR-10 in both unsupervised and supervised settings. Semi-supervised inclusive-NRFs show strong classification results on par with state-of-the-art generative model based semi-supervised learning methods, and simultaneously achieve superior generation, on the widely benchmarked datasets - MNIST, SVHN and CIFAR-10.", "keywords": "Neural random fields;Deep generative models;Unsupervised learning;Semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Yunfu Song;Zhijian Ou", "authorids": "769414284@qq.com;ozjthu@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsong2019learning,\ntitle={Learning Neural Random Fields with Inclusive Auxiliary Generators},\nauthor={Yunfu Song and Zhijian Ou},\nyear={2019},\nurl={https://openreview.net/forum?id=Syzn9i05Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=Syzn9i05Ym", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;2;3", "wc_review": "421;357;231", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "639;492;174", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 336.3333333333333, 78.93175674101155 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 435.0, 194.06699874012583 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=485034417353725989&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "SyzrLjA5FQ", "title": "Selective Self-Training for semi-supervised Learning", "track": "main", "status": "Reject", "tldr": "Our proposed algorithm does not use all of the unlabeled data for the training, and it rather uses them selectively.", "abstract": "Semi-supervised learning (SSL) is a study that efficiently exploits a large amount of unlabeled data to improve performance in conditions of limited labeled data. Most of the conventional SSL methods assume that the classes of unlabeled data are included in the set of classes of labeled data. In addition, these methods do not sort out useless unlabeled samples and use all the unlabeled data for learning, which is not suitable for realistic situations. In this paper, we propose an SSL method called selective self-training (SST), which selectively decides whether to include each unlabeled sample in the training process. It is also designed to be applied to a more real situation where classes of unlabeled data are different from the ones of the labeled data. For the conventional SSL problems which deal with data where both the labeled and unlabeled samples share the same class categories, the proposed method not only performs comparable to other conventional SSL algorithms but also can be combined with other SSL algorithms. 
While the conventional methods cannot be applied to the new SSL problems where the separated data do not share the classes, our method does not show any performance degradation even if the classes of unlabeled data are different from those of the labeled data.", "keywords": "deep learning;image recognition;semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Jisoo Jeong;Seungeui Lee;Nojun Kwak", "authorids": "soo3553@snu.ac.kr;dehlix@snu.ac.kr;nojunk@snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\njeong2019selective,\ntitle={Selective Self-Training for semi-supervised Learning},\nauthor={Jisoo Jeong and Seungeui Lee and Nojun Kwak},\nyear={2019},\nurl={https://openreview.net/forum?id=SyzrLjA5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=SyzrLjA5FQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "wc_review": "1037;330;308", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1362;925;648", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 558.3333333333334, 338.5875891930411 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 978.3333333333334, 293.9187340443311 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14432309269641998427&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r14Aas09Y7", "title": "COCO-GAN: Conditional Coordinate Generative Adversarial Network", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent advancements on Generative Adversarial Network (GAN) have inspired a wide range of works that generate synthetic images. However, the current processes have to generate an entire image at once, and therefore resolutions are limited by memory or computational constraints. In this work, we propose COnditional COordinate GAN (COCO-GAN), which generates a specific patch of an image conditioned on a spatial position rather than the entire image at a time. The generated patches are later combined together to form a globally coherent full-image. With this process, we show that the generated image can achieve competitive quality to state-of-the-arts and the generated patches are locally smooth between consecutive neighbors. One direct implication of the COCO-GAN is that it can be applied onto any coordinate systems including the cylindrical systems which makes it feasible for generating panorama images. The fact that the patch generation process is independent to each other inspires a wide range of new applications: firstly, \"Patch-Inspired Image Generation\" enables us to generate the entire image based on a single patch. Secondly, \"Partial-Scene Generation\" allows us to generate images within a customized target region. 
Finally, thanks to COCO-GAN's patch generation and massive parallelism, which enables combining patches for generating a full-image with higher resolution than state-of-the-arts.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chieh Hubert Lin;Chia-Che Chang;Yu-Sheng Chen;Da-Cheng Juan;Wei Wei;Hwann-Tzong Chen", "authorids": "hubert052702@gmail.com;chang810249@gmail.com;nothinglo@cmlab.csie.ntu.edu.tw;dacheng@google.com;wewei@google.com;htchen@cs.nthu.edu.tw", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nlin2019cocogan,\ntitle={{COCO}-{GAN}: Conditional Coordinate Generative Adversarial Network},\nauthor={Chieh Hubert Lin and Chia-Che Chang and Yu-Sheng Chen and Da-Cheng Juan and Wei Wei and Hwann-Tzong Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=r14Aas09Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r14Aas09Y7", "pdf_size": 0, "rating": "4;6;6", "confidence": "5;4;4", "wc_review": "415;246;590", "wc_reply_reviewers": "0;129;922", "wc_reply_authors": "1605;1031;3781", "reply_reviewers": "0;2;4", "reply_authors": "4;4;8", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 417.0, 140.444532348777 ], "wc_reply_reviewers_avg": [ 350.3333333333333, 407.64554319762766 ], "wc_reply_authors_avg": [ 2139.0, 1184.4807582509168 ], "reply_reviewers_avg": [ 2.0, 1.632993161855452 ], "reply_authors_avg": [ 5.333333333333333, 1.8856180831641267 ], "replies_avg": [ 31, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10292790360513512738&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "A Closer Look at Deep Learning Heuristics: Learning rate restarts, Warmup and Distillation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/711", "id": "r14EOsCqKX", "author_site": "Akhilesh Deepak Gotmare, Nitish Shirish Keskar, Caiming Xiong, richard socher", "tldr": "We use empirical tools of mode connectivity and SVCCA to investigate neural network training heuristics of learning rate restarts, warmup and knowledge distillation.", "abstract": "The convergence rate and final performance of common deep learning models have significantly benefited from recently proposed heuristics such as learning rate schedules, knowledge distillation, skip connections and normalization layers. In the absence of theoretical underpinnings, controlled experiments aimed at explaining the efficacy of these strategies can aid our understanding of deep learning landscapes and the training dynamics. Existing approaches for empirical analysis rely on tools of linear interpolation and visualizations with dimensionality reduction, each with their limitations. Instead, we revisit the empirical analysis of heuristics through the lens of recently proposed methods for loss surface and representation analysis, viz. mode connectivity and canonical correlation analysis (CCA), and hypothesize reasons why the heuristics succeed. In particular, we explore knowledge distillation and learning rate heuristics of (cosine) restarts and warmup using mode connectivity and CCA. 
Our empirical analysis suggests that: (a) the reasons often quoted for the success of cosine annealing are not evidenced in practice; (b) that the effect of learning rate warmup is to prevent the deeper layers from creating training instability; and (c) that the latent knowledge shared by the teacher is primarily disbursed in the deeper layers.", "keywords": "deep learning heuristics;learning rate restarts;learning rate warmup;knowledge distillation;mode connectivity;SVCCA", "primary_area": "", "supplementary_material": "", "author": "Akhilesh Gotmare;Nitish Shirish Keskar;Caiming Xiong;Richard Socher", "authorids": "akhilesh.gotmare@epfl.ch;nkeskar@salesforce.com;cxiong@salesforce.com;rsocher@salesforce.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ngotmare2018a,\ntitle={A Closer Look at Deep Learning Heuristics: Learning rate restarts, Warmup and Distillation},\nauthor={Akhilesh Gotmare and Nitish Shirish Keskar and Caiming Xiong and Richard Socher},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r14EOsCqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;5", "wc_review": "285;351;639", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "187;212;206", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 425.0, 153.7010084547268 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 201.66666666666666, 10.656244908763854 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.7559289460184544, "gs_citation": 368, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8375178060138487801&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=r14EOsCqKX", "pdf": "https://openreview.net/pdf?id=r14EOsCqKX", "email": ";;;", "author_num": 4 }, { "id": "r1E0OsA9tX", "title": "Learning From the Experience of Others: Approximate Empirical Bayes in Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning deep neural networks could be understood as the combination of representation learning and learning halfspaces. While most previous work aims to diversify representation learning by data augmentations and regularizations, we explore the opposite direction through the lens of empirical Bayes method. Specifically, we propose a matrix-variate normal prior whose covariance matrix has a Kronecker product structure to capture the correlations in learning different neurons through backpropagation. The prior encourages neurons to learn from the experience of others, hence it provides an effective regularization when training large networks on small datasets. To optimize the model, we design an efficient block coordinate descent algorithm with analytic solutions. 
Empirically, we show that the proposed method helps the network converge to better local optima that also generalize better, and we verify the effectiveness of the approach on both multiclass classification and multitask regression problems with various network structures. ", "keywords": "Empirical Bayes;Bayesian Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Han Zhao;Yao-Hung Hubert Tsai;Ruslan Salakhutdinov;Geoff Gordon", "authorids": "han.zhao@cs.cmu.edu;yaohungt@cs.cmu.edu;rsalakhu@cs.cmu.edu;ggordon@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhao2019learning,\ntitle={Learning From the Experience of Others: Approximate Empirical Bayes in Neural Networks},\nauthor={Han Zhao and Yao-Hung Hubert Tsai and Ruslan Salakhutdinov and Geoff Gordon},\nyear={2019},\nurl={https://openreview.net/forum?id=r1E0OsA9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1E0OsA9tX", "pdf_size": 0, "rating": "3;6;7", "confidence": "5;4;4", "wc_review": "788;501;250", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1292;456;66", "reply_reviewers": "0;0;0", "reply_authors": "4;1;1", "rating_avg": [ 5.333333333333333, 1.699673171197595 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 513.0, 219.8014255337455 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 604.6666666666666, 511.43284559710827 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9707253433941508, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9387417755122137277&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Self-Monitoring Navigation Agent via Auxiliary Progress Estimation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/821", "id": "r1GAsjC5Fm", "author_site": "Chih-Yao Ma, jiasen lu, Zuxuan Wu, Ghassan AlRegib, Zsolt Kira, richard socher, Caiming Xiong", "tldr": "We propose a self-monitoring agent for the Vision-and-Language Navigation task.", "abstract": "The Vision-and-Language Navigation (VLN) task entails an agent following navigational instruction in photo-realistic unknown environments. This challenging task demands that the agent be aware of which instruction was completed, which instruction is needed next, which way to go, and its navigation progress towards the goal. In this paper, we introduce a self-monitoring agent with two complementary components: (1) visual-textual co-grounding module to locate the instruction completed in the past, the instruction required for the next action, and the next moving direction from surrounding images and (2) progress monitor to ensure the grounded instruction correctly reflects the navigation progress. We test our self-monitoring agent on a standard benchmark and analyze our proposed approach through a series of ablation studies that elucidate the contributions of the primary components. Using our proposed method, we set the new state of the art by a significant margin (8% absolute increase in success rate on the unseen test set). 
Code is available at https://github.com/chihyaoma/selfmonitoring-agent.", "keywords": "visual grounding;textual grounding;instruction-following;navigation agent", "primary_area": "", "supplementary_material": "", "author": "Chih-Yao Ma;Jiasen Lu;Zuxuan Wu;Ghassan AlRegib;Zsolt Kira;Richard Socher;Caiming Xiong", "authorids": "cyma@gatech.edu;jiasenlu@gatech.edu;zxwu@cs.umd.edu;alregib@gatech.edu;zkira@gatech.edu;rsocher@salesforce.com;cxiong@salesforce.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nma2019selfmonitoring,\ntitle={Self-Monitoring Navigation Agent via Auxiliary Progress Estimation},\nauthor={Chih-Yao Ma and Jiasen Lu and Zuxuan Wu and Ghassan AlRegib and Zsolt Kira and Richard Socher and Caiming Xiong},\nyear={2019},\nurl={https://openreview.net/forum?id=r1GAsjC5Fm},\n}", "github": "[![github](/images/github_icon.svg) chihyaoma/selfmonitoring-agent](https://github.com/chihyaoma/selfmonitoring-agent) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=r1GAsjC5Fm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;5", "wc_review": "416;788;648", "wc_reply_reviewers": "113;114;0", "wc_reply_authors": "1083;967;493", "reply_reviewers": "1;1;0", "reply_authors": "3;3;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 617.3333333333334, 153.40867714123024 ], "wc_reply_reviewers_avg": [ 75.66666666666667, 53.50597059103674 ], "wc_reply_authors_avg": [ 847.6666666666666, 255.21929568292617 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 32, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 318, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5431855784757864150&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=r1GAsjC5Fm", "pdf": "https://openreview.net/pdf?id=r1GAsjC5Fm", "email": ";;;;;;", "author_num": 7 }, { "id": "r1GB5jA5tm", "title": "Adversarial Sampling for Active Learning", "track": "main", "status": "Reject", "tldr": "ASAL is a pool based active learning method that generates high entropy samples and retrieves matching samples from the pool in sub-linear time.", "abstract": "This paper proposes ASAL, a new pool based active learning method that generates high entropy samples. Instead of directly annotating the synthetic samples, ASAL searches similar samples from the pool and includes them for training. Hence, the quality of new samples is high and annotations are reliable. ASAL is particularly suitable for large data sets because it achieves a better run-time complexity (sub-linear) for sample selection than traditional uncertainty sampling (linear). We present a comprehensive set of experiments on two data sets and show that ASAL outperforms similar methods and clearly exceeds the established baseline (random sampling). In the discussion section we analyze in which situations ASAL performs best and why it is sometimes hard to outperform random sample selection. 
To the best of our knowledge this is the first adversarial active learning technique that is applied for multiple class problems using deep convolutional classifiers and demonstrates superior performance than random sample selection.", "keywords": "active learning;adversarial training;GAN", "primary_area": "", "supplementary_material": "", "author": "Christoph Mayer;Radu Timofte", "authorids": "chmayer@vision.ee.ethz.ch;radu.timofte@vision.ee.ethz.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nmayer2019adversarial,\ntitle={Adversarial Sampling for Active Learning},\nauthor={Christoph Mayer and Radu Timofte},\nyear={2019},\nurl={https://openreview.net/forum?id=r1GB5jA5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1GB5jA5tm", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;4;2", "wc_review": "210;848;342", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "391;1595;742", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 466.6666666666667, 274.9755544690392 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 909.3333333333334, 505.5718434494636 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 153, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=838529958121868178&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "r1GaAjRcF7", "title": "Differentiable Greedy Networks", "track": "main", "status": "Withdraw", "tldr": "We propose a subset selection algorithm that is trainable with gradient based methods yet achieves near optimal performance via submodular optimization.", "abstract": "Optimal selection of a subset of items from a given set is a hard problem that requires combinatorial optimization. In this paper, we propose a subset selection algorithm that is trainable with gradient based methods yet achieves near optimal performance via submodular optimization. We focus on the task of identifying a relevant set of sentences for claim verification in the context of the FEVER task. Conventional methods for this task look at sentences on their individual merit and thus do not optimize the informativeness of sentences as a set. We show that our proposed method which builds on the idea of unfolding a greedy algorithm into a computational graph allows both interpretability and gradient based training. 
The proposed differentiable greedy network (DGN) outperforms discrete optimization algorithms as well as other baseline methods in terms of precision and recall.", "keywords": "submodular optimization;fact verification;differentiable module;deep unfolding", "primary_area": "", "supplementary_material": "", "author": "Thomas Powers;Rasool Fakoor;Siamak Shakeri;Abhinav Sethy;Amanjit Kainth;Abdel-rahman Mohamed;Ruhi Sarikaya", "authorids": "tcpowers@uw.edu;rasool.fakoor@mavs.uta.edu;siamaks@amazon.com;sethya@amazon.com;amanjitsingh.kainth@mail.utoronto.ca;asamir@cs.toronto.edu;rsarikay@amazon.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=r1GaAjRcF7", "pdf_size": 0, "rating": "2;4;5", "confidence": "5;4;4", "wc_review": "456;528;560", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "565;733;551", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.6666666666666665, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 514.6666666666666, 43.4920171474669 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 616.3333333333334, 82.6935440177903 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.9449111825230683, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2906033584188872416&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Kernel Change-point Detection with Auxiliary Deep Generative Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/693", "id": "r1GbfhRqF7", "author_site": "Wei-Cheng Chang, Chun-Liang Li, Yiming Yang, Barnab\u00e1s P\u00f3czos", "tldr": "In this paper, we propose KL-CPD, a novel kernel learning framework for time series CPD that optimizes a lower bound of test power via an auxiliary generative model as a surrogate to the abnormal distribution. ", "abstract": "Detecting the emergence of abrupt property changes in time series is a challenging problem. Kernel two-sample test has been studied for this task which makes fewer assumptions on the distributions than traditional parametric approaches. However, selecting kernels is non-trivial in practice. Although kernel selection for the two-sample test has been studied, the insufficient samples in change point detection problem hinder the success of those developed kernel selection algorithms. In this paper, we propose KL-CPD, a novel kernel learning framework for time series CPD that optimizes a lower bound of test power via an auxiliary generative model. With deep kernel parameterization, KL-CPD endows kernel two-sample test with the data-driven kernel to detect different types of change-points in real-world applications. 
The proposed approach significantly outperformed other state-of-the-art methods in our comparative evaluation of benchmark datasets and simulation studies.", "keywords": "deep kernel learning;generative models;kernel two-sample test;time series change-point detection", "primary_area": "", "supplementary_material": "", "author": "Wei-Cheng Chang;Chun-Liang Li;Yiming Yang;Barnab\u00e1s P\u00f3czos", "authorids": "wchang2@cs.cmu.edu;chunlial@cs.cmu.edu;yiming@cs.cmu.edu;bapoczos@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchang2018kernel,\ntitle={Kernel Change-point Detection with Auxiliary Deep Generative Models},\nauthor={Wei-Cheng Chang and Chun-Liang Li and Yiming Yang and Barnab\u00e1s P\u00f3czos},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1GbfhRqF7},\n}", "github": "[![github](/images/github_icon.svg) OctoberChang/klcpd_code](https://github.com/OctoberChang/klcpd_code) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=r1GbfhRqF7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;3", "wc_review": "515;114;83", "wc_reply_reviewers": "17;0;0", "wc_reply_authors": "501;28;79", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 237.33333333333334, 196.74744103940856 ], "wc_reply_reviewers_avg": [ 5.666666666666667, 8.013876853447538 ], "wc_reply_authors_avg": [ 202.66666666666666, 211.9785104412447 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 100, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15362141737124631231&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=r1GbfhRqF7", "pdf": "https://openreview.net/pdf?id=r1GbfhRqF7", "email": ";;;", "author_num": 4 }, { "id": "r1GgDj0cKX", "title": "PRUNING IN TRAINING: LEARNING AND RANKING SPARSE CONNECTIONS IN DEEP CONVOLUTIONAL NETWORKS", "track": "main", "status": "Reject", "tldr": "we propose an algorithm of learning to prune network by enforcing structure sparsity penalties", "abstract": "This paper proposes a Pruning in Training (PiT) framework of learning to reduce the parameter size of networks. Different from existing works, our PiT framework employs the sparse penalties to train networks and thus help rank the importance of weights and filters. Our PiT algorithms can directly prune the network without any fine-tuning. The pruned networks can still achieve comparable performance to the original networks. In particular, we introduce the (Group) Lasso-type Penalty (L-P /GL-P), and (Group) Split LBI Penalty (S-P / GS-P) to regularize the networks, and a pruning strategy proposed is used in help prune the network. We conduct the extensive experiments on MNIST, Cifar-10, and miniImageNet. The results validate the efficacy of our proposed methods. 
Remarkably, on MNIST dataset, our PiT framework can save 17.5% parameter size of LeNet-5, which achieves the 98.47% recognition accuracy.", "keywords": "Split LBI;sparse penalty;network pruning;feature selection", "primary_area": "", "supplementary_material": "", "author": "Yanwei Fu;Shun Zhang;Donghao Li;Xinwei Sun;Xiangyang Xue;Yuan Yao", "authorids": "yanweifu@fudan.edu.cn;15300180012@fudan.edu.cn;15307100013@fudan.edu.cn;sxwxiaoxiaohehe@pku.edu.cn;xyxue@fudan.edu.cn;yuany@ust.hk", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nfu2019pruning,\ntitle={{PRUNING} {IN} {TRAINING}: {LEARNING} {AND} {RANKING} {SPARSE} {CONNECTIONS} {IN} {DEEP} {CONVOLUTIONAL} {NETWORKS}},\nauthor={Yanwei Fu and Shun Zhang and Donghao Li and Xinwei Sun and Xiangyang Xue and Yuan Yao},\nyear={2019},\nurl={https://openreview.net/forum?id=r1GgDj0cKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1GgDj0cKX", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;4", "wc_review": "379;376;66", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 273.6666666666667, 146.84761565044977 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RIi5zOlA_LQJ:scholar.google.com/&scioq=PRUNING+IN+TRAINING:+LEARNING+AND+RANKING+SPARSE+CONNECTIONS+IN+DEEP+CONVOLUTIONAL+NETWORKS&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "r1GkMhAqYm", "title": "CoDraw: Collaborative Drawing as a Testbed for Grounded Goal-driven Communication", "track": "main", "status": "Reject", "tldr": "We introduce a dataset, models, and training + evaluation protocols for a collaborative drawing task that allows studying goal-driven and perceptually + actionably grounded language generation and understanding. ", "abstract": "In this work, we propose a goal-driven collaborative task that contains language, vision, and action in a virtual environment as its core components. Specifically, we develop a Collaborative image-Drawing game between two agents, called CoDraw. Our game is grounded in a virtual world that contains movable clip art objects. The game involves two players: a Teller and a Drawer. The Teller sees an abstract scene containing multiple clip art pieces in a semantically meaningful configuration, while the Drawer tries to reconstruct the scene on an empty canvas using available clip art pieces. The two players communicate via two-way communication using natural language. We collect the CoDraw dataset of ~10K dialogs consisting of ~138K messages exchanged between human agents. We define protocols and metrics to evaluate the effectiveness of learned agents on this testbed, highlighting the need for a novel \"crosstalk\" condition which pairs agents trained independently on disjoint subsets of the training data for evaluation. 
We present models for our task, including simple but effective baselines and neural network approaches trained using a combination of imitation learning and goal-driven training. All models are benchmarked using both fully automated evaluation and by playing the game with live human agents.", "keywords": "CoDraw;collaborative drawing;grounded language", "primary_area": "", "supplementary_material": "", "author": "Nikita Kitaev;Jin-Hwa Kim;Xinlei Chen;Marcus Rohrbach;Yuandong Tian;Dhruv Batra;Devi Parikh", "authorids": "kitaev@cs.berkeley.edu;jnhwkim@gmail.com;xinleic@fb.com;maroffm@gmail.com;yuandong@fb.com;dbatra@gatech.edu;parikh@gatech.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nkitaev2019codraw,\ntitle={CoDraw: Collaborative Drawing as a Testbed for Grounded Goal-driven Communication},\nauthor={Nikita Kitaev and Jin-Hwa Kim and Xinlei Chen and Marcus Rohrbach and Yuandong Tian and Dhruv Batra and Devi Parikh},\nyear={2019},\nurl={https://openreview.net/forum?id=r1GkMhAqYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1GkMhAqYm", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;4", "wc_review": "455;724;362", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "675;146;122", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 513.6666666666666, 153.49773794931164 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 314.3333333333333, 255.21798961323674 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 91, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16607049592559109281&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4 }, { "id": "r1Gsk3R9Fm", "title": "Shallow Learning For Deep Networks", "track": "main", "status": "Reject", "tldr": "We build CNNs layer by layer without end to end training and show for the first time that this kind of approach can scale to Imagenet, while having multiple favorable properties.", "abstract": "Shallow supervised 1-hidden layer neural networks have a number of favorable properties that make them easier to interpret, analyze, and optimize than their deep counterparts, but lack their representational power. Here we use 1-hidden layer learning problems to sequentially build deep networks layer by layer, which can inherit properties from shallow networks. Contrary to previous approaches using shallow networks, we focus on problems where deep learning is reported as critical for success. We thus study CNNs on image recognition tasks using the large-scale Imagenet dataset and the CIFAR-10 dataset. Using a simple set of ideas for architecture and training we find that solving sequential 1-hidden-layer auxiliary problems leads to a CNN that exceeds AlexNet performance on ImageNet. Extending our training methodology to construct individual layers by solving 2-and-3-hidden layer auxiliary problems, we obtain an 11-layer network that exceeds VGG-11 on ImageNet obtaining 89.8% top-5 single crop. To our knowledge, this is the first competitive alternative to end-to-end training of CNNs that can scale to ImageNet. 
We conduct a wide range of experiments to study the properties this induces on the intermediate layers.", "keywords": "CNN;greedy learning", "primary_area": "", "supplementary_material": "", "author": "Eugene Belilovsky;Michael Eickenberg;Edouard Oyallon", "authorids": "belilove@iro.umontreal.ca;michael.eickenberg@berkeley.edu;edouard.oyallon@centralesupelec.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbelilovsky2019shallow,\ntitle={Shallow Learning For Deep Networks},\nauthor={Eugene Belilovsky and Michael Eickenberg and Edouard Oyallon},\nyear={2019},\nurl={https://openreview.net/forum?id=r1Gsk3R9Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1Gsk3R9Fm", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "wc_review": "239;410;674", "wc_reply_reviewers": "48;0;171", "wc_reply_authors": "1353;1471;1493", "reply_reviewers": "1;0;1", "reply_authors": "4;3;4", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 441.0, 178.93574265640726 ], "wc_reply_reviewers_avg": [ 73.0, 72.01388754955533 ], "wc_reply_authors_avg": [ 1439.0, 61.47086030524273 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "r1MSBjA9Ym", "title": "Collapse of deep and narrow neural nets", "track": "main", "status": "Reject", "tldr": "Deep and narrow neural networks will converge to erroneous mean or median states of the target function depending on the loss with high probability.", "abstract": "Recent theoretical work has demonstrated that deep neural networks have superior performance over shallow networks, but their training is more difficult, e.g., they suffer from the vanishing gradient problem. This problem can be typically resolved by the rectified linear unit (ReLU) activation. However, here we show that even for such activation, deep and narrow neural networks (NNs) will converge to erroneous mean or median states of the target function depending on the loss with high probability. Deep and narrow NNs are encountered in solving partial differential equations with high-order derivatives. We demonstrate this collapse of such NNs both numerically and theoretically, and provide estimates of the probability of collapse. We also construct a diagram of a safe region for designing NNs that avoid the collapse to erroneous states. Finally, we examine different ways of initialization and normalization that may avoid the collapse problem. 
Asymmetric initializations may reduce the probability of collapse but do not totally eliminate it.", "keywords": "neural networks;deep and narrow;ReLU;collapse", "primary_area": "", "supplementary_material": "", "author": "Lu Lu;Yanhui Su;George Em Karniadakis", "authorids": "lu_lu_1@brown.edu;suyh@fzu.edu.cn;george_karniadakis@brown.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlu2019collapse,\ntitle={Collapse of deep and narrow neural nets},\nauthor={Lu Lu and Yanhui Su and George Em Karniadakis},\nyear={2019},\nurl={https://openreview.net/forum?id=r1MSBjA9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1MSBjA9Ym", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;4;5", "wc_review": "408;318;237", "wc_reply_reviewers": "226;0;0", "wc_reply_authors": "1425;865;309", "reply_reviewers": "1;0;0", "reply_authors": "3;2;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 321.0, 69.8426803609369 ], "wc_reply_reviewers_avg": [ 75.33333333333333, 106.53742169877317 ], "wc_reply_authors_avg": [ 866.3333333333334, 455.6060676603077 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.7559289460184544, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6682295866241726109&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5 }, { "id": "r1MVuoCctX", "title": "MAJOR-MINOR LSTMS FOR WORD-LEVEL LANGUAGE MODEL", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "As a widely-accepted evaluation criterion, complexity has attracted more and more attention in the design of language models. The parameter count is a proxy for complexity, which is often reported and compared in research papers. In general, more parameters means better model performance, but higher complexity. Therefore, reconciling the contradiction between the complexity and the model performance is necessary. In this paper, we propose a simple method to make use of model parameters more effectively, so that the LSTM-based language models can reach better results without the cost of increasing parameters. The method constructs another small-scale LSTM with a part of parameters originally belonging to the vanilla LSTM in each layer, whose output can assist the next layer in processing the output of the vanilla LSTM. We name these two LSTMs Major Minor LSTMs. 
In experiments, we demonstrate the language model with Major Minor LSTMs surpasses the existing state-of-the-art model on Penn Treebank and WikiText-2 with fewer parameters.", "keywords": "Language model;LSTM;Deep Learning;NLP", "primary_area": "", "supplementary_material": "", "author": "Kai Shuang;Rui Li;Mengyu Gu;Qianqian Yang;Jonathan;Sen Su", "authorids": "shuangk@bupt.edu.cn;lirui@bupt.edu.cn;pattygu0622@bupt.edu.cn;echo_yang@bupt.edu.cn;jonathan.loo@uwl.ac.uk;susen@bupt.edu.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1MVuoCctX", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;5", "wc_review": "237;161;418", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "204;53;53", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 272.0, 107.79919603905526 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 103.33333333333333, 71.18208263944578 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3qFVwK_UuxIJ:scholar.google.com/&scioq=MAJOR-MINOR+LSTMS+FOR+WORD-LEVEL+LANGUAGE+MODEL&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "r1MmH30cY7", "title": "NA", "track": "main", "status": "Withdraw", "tldr": "NA", "abstract": "NA", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "NA", "authorids": "na@na.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1MmH30cY7", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;4;4", "wc_review": "344;449;378", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "77;104;81", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 390.3333333333333, 43.74420596553966 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 87.33333333333333, 11.897712198383164 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "r1MxciCcKm", "title": "Connecting the Dots Between MLE and RL for Sequence Generation", "track": "main", "status": "Reject", "tldr": "A unified perspective of various learning algorithms for sequence generation, such as MLE, RL, RAML, data noising, etc.", "abstract": "Sequence generation models such as recurrent networks can be trained with a diverse set of learning algorithms. For example, maximum likelihood learning is simple and efficient, yet suffers from the exposure bias problem. Reinforcement learning like policy gradient addresses the problem but can have prohibitively poor exploration efficiency. 
A variety of other algorithms such as RAML, SPG, and data noising, have also been developed in different perspectives. This paper establishes a formal connection between these algorithms. We present a generalized entropy regularized policy optimization formulation, and show that the apparently divergent algorithms can all be reformulated as special instances of the framework, with the only difference being the configurations of reward function and a couple of hyperparameters. The unified interpretation offers a systematic view of the varying properties of exploration and learning efficiency. Besides, based on the framework, we present a new algorithm that dynamically interpolates among the existing algorithms for improved learning. Experiments on machine translation and text summarization demonstrate the superiority of the proposed algorithm.", "keywords": "sequence generation;maximum likelihood learning;reinforcement learning;policy optimization;text generation;reward augmented maximum likelihood;exposure bias", "primary_area": "", "supplementary_material": "", "author": "Bowen Tan*;Zhiting Hu*;Zichao Yang;Ruslan Salakhutdinov;Eric P. Xing", "authorids": "tanbowen@sjtu.edu.cn;zhitinghu@gmail.com;yangtze2301@gmail.com;rsalakhu@cs.cmu.edu;epxing@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntan*2019connecting,\ntitle={Connecting the Dots Between {MLE} and {RL} for Sequence Generation},\nauthor={Bowen Tan* and Zhiting Hu* and Zichao Yang and Ruslan Salakhutdinov and Eric P. Xing},\nyear={2019},\nurl={https://openreview.net/forum?id=r1MxciCcKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1MxciCcKm", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;3", "wc_review": "455;926;123", "wc_reply_reviewers": "80;0;0", "wc_reply_authors": "503;894;166", "reply_reviewers": "1;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 501.3333333333333, 329.45645471830653 ], "wc_reply_reviewers_avg": [ 26.666666666666668, 37.71236166328253 ], "wc_reply_authors_avg": [ 521.0, 297.4771699923654 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17496310071486953976&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "title": "Unsupervised Learning via Meta-Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/901", "id": "r1My6sR9tX", "author_site": "Kyle Hsu, Sergey Levine, Chelsea Finn", "tldr": "An unsupervised learning method that uses meta-learning to enable efficient learning of downstream image classification tasks, outperforming state-of-the-art methods.", "abstract": "A central goal of unsupervised learning is to acquire representations from unlabeled data or experience that can be used for more effective learning of downstream tasks from modest amounts of labeled data. Many prior unsupervised learning works aim to do so by developing proxy objectives based on reconstruction, disentanglement, prediction, and other metrics. 
Instead, we develop an unsupervised meta-learning method that explicitly optimizes for the ability to learn a variety of tasks from small amounts of data. To do so, we construct tasks from unlabeled data in an automatic way and run meta-learning over the constructed tasks. Surprisingly, we find that, when integrated with meta-learning, relatively simple task construction mechanisms, such as clustering embeddings, lead to good performance on a variety of downstream, human-specified tasks. Our experiments across four image datasets indicate that our unsupervised meta-learning approach acquires a learning algorithm without any labeled data that is applicable to a wide range of downstream classification tasks, improving upon the embedding learned by four prior unsupervised learning methods.", "keywords": "unsupervised learning;meta-learning", "primary_area": "", "supplementary_material": "", "author": "Kyle Hsu;Sergey Levine;Chelsea Finn", "authorids": "kyle.hsu@mail.utoronto.ca;svlevine@eecs.berkeley.edu;cbfinn@eecs.berkeley.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhsu2018unsupervised,\ntitle={Unsupervised Learning via Meta-Learning},\nauthor={Kyle Hsu and Sergey Levine and Chelsea Finn},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1My6sR9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer5;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7;8", "confidence": "3;3;4;4", "wc_review": "271;211;298;372", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "729;343;70;680", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "rating_avg": [ 6.75, 0.82915619758885 ], "confidence_avg": [ 3.5, 0.5 ], "wc_review_avg": [ 288.0, 57.82300580218915 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 455.5, 267.61399440238546 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9045340337332909, "gs_citation": 299, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=52752672237685597&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=r1My6sR9tX", "pdf": "https://openreview.net/pdf?id=r1My6sR9tX", "email": ";;", "author_num": 3 }, { "id": "r1NDBsAqY7", "title": "Unsupervised Word Discovery with Segmental Neural Language Models", "track": "main", "status": "Reject", "tldr": "A LSTM language model that discovers words from unsegmented sequences of characters.", "abstract": "We propose a segmental neural language model that combines the representational power of neural networks and the structure learning mechanism of Bayesian nonparametrics, and show that it learns to discover semantically meaningful units (e.g., morphemes and words) from unsegmented character sequences. The model generates text as a sequence of segments, where each segment is generated either character-by-character from a sequence model or as a single draw from a lexical memory that stores multi-character units. 
Its parameters are fit to maximize the marginal likelihood of the training data, summing over all segmentations of the input, and its hyperparameters are likewise set to optimize held-out marginal likelihood.\nTo prevent the model from overusing the lexical memory, which leads to poor generalization and bad segmentation, we introduce a differentiable regularizer that penalizes based on the expected length of each segment. To our knowledge, this is the first demonstration of neural networks that have predictive distributions better than LSTM language models and also infer a segmentation into word-like units that are competitive with the best existing word discovery models.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Kazuya Kawakami;Chris Dyer;Phil Blunsom", "authorids": "kawakamik@google.com;cdyer@google.com;pblunsom@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkawakami2019unsupervised,\ntitle={Unsupervised Word Discovery with Segmental Neural Language Models},\nauthor={Kazuya Kawakami and Chris Dyer and Phil Blunsom},\nyear={2019},\nurl={https://openreview.net/forum?id=r1NDBsAqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=r1NDBsAqY7", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;3;4", "wc_review": "281;383;372", "wc_reply_reviewers": "406;0;0", "wc_reply_authors": "896;144;232", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 345.3333333333333, 45.71165667043315 ], "wc_reply_reviewers_avg": [ 135.33333333333334, 191.39023544115884 ], "wc_reply_authors_avg": [ 424.0, 335.6823895688701 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.3273268353539886, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6117541566646001286&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Auxiliary Variational MCMC", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/996", "id": "r1NJqsRctX", "author_site": "Raza Habib, David Barber", "tldr": "", "abstract": "We introduce Auxiliary Variational MCMC, a novel framework for learning MCMC kernels that combines recent advances in variational inference with insights drawn from traditional auxiliary variable MCMC methods such as Hamiltonian Monte Carlo. Our framework exploits low dimensional structure in the target distribution in order to learn a more efficient MCMC sampler. The resulting sampler is able to suppress random walk behaviour and mix between modes efficiently, without the need to compute gradients of the target distribution. We test our sampler on a number of challenging distributions, where the underlying structure is known, and on the task of posterior sampling in Bayesian logistic regression. 
Code to reproduce all experiments is available at https://github.com/AVMCMC/AuxiliaryVariationalMCMC .\n", "keywords": "MCMC;Variational Inference", "primary_area": "", "supplementary_material": "", "author": "Raza Habib;David Barber", "authorids": "raza.habib@cs.ucl.ac.uk;david.barber@ucl.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nhabib2018auxiliary,\ntitle={Auxiliary Variational {MCMC}},\nauthor={Raza Habib and David Barber},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1NJqsRctX},\n}", "github": "[![github](/images/github_icon.svg) AVMCMC/AuxiliaryVariationalMCMC](https://github.com/AVMCMC/AuxiliaryVariationalMCMC)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;4;5", "wc_review": "623;377;479", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1256;594;1139", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 493.0, 100.91580649234291 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 996.3333333333334, 288.47453190109445 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16399175938915448128&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=r1NJqsRctX", "pdf": "https://openreview.net/pdf?id=r1NJqsRctX", "email": ";", "author_num": 2 }, { "id": "r1Nb5i05tX", "title": "The effectiveness of layer-by-layer training using the information bottleneck principle", "track": "main", "status": "Reject", "tldr": "", "abstract": "The recently proposed information bottleneck (IB) theory of deep nets suggests that during training, each layer attempts to maximize its mutual information (MI) with the target labels (so as to allow good prediction accuracy), while minimizing its MI with the input (leading to effective compression and thus good generalization). To date, evidence of this phenomenon has been indirect and aroused controversy due to theoretical and practical complications. In particular, it has been pointed out that the MI with the input is theoretically infinite in many cases of interest, and that the MI with the target is fundamentally difficult to estimate in high dimensions. As a consequence, the validity of this theory has been questioned. In this paper, we overcome these obstacles by two means. First, as previously suggested, we replace the MI with the input by a noise-regularized version, which ensures it is finite. As we show, this modified penalty in fact acts as a form of weight decay regularization. Second, to obtain accurate (noise regularized) MI estimates between an intermediate representation and the input, we incorporate the strong prior-knowledge we have about their relation, into the recently proposed MI estimator of Belghazi et al. (2018). With this scheme, we are able to stably train each layer independently to explicitly optimize the IB functional.
Surprisingly, this leads to enhanced prediction accuracy, thus directly validating the IB theory of deep nets for the first time.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Adar Elad;Doron Haviv;Yochai Blau;Tomer Michaeli", "authorids": "adarelad@campus.technion.ac.il;doron.haviv12@gmail.com;yochai@campus.technion.ac.il;tomer.m@ee.technion.ac.il", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nelad2019the,\ntitle={The effectiveness of layer-by-layer training using the information bottleneck principle},\nauthor={Adar Elad and Doron Haviv and Yochai Blau and Tomer Michaeli},\nyear={2019},\nurl={https://openreview.net/forum?id=r1Nb5i05tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1Nb5i05tX", "pdf_size": 0, "rating": "2;5;5", "confidence": "4;5;4", "wc_review": "1086;419;324", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1112;438;126", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 609.6666666666666, 339.0440810409303 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 558.6666666666666, 411.4764742836325 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12738273762748354120&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1V0m3C5YQ", "title": "Coupled Recurrent Models for Polyphonic Music Composition", "track": "main", "status": "Reject", "tldr": "New recurrent generative models for composition of rhythmically complex, polyphonic music.", "abstract": "This work describes a novel recurrent model for music composition, which accounts for the rich statistical structure of polyphonic music. There are many ways to factor the probability distribution over musical scores; we consider the merits of various approaches and propose a new factorization that decomposes a score into a collection of concurrent, coupled time series: \"parts.\" The model we propose borrows ideas from both convolutional neural models and recurrent neural models; we argue that these ideas are natural for capturing music's pitch invariances, temporal structure, and polyphony.\n\nWe train generative models for homophonic and polyphonic composition on the KernScores dataset (Sapp, 2005), a collection of 2,300 musical scores comprised of around 2.8 million notes spanning time from the Renaissance to the early 20th century. While evaluation of generative models is known to be hard (Theis et al., 2016), we present careful quantitative results using a unit-adjusted cross entropy metric that is independent of how we factor the distribution over scores. We also present qualitative results using a blind discrimination test.\n", "keywords": "music composition;music generation;polyphonic music modeling", "primary_area": "", "supplementary_material": "", "author": "John Thickstun;Zaid Harchaoui;Dean P. Foster;Sham M.
Kakade", "authorids": "thickstn@cs.washington.edu;zaid@uw.edu;sham@cs.washington.edu;dean@foster.net", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nthickstun2019coupled,\ntitle={Coupled Recurrent Models for Polyphonic Music Composition},\nauthor={John Thickstun and Zaid Harchaoui and Dean P. Foster and Sham M. Kakade},\nyear={2019},\nurl={https://openreview.net/forum?id=r1V0m3C5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1V0m3C5YQ", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;4;3", "wc_review": "407;1668;127", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "76;413;110", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 1.699673171197595 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 734.0, 670.2571645769008 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 199.66666666666666, 151.4867064208019 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9707253433941508, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13551359953087213812&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "r1VPNiA5Fm", "title": "The Universal Approximation Power of Finite-Width Deep ReLU Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We show that finite-width deep ReLU neural networks yield rate-distortion optimal approximation (B\u00f6lcskei et al., 2018) of a wide class of functions, including polynomials, windowed sinusoidal functions, one-dimensional oscillatory textures, and the Weierstrass function, a fractal function which is continuous but nowhere differentiable. Together with the recently established universal approximation result for affine function systems (B\u00f6lcskei et al., 2018), this demonstrates that deep neural networks approximate vastly different signal structures generated by the affine group, the Weyl-Heisenberg group, or through warping, and even certain fractals, all with approximation error decaying exponentially in the number of neurons. 
We also prove that in the approximation of sufficiently smooth functions finite-width deep networks require strictly fewer neurons than finite-depth wide networks.", "keywords": "rate-distortion optimality;ReLU;deep learning;approximation theory;Weierstrass function", "primary_area": "", "supplementary_material": "", "author": "Dmytro Perekrestenko;Philipp Grohs;Dennis Elbr\u00e4chter;Helmut B\u00f6lcskei", "authorids": "pdmytro@nari.ee.ethz.ch;philipp.grohs@univie.ac.at;dennis.elbraechter@univie.ac.at;boelcskei@nari.ee.ethz.ch", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nperekrestenko2019the,\ntitle={The Universal Approximation Power of Finite-Width Deep Re{LU} Networks},\nauthor={Dmytro Perekrestenko and Philipp Grohs and Dennis Elbr\u00e4chter and Helmut B\u00f6lcskei},\nyear={2019},\nurl={https://openreview.net/forum?id=r1VPNiA5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=r1VPNiA5Fm", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;3", "wc_review": "204;549;753", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "483;773;883", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 502.0, 226.57890457851542 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 713.0, 168.72067646458353 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5637483276705616741&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5 }, { "id": "r1VmtsC5t7", "title": "Is PGD-Adversarial Training Necessary? Alternative Training via a Soft-Quantization Network with Noisy-Natural Samples Only", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Recent work on adversarial attack and defense suggests that projected gradient descent (PGD) is a universal $l_\\infty$ first-order attack, and PGD adversarial training can significantly improve network robustness against a wide range of first-order $l_\\infty$-bounded attacks, represented as the state-of-the-art defense method. However, an obvious weakness of PGD adversarial training is its highly-computational cost in generating adversarial samples, making it computationally infeasible for large and high-resolution real datasets such as the ImageNet dataset. In addition, recent work also has suggested a simple ``close-form'' solution to a robust model on MNIST. Therefore, a natural question raised is that is PGD adversarial training really necessary for robust defense? In this paper, surprisingly, we give a negative answer by proposing a training paradigm that is comparable to PGD adversarial training on several standard datasets, while only using noisy-natural samples. Specifically, we reformulate the min-max objective in PGD adversarial training by a minimization problem to minimize the original network loss plus $l_1$ norms of its gradients evaluated on the inputs (including adversarial samples). 
The original loss can be solved by natural training; for the $l_1$-norm loss, we propose a computationally-feasible solution by embedding a differentiable soft-quantization layer after the input layer of a network. We show formally that the soft-quantization layer trained with noisy-natural samples is an alternative approach to minimizing the $l_1$-gradient norms as in PGD adversarial training. Extensive empirical evaluations on three standard datasets including MNIST, CIFAR-10 and ImageNet show that our proposed models are comparable to PGD-adversarially-trained models under PGD and BPDA attacks using both cross-entropy and $CW_\\infty$ losses. Remarkably, our method achieves a 24X speed-up on MNIST while maintaining a comparable defensive ability, and for the first time fine-tunes a robust Imagenet model within only two days. Code for the experiments will be released on Github.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Tianhang Zheng;Changyou Chen;Kui Ren", "authorids": "tzheng4@buffalo.edu;cchangyou@gmail.com;kuiren@buffalo.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=r1VmtsC5t7", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7716654396743480263&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "r1Vx_oA5YQ", "title": "Integrated Steganography and Steganalysis with Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, generative adversarial network is the hotspot in research areas and industrial application areas. It's application on data generation in computer vision is most common usage. This paper extends its application to data hiding and security area. In this paper, we propose the novel framework to integrate steganography and steganalysis processes. The proposed framework applies generative adversarial networks as the core structure. The discriminative model simulate the steganalysis process, which can help us understand the sensitivity of cover images to semantic changes. The steganography generative model is to generate stego image which is aligned with the original cover image, and attempts to confuse steganalysis discriminative model. The introduction of cycle discriminative model and inconsistent loss can help to enhance the quality and security of generated stego image in the iterative training process. Training dataset is mixed with intact images as well as intentional attacked images. The mix training process can further improve the robustness and security of new framework. 
Through the qualitative, quantitative experiments and analysis, this novel framework shows compelling performance and advantages over the current state-of-the-art methods in steganography and steganalysis benchmarks.", "keywords": "Steganography;Steganography;Security;Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Chong Yu", "authorids": "dxxzdxxz@126.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nyu2019integrated,\ntitle={Integrated Steganography and Steganalysis with Generative Adversarial Networks},\nauthor={Chong Yu},\nyear={2019},\nurl={https://openreview.net/forum?id=r1Vx_oA5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1Vx_oA5YQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "2;5;4", "wc_review": "135;253;404", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 264.0, 110.09389931629575 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.1889822365046136, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3618842872755685007&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "r1e-nj05FQ", "title": "Evolving intrinsic motivations for altruistic behavior", "track": "main", "status": "Withdraw", "tldr": "We introduce a biologically-inspired modular evolutionary algorithm in which deep RL agents learn to cooperate in a difficult multi-agent social game, which could help to explain the evolution of altruism.", "abstract": "Multi-agent cooperation is an important feature of the natural world. Many tasks involve individual incentives that are misaligned with the common good, yet a wide range of organisms from bacteria to insects and humans are able to overcome their differences and collaborate. Therefore, the emergence of cooperative behavior amongst self-interested individuals is an important question for the fields of multi-agent reinforcement learning (MARL) and evolutionary theory. Here, we study a particular class of multi-agent problems called intertemporal social dilemmas (ISDs), where the conflict between the individual and the group is particularly sharp. By combining MARL with appropriately structured natural selection, we demonstrate that individual inductive biases for cooperation can be learned in a model-free way. To achieve this, we introduce an innovative modular architecture for deep reinforcement learning agents which supports multi-level selection. We present results in two challenging environments, and interpret these in the context of cultural and ecological evolution.", "keywords": "evolution;reinforcement learning;intrinsic reward;multi-agent;social dilemmas;cooperation", "primary_area": "", "supplementary_material": "", "author": "Jane X. Wang;Edward Hughes;Chrisantha Fernando;Wojciech M. Czarnecki;Edgar A. Duenez-Guzman;Joel Z. 
Leibo", "authorids": "wangjane@google.com;edwardhughes@google.com;chrisantha@google.com;lejlot@google.com;duenez@google.com;jzl@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1e-nj05FQ", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;3;2", "wc_review": "278;210;378", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 288.6666666666667, 68.99919484232326 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 104, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15151497280897719516&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "Neural network gradient-based learning of black-box function interfaces", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/849", "id": "r1e13s05YX", "author_site": "Alon Jacovi, guy hadash, Einat Kermany, Boaz Carmeli, Ofer Lavi, George M. Kour, Jonathan Berant", "tldr": "Training DNNs to interface w\\ black box functions w\\o intermediate labels by using an estimator sub-network that can be replaced with the black box after training", "abstract": "Deep neural networks work well at approximating complicated functions when provided with data and trained by gradient descent methods. At the same time, there is a vast amount of existing functions that programmatically solve different tasks in a precise manner eliminating the need for training. In many cases, it is possible to decompose a task to a series of functions, of which for some we may prefer to use a neural network to learn the functionality, while for others the preferred method would be to use existing black-box functions. We propose a method for end-to-end training of a base neural network that integrates calls to existing black-box functions. We do so by approximating the black-box functionality with a differentiable neural network in a way that drives the base network to comply with the black-box function interface during the end-to-end optimization process. At inference time, we replace the differentiable estimator with its external black-box non-differentiable counterpart such that the base network output matches the input arguments of the black-box function. Using this ``Estimate and Replace'' paradigm, we train a neural network, end to end, to compute the input to black-box functionality while eliminating the need for intermediate labels. 
We show that by leveraging the existing precise black-box function during inference, the integrated model generalizes better than a fully differentiable model, and learns more efficiently compared to RL-based methods.", "keywords": "neural networks;black box functions;gradient descent", "primary_area": "", "supplementary_material": "", "author": "Alon Jacovi;Guy Hadash;Einat Kermany;Boaz Carmeli;Ofer Lavi;George Kour;Jonathan Berant", "authorids": "alon.jacovi@il.ibm.com;guyh@il.ibm.com;einatke@il.ibm.com;boazc@il.ibm.com;oferl@il.ibm.com;gkour@ibm.com;joberant@cs.tau.ac.il", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\njacovi2018neural,\ntitle={Neural network gradient-based learning of black-box function interfaces},\nauthor={Alon Jacovi and Guy Hadash and Einat Kermany and Boaz Carmeli and Ofer Lavi and George Kour and Jonathan Berant},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1e13s05YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;3;3", "wc_review": "264;378;493", "wc_reply_reviewers": "0;0;18", "wc_reply_authors": "100;169;424", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 378.3333333333333, 93.48915563969022 ], "wc_reply_reviewers_avg": [ 6.0, 8.48528137423857 ], "wc_reply_authors_avg": [ 231.0, 139.34848402476433 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=103487598035964441&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=r1e13s05YX", "pdf": "https://openreview.net/pdf?id=r1e13s05YX", "email": ";;;;;;", "author_num": 7 }, { "title": "Self-Tuning Networks: Bilevel Optimization of Hyperparameters using Structured Best-Response Functions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/824", "id": "r1eEG20qKQ", "author_site": "Matthew MacKay, Paul Vicol, Jonathan Lorraine, David Duvenaud, Roger Grosse", "tldr": "We use a hypernetwork to predict optimal weights given hyperparameters, and jointly train everything together.", "abstract": "Hyperparameter optimization can be formulated as a bilevel optimization problem, where the optimal parameters on the training set depend on the hyperparameters. We aim to adapt regularization hyperparameters for neural networks by fitting compact approximations to the best-response function, which maps hyperparameters to optimal weights and biases. We show how to construct scalable best-response approximations for neural networks by modeling the best-response as a single network whose hidden units are gated conditionally on the regularizer. We justify this approximation by showing the exact best-response for a shallow linear network with L2-regularized Jacobian can be represented by a similar gating mechanism. 
We fit this model using a gradient-based hyperparameter optimization algorithm which alternates between approximating the best-response around the current hyperparameters and optimizing the hyperparameters using the approximate best-response function. Unlike other gradient-based approaches, we do not require differentiating the training loss with respect to the hyperparameters, allowing us to tune discrete hyperparameters, data augmentation hyperparameters, and dropout probabilities. Because the hyperparameters are adapted online, our approach discovers hyperparameter schedules that can outperform fixed hyperparameter values. Empirically, our approach outperforms competing hyperparameter optimization methods on large-scale deep learning problems. We call our networks, which update their own hyperparameters online during training, Self-Tuning Networks (STNs).", "keywords": "hyperparameter optimization;game theory;optimization", "primary_area": "", "supplementary_material": "", "author": "Matthew Mackay;Paul Vicol;Jonathan Lorraine;David Duvenaud;Roger Grosse", "authorids": "mmackay@cs.toronto.edu;pvicol@cs.toronto.edu;lorraine@cs.toronto.edu;duvenaud@cs.toronto.edu;rgrosse@cs.toronto.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nmackay2018selftuning,\ntitle={Self-Tuning Networks: Bilevel Optimization of Hyperparameters using Structured Best-Response Functions},\nauthor={Matthew Mackay and Paul Vicol and Jonathan Lorraine and David Duvenaud and Roger Grosse},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1eEG20qKQ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=r1eEG20qKQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;4", "wc_review": "225;569;97", "wc_reply_reviewers": "19;0;0", "wc_reply_authors": "439;1083;273", "reply_reviewers": "1;0;0", "reply_authors": "1;2;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 297.0, 199.30546070458448 ], "wc_reply_reviewers_avg": [ 6.333333333333333, 8.956685895029603 ], "wc_reply_authors_avg": [ 598.3333333333334, 349.3473279639556 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 216, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13746959771027006799&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=r1eEG20qKQ", "pdf": "https://openreview.net/pdf?id=r1eEG20qKQ", "email": ";;;;", "author_num": 5 }, { "id": "r1eJssCqY7", "title": "TabNN: A Universal Neural Network Solution for Tabular Data", "track": "main", "status": "Reject", "tldr": "We propose a universal neural network solution to derive effective NN architectures for tabular data automatically.", "abstract": "Neural Network (NN) has achieved state-of-the-art performances in many tasks within image, speech, and text domains. 
Such great success is mainly due to special structure design to fit the particular data patterns, such as CNN capturing spatial locality and RNN modeling sequential dependency. Essentially, these specific NNs achieve good performance by leveraging the prior knowledge over corresponding domain data. Nevertheless, there are many applications with all kinds of tabular data in other domains. Since there are no shared patterns among these diverse tabular data, it is hard to design specific structures to fit them all. Without careful architecture design based on domain knowledge, it is quite challenging for NN to reach satisfactory performance in these tabular data domains. To fill the gap of NN in tabular data learning, we propose a universal neural network solution, called TabNN, to derive effective NN architectures for tabular data in all kinds of tasks automatically. Specifically, the design of TabNN follows two principles: \\emph{to explicitly leverages expressive feature combinations} and \\emph{to reduce model complexity}. Since GBDT has empirically proven its strength in modeling tabular data, we use GBDT to power the implementation of TabNN. Comprehensive experimental analysis on a variety of tabular datasets demonstrate that TabNN can achieve much better performance than many baseline solutions.", "keywords": "neural network;machine learning;tabular data", "primary_area": "", "supplementary_material": "", "author": "Guolin Ke;Jia Zhang;Zhenhui Xu;Jiang Bian;Tie-Yan Liu", "authorids": "guolin.ke@microsoft.com;jia.zhang@microsoft.com;zhenhui.xu@pku.edu.cn;jiang.bian@microsoft.com;tyliu@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nke2019tabnn,\ntitle={Tab{NN}: A Universal Neural Network Solution for Tabular Data},\nauthor={Guolin Ke and Jia Zhang and Zhenhui Xu and Jiang Bian and Tie-Yan Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=r1eJssCqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1eJssCqY7", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;2;4", "wc_review": "413;149;237", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "478;477;468", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 266.3333333333333, 109.75528334536894 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 474.3333333333333, 4.4969125210773475 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.7559289460184542, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16874221028640223828&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1eO_oCqtQ", "title": "Gaussian-gated LSTM: Improved convergence by reducing state updates", "track": "main", "status": "Reject", "tldr": "Gaussian-gated LSTM is a novel time-gated LSTM RNN network that enables faster and better training on long sequence data.", "abstract": "Recurrent neural networks can be difficult to train on long sequence data due to the well-known vanishing gradient problem. 
Some architectures incorporate methods to reduce RNN state updates, therefore allowing the network to preserve memory over long temporal intervals. To address these problems of convergence, this paper proposes a timing-gated LSTM RNN model, called the Gaussian-gated LSTM (g-LSTM). The time gate controls when a neuron can be updated during training, enabling longer memory persistence and better error-gradient flow. This model captures long-temporal dependencies better than an LSTM and the time gate parameters can be learned even from non-optimal initialization values. Because the time gate limits the updates of the neuron state, the number of computes needed for the network update is also reduced. By adding a computational budget term to the training loss, we can obtain a network which further reduces the number of computes by at least 10x. Finally, by employing a temporal curriculum learning schedule for the g-LSTM, we can reduce the convergence time of the equivalent LSTM network on long sequences.", "keywords": "time gate;faster convergence;trainability;rnn;computational budget", "primary_area": "", "supplementary_material": "", "author": "Matthew Thornton;Jithendar Anumula;Shih-Chii Liu", "authorids": "mattsthornton@gmail.com;anumula@ini.uzh.ch;shih@ini.uzh.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nthornton2019gaussiangated,\ntitle={Gaussian-gated {LSTM}: Improved convergence by reducing state updates},\nauthor={Matthew Thornton and Jithendar Anumula and Shih-Chii Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=r1eO_oCqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1eO_oCqtQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;4", "wc_review": "326;356;219", "wc_reply_reviewers": "0;483;0", "wc_reply_authors": "196;331;270", "reply_reviewers": "0;2;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 300.3333333333333, 58.80098260705815 ], "wc_reply_reviewers_avg": [ 161.0, 227.6883835420683 ], "wc_reply_authors_avg": [ 265.6666666666667, 55.19863122296502 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8T1sWcSwzhUJ:scholar.google.com/&scioq=Gaussian-gated+LSTM:+Improved+convergence+by+reducing+state+updates&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Unsupervised Control Through Non-Parametric Discriminative Rewards", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/829", "id": "r1eVMnA9K7", "author_site": "David Warde-Farley, Tom Wiele, Tejas Kulkarni, Catalin Ionescu, Steven Hansen, Volodymyr Mnih", "tldr": "Unsupervised reinforcement learning method for learning a policy to robustly achieve perceptually specified goals.", "abstract": "Learning to control an environment without hand-crafted rewards or expert data remains challenging and is at the frontier of reinforcement learning research. 
We present an unsupervised learning algorithm to train agents to achieve perceptually-specified goals using only a stream of observations and actions. Our agent simultaneously learns a goal-conditioned policy and a goal achievement reward function that measures how similar a state is to the goal state. This dual optimization leads to a co-operative game, giving rise to a learned reward function that reflects similarity in controllable aspects of the environment instead of distance in the space of observations. We demonstrate the efficacy of our agent to learn, in an unsupervised manner, to reach a diverse set of goals on three domains -- Atari, the DeepMind Control Suite and DeepMind Lab.", "keywords": "deep reinforcement learning;goals;UVFA;mutual information", "primary_area": "", "supplementary_material": "", "author": "David Warde-Farley;Tom Van de Wiele;Tejas Kulkarni;Catalin Ionescu;Steven Hansen;Volodymyr Mnih", "authorids": "d.warde.farley@gmail.com;tomvandewiele@google.com;tejasdkulkarni@gmail.com;cdi@google.com;stevenhansen@google.com;vmnih@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nwarde-farley2018unsupervised,\ntitle={Unsupervised Control Through Non-Parametric Discriminative Rewards},\nauthor={David Warde-Farley and Tom Van de Wiele and Tejas Kulkarni and Catalin Ionescu and Steven Hansen and Volodymyr Mnih},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1eVMnA9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;7;8", "confidence": "5;3;5", "wc_review": "344;942;994", "wc_reply_reviewers": "322;30;111", "wc_reply_authors": "882;1261;1722", "reply_reviewers": "1;1;1", "reply_authors": "2;2;3", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 760.0, 294.9214584710083 ], "wc_reply_reviewers_avg": [ 154.33333333333334, 123.0835308055288 ], "wc_reply_authors_avg": [ 1288.3333333333333, 343.47278721254696 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 201, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11006025124069493159&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=r1eVMnA9K7", "pdf": "https://openreview.net/pdf?id=r1eVMnA9K7", "email": ";;;;;", "author_num": 6 }, { "id": "r1eWW2RqFX", "title": "PointGrow: Autoregressively Learned Point Cloud Generation with Self-Attention", "track": "main", "status": "Withdraw", "tldr": "An autoregressive deep learning model for generating diverse point clouds.", "abstract": "A point cloud is an agile 3D representation, efficiently modeling an object's surface geometry. However, these surface-centric properties also pose challenges on designing tools to recognize and synthesize point clouds. This work presents a novel autoregressive model, PointGrow, which generates realistic point cloud samples from scratch or conditioned from given semantic contexts. 
Our model operates recurrently, with each point sampled according to a conditional distribution given its previously-generated points. Since point cloud object shapes are typically encoded by long-range interpoint dependencies, we augment our model with dedicated self-attention modules to capture these relations. Extensive evaluation demonstrates that PointGrow achieves satisfying performance on both unconditional and conditional point cloud generation tasks, with respect to fidelity, diversity and semantic preservation. Further, conditional PointGrow learns a smooth manifold of given images where 3D shape interpolation and arithmetic calculation can be performed inside.", "keywords": "point cloud generation;autoregressive models;self-attention", "primary_area": "", "supplementary_material": "", "author": "Yongbin Sun;Yue Wang;Ziwei Liu;Joshua E. Siegel;Sanjay Sarma", "authorids": "yb_sun@mit.edu;yuewang@csail.mit.edu;zwliu.hust@gmail.com;j_siegel@mit.edu;sesarma@mit.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1eWW2RqFX", "pdf_size": 0, "rating": "3;6;6", "confidence": "4;5;4", "wc_review": "295;384;282", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 320.3333333333333, 45.330882286680556 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 248, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11174852044297249411&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "title": "Interpolation-Prediction Networks for Irregularly Sampled Time Series", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1058", "id": "r1efr3C9Ym", "author_site": "Satya Narayan Shukla, Benjamin M Marlin", "tldr": "This paper presents a new deep learning architecture for addressing the problem of supervised learning with sparse and irregularly sampled multivariate time series.", "abstract": "In this paper, we present a new deep learning architecture for addressing the problem of supervised learning with sparse and irregularly sampled multivariate time series. The architecture is based on the use of a semi-parametric interpolation network followed by the application of a prediction network. The interpolation network allows for information to be shared across multiple dimensions of a multivariate time series during the interpolation stage, while any standard deep learning model can be used for the prediction network. This work is motivated by the analysis of physiological time series data in electronic health records, which are sparse, irregularly sampled, and multivariate. 
We investigate the performance of this architecture on both classification and regression tasks, showing that our approach outperforms a range of baseline and recently proposed models.\n", "keywords": "irregular sampling;multivariate time series;supervised learning;interpolation;missing data", "primary_area": "", "supplementary_material": "", "author": "Satya Narayan Shukla;Benjamin Marlin", "authorids": "snshukla@cs.umass.edu;marlin@cs.umass.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nshukla2018interpolationprediction,\ntitle={Interpolation-Prediction Networks for Irregularly Sampled Time Series},\nauthor={Satya Narayan Shukla and Benjamin Marlin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1efr3C9Ym},\n}", "github": "[![github](/images/github_icon.svg) mlds-lab/interp-net](https://github.com/mlds-lab/interp-net)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "wc_review": "185;380;1219", "wc_reply_reviewers": "0;0;726", "wc_reply_authors": "417;530;1656", "reply_reviewers": "0;0;3", "reply_authors": "1;1;5", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 594.6666666666666, 448.5906324875226 ], "wc_reply_reviewers_avg": [ 242.0, 342.239682094289 ], "wc_reply_authors_avg": [ 867.6666666666666, 559.3414778906432 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 197, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15477406781147246766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=r1efr3C9Ym", "pdf": "https://openreview.net/pdf?id=r1efr3C9Ym", "email": ";", "author_num": 2 }, { "title": "Riemannian Adaptive Optimization Methods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/987", "id": "r1eiqi09K7", "author_site": "Gary B\u00e9cigneul, Octavian Ganea", "tldr": "Adapting Adam, Amsgrad, Adagrad to Riemannian manifolds. ", "abstract": "Several first order stochastic optimization methods commonly used in the Euclidean domain such as stochastic gradient descent (SGD), accelerated gradient descent or variance reduced methods have already been adapted to certain Riemannian settings. However, some of the most popular of these optimization tools - namely Adam, Adagrad and the more recent Amsgrad - remain to be generalized to Riemannian manifolds. We discuss the difficulty of generalizing such adaptive schemes to the most agnostic Riemannian setting, and then provide algorithms and convergence proofs for geodesically convex objectives in the particular case of a product of Riemannian manifolds, in which adaptivity is implemented across manifolds in the cartesian product. Our generalization is tight in the sense that choosing the Euclidean space as Riemannian manifold yields the same algorithms and regret bounds as those that were already known for the standard algorithms. 
Experimentally, we show faster convergence and to a lower train loss value for Riemannian adaptive methods over their corresponding baselines on the realistic task of embedding the WordNet taxonomy in the Poincare ball.", "keywords": "Riemannian optimization;adaptive;hyperbolic;curvature;manifold;adam;amsgrad;adagrad;rsgd;convergence", "primary_area": "", "supplementary_material": "", "author": "Gary Becigneul;Octavian-Eugen Ganea", "authorids": "gary.becigneul@inf.ethz.ch;octavian.ganea@inf.ethz.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nbecigneul2018riemannian,\ntitle={Riemannian Adaptive Optimization Methods},\nauthor={Gary Becigneul and Octavian-Eugen Ganea},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1eiqi09K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;5;3", "wc_review": "299;346;52", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "385;540;28", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 232.33333333333334, 128.9504642187155 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 317.6666666666667, 214.37713393819678 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 311, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1396968712065287327&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=r1eiqi09K7", "pdf": "https://openreview.net/pdf?id=r1eiqi09K7", "email": ";", "author_num": 2 }, { "id": "r1ejxnCctX", "title": "Representation Flow for Action Recognition", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "In this paper, we propose a convolutional layer inspired by optical flow algorithms to learn motion representations. Our representation flow layer is a fully-differentiable layer designed to capture the `flow' of any representation channel within a convolutional neural network for action recognition. Its parameters for iterative flow optimization are learned in an end-to-end fashion together with the other model parameters, maximizing the action recognition performance. Furthermore, we newly introduce the concept of learning `flow of flow' representations by stacking multiple representation flow layers. We conducted extensive experimental evaluations, confirming its advantages over previous recognition models using traditional optical flows in both computational speed and performance.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "AJ Piergiovanni;Michael S. 
Ryoo", "authorids": "ajpiergi@indiana.edu;mryoo@indiana.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1ejxnCctX", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;4;5", "wc_review": "596;520;685", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "851;271;641", "reply_reviewers": "0;0;0", "reply_authors": "7;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 600.3333333333334, 67.43062278289361 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 587.6666666666666, 239.76840677805924 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 2.8284271247461903 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 209, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12306061084535496096&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "r1elIi09K7", "title": "Learning a Neural-network-based Representation for Open Set Recognition", "track": "main", "status": "Reject", "tldr": "In this paper, we present a neural network based representation for addressing the open set recognition problem.", "abstract": "In this paper, we present a neural network based representation for addressing the open set recognition problem. In this representation instances from the same class are close to each other while instances from different classes are further apart, resulting in statistically significant improvement when compared to other approaches on three datasets from two different domains. \n", "keywords": "open set recognition", "primary_area": "", "supplementary_material": "", "author": "Mehadi Hassen;Philip K. Chan", "authorids": "mhassen2005@my.fit.edu;pkc@cs.fit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhassen2019learning,\ntitle={Learning a Neural-network-based Representation for Open Set Recognition},\nauthor={Mehadi Hassen and Philip K. 
Chan},\nyear={2019},\nurl={https://openreview.net/forum?id=r1elIi09K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1elIi09K7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "wc_review": "382;357;444", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "68;190;489", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 394.3333333333333, 36.572606627851336 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 249.0, 176.86341245906874 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15464454179087536075&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "r1erRoCqtX", "title": "LSH Microbatches for Stochastic Gradients: Value in Rearrangement", "track": "main", "status": "Reject", "tldr": "Accelerating SGD by arranging examples differently", "abstract": " Metric embeddings are immensely useful representations of associations between entities (images, users, search queries, words, and more). Embeddings are learned by optimizing a loss objective of the general form of a sum over example associations. Typically, the optimization uses stochastic gradient updates over minibatches of examples that are arranged independently at random. In this work, we propose the use of {\\em structured arrangements} through randomized {\\em microbatches} of examples that are more likely to include similar ones. We make a principled argument for the properties of our arrangements that accelerate the training and present efficient algorithms to generate microbatches that respect the marginal distribution of training examples. Finally, we observe experimentally that our structured arrangements accelerate training by 3-20\\%. 
Structured arrangements emerge as a powerful and novel performance knob for SGD that is independent and complementary to other SGD hyperparameters and thus is a candidate for wide deployment.", "keywords": "Stochastic Gradient Descent;Metric Embeddings;Locality Sensitive Hashing;Microbatches;Sample coordination", "primary_area": "", "supplementary_material": "", "author": "Eliav Buchnik;Edith Cohen;Avinatan Hassidim;Yossi Matias", "authorids": "eliavbuh@gmail.com;edith@cohenwang.com;;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbuchnik2019lsh,\ntitle={{LSH} Microbatches for Stochastic Gradients: Value in Rearrangement},\nauthor={Eliav Buchnik and Edith Cohen and Avinatan Hassidim and Yossi Matias},\nyear={2019},\nurl={https://openreview.net/forum?id=r1erRoCqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer3;AnonReviewer5", "site": "https://openreview.net/forum?id=r1erRoCqtX", "pdf_size": 0, "rating": "3;4;4;4", "confidence": "2;4;4;3", "wc_review": "349;477;246;726", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "488;775;477;773", "reply_reviewers": "0;0;0;0", "reply_authors": "1;1;1;1", "rating_avg": [ 3.75, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.82915619758885 ], "wc_review_avg": [ 449.5, 179.38854478477717 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 628.25, 145.80359220540487 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8703882797784891, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:xbX0-1hVgjAJ:scholar.google.com/&scioq=LSH+Microbatches+for+Stochastic+Gradients:+Value+in+Rearrangement&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "r1esnoAqt7", "title": "Morpho-MNIST: Quantitative Assessment and Diagnostics for Representation Learning", "track": "main", "status": "Reject", "tldr": "This paper introduces Morpho-MNIST, a collection of shape metrics and perturbations, in a step towards quantitative evaluation of representation learning.", "abstract": "Revealing latent structure in data is an active field of research, having introduced exciting technologies such as variational autoencoders and adversarial networks, and is essential to push machine learning towards unsupervised knowledge discovery. However, a major challenge is the lack of suitable benchmarks for an objective and quantitative evaluation of learned representations. To address this issue we introduce Morpho-MNIST, a framework that aims to answer: \"to what extent has my model learned to represent specific factors of variation in the data?\" We extend the popular MNIST dataset by adding a morphometric analysis enabling quantitative comparison of trained models, identification of the roles of latent variables, and characterisation of sample diversity. We further propose a set of quantifiable perturbations to assess the performance of unsupervised and supervised methods on challenging tasks such as outlier detection and domain adaptation.", "keywords": "quantitative evaluation;diagnostics;generative models;representation learning;morphometrics;image perturbations", "primary_area": "", "supplementary_material": "", "author": "Daniel C. 
Castro;Jeremy Tan;Bernhard Kainz;Ender Konukoglu;Ben Glocker", "authorids": "d.coelho-de-castro15@imperial.ac.uk;j.tan17@imperial.ac.uk;b.kainz@imperial.ac.uk;kender@vision.ee.ethz.ch;b.glocker@imperial.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ncastro2019morphomnist,\ntitle={Morpho-{MNIST}: Quantitative Assessment and Diagnostics for Representation Learning},\nauthor={Daniel C. Castro and Jeremy Tan and Bernhard Kainz and Ender Konukoglu and Ben Glocker},\nyear={2019},\nurl={https://openreview.net/forum?id=r1esnoAqt7},\n}", "github": "[![github](/images/github_icon.svg) dccastro/Morpho-MNIST](https://github.com/dccastro/Morpho-MNIST)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1esnoAqt7", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;3", "wc_review": "234;84;221", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "330;355;588", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 179.66666666666666, 67.8544193271317 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 424.3333333333333, 116.17898069597425 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 76, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9456890875566188139&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "r1espiA9YQ", "title": "Towards More Theoretically-Grounded Particle Optimization Sampling for Deep Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many deep-learning based methods such as Bayesian deep learning (DL) and deep reinforcement learning (RL) have heavily relied on the ability of a model being able to efficiently explore via Bayesian sampling. Particle-optimization sampling (POS) is a recently developed technique to generate high-quality samples from a target distribution by iteratively updating a set of interactive particles, with a representative algorithm the Stein variational gradient descent (SVGD). Though obtaining significant empirical success, the {\\em non-asymptotic} convergence behavior of SVGD remains unknown. In this paper, we generalize POS to a stochasticity setting by injecting random noise in particle updates, called stochastic particle-optimization sampling (SPOS). Notably, for the first time, we develop {\\em non-asymptotic convergence theory} for the SPOS framework, characterizing convergence of a sample approximation w.r.t.\\! the number of particles and iterations under both convex- and noncovex-energy-function settings. Interestingly, we provide theoretical understanding of a pitfall of SVGD that can be avoided in the proposed SPOS framework, {\\it i.e.}, particles tend to collapse to a local mode in SVGD under some particular conditions. Our theory is based on the analysis of nonlinear stochastic differential equations, which serves as an extension and a complementary development to the asymptotic convergence theory for SVGD such as (Liu, 2017). With such theoretical guarantees, SPOS can be safely and effectively applied on both Bayesian DL and deep RL tasks. 
Extensive results demonstrate the effectiveness of our proposed framework.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jianyi Zhang;Ruiyi Zhang;Changyou Chen", "authorids": "15300180019@fudan.edu.cn;rz68@duke.edu;cchangyou@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2019towards,\ntitle={Towards More Theoretically-Grounded Particle Optimization Sampling for Deep Learning},\nauthor={Jianyi Zhang and Ruiyi Zhang and Changyou Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=r1espiA9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1espiA9YQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;4", "wc_review": "625;362;482", "wc_reply_reviewers": "420;0;0", "wc_reply_authors": "2940;529;238", "reply_reviewers": "1;0;0", "reply_authors": "8;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 489.6666666666667, 107.50607217992645 ], "wc_reply_reviewers_avg": [ 140.0, 197.9898987322333 ], "wc_reply_authors_avg": [ 1235.6666666666667, 1210.9870170879437 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 3.3333333333333335, 3.2998316455372216 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Jy4SxVMitMYJ:scholar.google.com/&scioq=Towards+More+Theoretically-Grounded+Particle+Optimization+Sampling+for+Deep+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "r1exVhActQ", "title": "DEEP-TRIM: REVISITING L1 REGULARIZATION FOR CONNECTION PRUNING OF DEEP NETWORK", "track": "main", "status": "Reject", "tldr": "We revisit the simple idea of pruning connections of DNNs through $\\ell_1$ regularization achieving state-of-the-art results on multiple datasets with theoretic guarantees.", "abstract": "State-of-the-art deep neural networks (DNNs) typically have tens of millions of parameters, which might not fit into the upper levels of the memory hierarchy, thus increasing the inference time and energy consumption significantly, and prohibiting their use on edge devices such as mobile phones. The compression of DNN models has therefore become an active area of research recently, with \\emph{connection pruning} emerging as one of the most successful strategies. A very natural approach is to prune connections of DNNs via $\\ell_1$ regularization, but recent empirical investigations have suggested that this does not work as well in the context of DNN compression. In this work, we revisit this simple strategy and analyze it rigorously, to show that: (a) any \\emph{stationary point} of an $\\ell_1$-regularized layerwise-pruning objective has its number of non-zero elements bounded by the number of penalized prediction logits, regardless of the strength of the regularization; (b) successful pruning highly relies on an accurate optimization solver, and there is a trade-off between compression speed and distortion of prediction accuracy, controlled by the strength of regularization. Our theoretical results thus suggest that $\\ell_1$ pruning could be successful provided we use an accurate optimization solver. 
We corroborate this in our experiments, where we show that simple $\\ell_1$ regularization with an Adamax-L1(cumulative) solver gives pruning ratio competitive to the state-of-the-art.", "keywords": "L1 regularization;deep neural network;deep compression", "primary_area": "", "supplementary_material": "", "author": "Chih-Kuan Yeh;Ian E.H. Yen;Hong-You Chen;Chun-Pei Yang;Shou-De Lin;Pradeep Ravikumar", "authorids": "cjyeh@cs.cmu.edu;eyan2@snapchat.com;applebasket70179@gmail.com;skylyyang@gmail.com;sdlin@csie.ntu.edu.tw;pradeep.ravikumar@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nyeh2019deeptrim,\ntitle={{DEEP}-{TRIM}: {REVISITING} L1 {REGULARIZATION} {FOR} {CONNECTION} {PRUNING} {OF} {DEEP} {NETWORK}},\nauthor={Chih-Kuan Yeh and Ian E.H. Yen and Hong-You Chen and Chun-Pei Yang and Shou-De Lin and Pradeep Ravikumar},\nyear={2019},\nurl={https://openreview.net/forum?id=r1exVhActQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=r1exVhActQ", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;4;3", "wc_review": "260;238;292", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "250;315;300", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 263.3333333333333, 22.17105219775452 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 288.3333333333333, 27.78888666755511 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18249881634292259345&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1ez_sRcFQ", "title": "Pixel Redrawn For A Robust Adversarial Defense", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, an adversarial example becomes a serious problem to be aware of because it can fool trained neural networks easily.\nTo prevent the issue, many researchers have proposed several defense techniques such as adversarial training, input transformation, stochastic activation pruning, etc.\nIn this paper, we propose a novel defense technique, Pixel Redrawn (PR) method, which redraws every pixel of training images to convert them into distorted images.\nThe motivation for our PR method is from the observation that the adversarial attacks have redrawn some pixels of the original image with the known parameters of the trained neural network.\nMimicking these attacks, our PR method redraws the image without any knowledge of the trained neural network.\nThis method can be similar to the adversarial training method but our PR method can be used to prevent future attacks.\nExperimental results on several benchmark datasets indicate our PR method not only relieves the over-fitting issue when we train neural networks with a large number of epochs, but it also boosts the robustness of the neural network.", "keywords": "adversarial machine learning;deep learning;adversarial example", "primary_area": "", "supplementary_material": "", "author": "Jiacang Ho;Dae-Ki Kang", "authorids": "ho_jiacang@hotmail.com;dkkang@dongseo.ac.kr", "gender": ";", "homepage": 
";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nho2019pixel,\ntitle={Pixel Redrawn For A Robust Adversarial Defense},\nauthor={Jiacang Ho and Dae-Ki Kang},\nyear={2019},\nurl={https://openreview.net/forum?id=r1ez_sRcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1ez_sRcFQ", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;3;3", "wc_review": "447;110;185", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "352;102;108", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 247.33333333333334, 144.46760498080144 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 187.33333333333334, 116.46267880980395 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12422525997970297529&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Minimal Random Code Learning: Getting Bits Back from Compressed Model Parameters", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1016", "id": "r1f0YiCctm", "author_site": "Marton Havasi, Robert Peharz, Jos\u00e9 Miguel Hern\u00e1ndez Lobato", "tldr": "This paper proposes an effective method to compress neural networks based on recent results in information theory.", "abstract": "While deep neural networks are a highly successful model class, their large memory footprint puts considerable strain on energy consumption, communication bandwidth, and storage requirements. Consequently, model size reduction has become an utmost goal in deep learning. A typical approach is to train a set of deterministic weights, while applying certain techniques such as pruning and quantization, in order that the empirical weight distribution becomes amenable to Shannon-style coding schemes. However, as shown in this paper, relaxing weight determinism and using a full variational distribution over weights allows for more efficient coding schemes and consequently higher compression rates. In particular, following the classical bits-back argument, we encode the network weights using a random sample, requiring only a number of bits corresponding to the Kullback-Leibler divergence between the sampled variational distribution and the encoding distribution. By imposing a constraint on the Kullback-Leibler divergence, we are able to explicitly control the compression rate, while optimizing the expected loss on the training set. The employed encoding scheme can be shown to be close to the optimal information-theoretical lower bound, with respect to the employed variational family. 
Our method sets new state-of-the-art in neural network compression, as it strictly dominates previous approaches in a Pareto sense: On the benchmarks LeNet-5/MNIST and VGG-16/CIFAR-10, our approach yields the best test performance for a fixed memory budget, and vice versa, it achieves the highest compression rates for a fixed test performance.", "keywords": "compression;neural networks;bits-back argument;Bayesian;Shannon;information theory", "primary_area": "", "supplementary_material": "", "author": "Marton Havasi;Robert Peharz;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", "authorids": "mh740@cam.ac.uk;rp587@cam.ac.uk;jmh233@cam.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhavasi2018minimal,\ntitle={Minimal Random Code Learning: Getting Bits Back from Compressed Model Parameters},\nauthor={Marton Havasi and Robert Peharz and Jos\u00e9 Miguel Hern\u00e1ndez-Lobato},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1f0YiCctm},\n}", "github": "[![github](/images/github_icon.svg) cambridge-mlg/miracle](https://github.com/cambridge-mlg/miracle) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=r1f0YiCctm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "2;3;4", "wc_review": "85;211;402", "wc_reply_reviewers": "0;0;30", "wc_reply_authors": "100;224;389", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 232.66666666666666, 130.31841346827736 ], "wc_reply_reviewers_avg": [ 10.0, 14.142135623730951 ], "wc_reply_authors_avg": [ 237.66666666666666, 118.37886447992123 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17962712491875468296&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=r1f0YiCctm", "pdf": "https://openreview.net/pdf?id=r1f0YiCctm", "email": ";;", "author_num": 3 }, { "id": "r1f78iAcFm", "title": "GRAPH TRANSFORMATION POLICY NETWORK FOR CHEMICAL REACTION PREDICTION", "track": "main", "status": "Reject", "tldr": "", "abstract": "We address a fundamental problem in chemistry known as chemical reaction product prediction. Our main insight is that the input reactant and reagent molecules can be jointly represented as a graph, and the process of generating product molecules from reactant molecules can be formulated as a sequence of graph transformations. To this end, we propose Graph Transformation Policy Network (GTPN) - a novel generic method that combines the strengths of graph neural networks and reinforcement learning to learn the reactions directly from data with minimal chemical knowledge. Compared to previous methods, GTPN has some appealing properties such as: end-to-end learning, and making no assumption about the length or the order of graph transformations. 
In order to guide model search through the complex discrete space of sets of bond changes effectively, we extend the standard policy gradient loss by adding useful constraints. Evaluation results show that GTPN improves the top-1 accuracy over the current state-of-the-art method by about 3% on the large USPTO dataset. Our model's performances and prediction errors are also analyzed carefully in the paper.", "keywords": "Chemical Reaction;Graph Transformation;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Kien Do;Truyen Tran;Svetha Venkatesh", "authorids": "dkdo@deakin.edu.au;truyen.tran@deakin.edu.au;svetha.venkatesh@deakin.edu.au", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndo2019graph,\ntitle={{GRAPH} {TRANSFORMATION} {POLICY} {NETWORK} {FOR} {CHEMICAL} {REACTION} {PREDICTION}},\nauthor={Kien Do and Truyen Tran and Svetha Venkatesh},\nyear={2019},\nurl={https://openreview.net/forum?id=r1f78iAcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1f78iAcFm", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;5", "wc_review": "448;576;588", "wc_reply_reviewers": "48;0;14", "wc_reply_authors": "1235;1225;2048", "reply_reviewers": "1;0;1", "reply_authors": "3;2;4", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 537.3333333333334, 63.35788997608919 ], "wc_reply_reviewers_avg": [ 20.666666666666668, 20.154955277107963 ], "wc_reply_authors_avg": [ 1502.6666666666667, 385.63050824447083 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 213, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3075996062238805253&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "r1fE3sAcYQ", "title": "Overcoming Multi-model Forgetting", "track": "main", "status": "Reject", "tldr": "We identify a phenomenon, neural brainwashing, and introduce a statistically-justified weight plasticity loss to overcome this.", "abstract": "We identify a phenomenon, which we refer to as *multi-model forgetting*, that occurs when sequentially training multiple deep networks with partially-shared parameters; the performance of previously-trained models degrades as one optimizes a subsequent one, due to the overwriting of shared parameters. To overcome this, we introduce a statistically-justified weight plasticity loss that regularizes the learning of a model's shared parameters according to their importance for the previous models, and demonstrate its effectiveness when training two models sequentially and for neural architecture search. 
Adding weight plasticity in neural architecture search preserves the best models to the end of the search and yields improved results in both natural language processing and computer vision tasks.", "keywords": "multi-model forgetting;deep learning;machine learning;multi-model training;neural architecture search", "primary_area": "", "supplementary_material": "", "author": "Yassine Benyahia*;Kaicheng Yu*;Kamil Bennani-Smires;Martin Jaggi;Anthony Davison;Mathieu Salzmann;Claudiu Musat", "authorids": "yassine.benyahia1@gmail.com;kaicheng.yu@epfl.ch;kamil.bennani-smires@swisscom.com;martin.jaggi@epfl.ch;anthony.davison@epfl.ch;mathieu.salzmann@epfl.ch;claudiu.musat@swisscom.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nbenyahia*2019overcoming,\ntitle={Overcoming Multi-model Forgetting},\nauthor={Yassine Benyahia* and Kaicheng Yu* and Kamil Bennani-Smires and Martin Jaggi and Anthony Davison and Mathieu Salzmann and Claudiu Musat},\nyear={2019},\nurl={https://openreview.net/forum?id=r1fE3sAcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1fE3sAcYQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "5;4;2", "wc_review": "227;292;50", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "447;470;284", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 189.66666666666666, 102.26219025405018 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 400.3333333333333, 82.79425637620575 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16148647356046938402&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "r1fO8oC9Y7", "title": "Multi-Task Learning for Semantic Parsing with Cross-Domain Sketch", "track": "main", "status": "Reject", "tldr": "General-to-detailed neural network(GDNN) with Multi-Task Learning by incorporating cross-domain sketch(CDS) for semantic parsing", "abstract": "Semantic parsing which maps a natural language sentence into a formal machine-readable representation of its meaning, is highly constrained by the limited annotated training data. Inspired by the idea of coarse-to-fine, we propose a general-to-detailed neural network(GDNN) by incorporating cross-domain sketch(CDS) among utterances and their logic forms. For utterances in different domains, the General Network will extract CDS using an encoder-decoder model in a multi-task learning setup. Then for some utterances in a specific domain, the Detailed Network will generate the detailed target parts using sequence-to-sequence architecture with advanced attention to both utterance and generated CDS. Our experiments show that compared to direct multi-task learning, CDS has improved the performance in semantic parsing task which converts users' requests into meaning representation language(MRL). 
We also use experiments to illustrate that CDS works by adding some constraints to the target decoding process, which further proves the effectiveness and rationality of CDS.", "keywords": "semantic parsing;natural language understanding;machine learning", "primary_area": "", "supplementary_material": "", "author": "Huan Wang;Yuxiang Hu;Li Dong;Feijun Jiang;Zaiqing Nie", "authorids": "odile.wh@alibaba-inc.com;yuxiang.hyx@alibaba-inc.com;li.dong@ed.ac.uk;feijun.jiangfj@alibaba-inc.com;zaiqing.nzq@alibaba-inc.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nwang2019multitask,\ntitle={Multi-Task Learning for Semantic Parsing with Cross-Domain Sketch},\nauthor={Huan Wang and Yuxiang Hu and Li Dong and Feijun Jiang and Zaiqing Nie},\nyear={2019},\nurl={https://openreview.net/forum?id=r1fO8oC9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1fO8oC9Y7", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "wc_review": "543;242;267", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "178;231;253", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 350.6666666666667, 136.38263167361973 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 220.66666666666666, 31.47838764754143 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GoRpDiX4d88J:scholar.google.com/&scioq=Multi-Task+Learning+for+Semantic+Parsing+with+Cross-Domain+Sketch&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "r1fWmnR5tm", "title": "Learning to Search Efficient DenseNet with Layer-wise Pruning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks have achieved outstanding performance in many real-world applications with the expense of huge computational resources. The DenseNet, one of the recently proposed neural network architecture, has achieved the state-of-the-art performance in many visual tasks. However, it has great redundancy due to the dense connections of the internal structure, which leads to high computational costs in training such dense networks. To address this issue, we design a reinforcement learning framework to search for efficient DenseNet architectures with layer-wise pruning (LWP) for different tasks, while retaining the original advantages of DenseNet, such as feature reuse, short paths, etc. In this framework, an agent evaluates the importance of each connection between any two block layers, and prunes the redundant connections. In addition, a novel reward-shaping trick is introduced to make DenseNet reach a better trade-off between accuracy and float point operations (FLOPs). Our experiments show that DenseNet with LWP is more compact and efficient than existing alternatives. 
", "keywords": "reinforcement learning;DenseNet;neural network compression", "primary_area": "", "supplementary_material": "", "author": "Xuanyang Zhang;Hao liu;Zhanxing Zhu;Zenglin Xu", "authorids": "xuanyang91.zhang@gmail.com;uestcliuhao@gmail.com;zhanxing.zhu@pku.edu.cn;zenglin@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2019learning,\ntitle={Learning to Search Efficient DenseNet with Layer-wise Pruning},\nauthor={Xuanyang Zhang and Hao liu and Zhanxing Zhu and Zenglin Xu},\nyear={2019},\nurl={https://openreview.net/forum?id=r1fWmnR5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1fWmnR5tm", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;5", "wc_review": "475;196;131", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 267.3333333333333, 149.22093984722414 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17073128841843051662&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "r1fiFs09YX", "title": "Sample-efficient policy learning in multi-agent Reinforcement Learning via meta-learning", "track": "main", "status": "Reject", "tldr": "Our work applies meta-learning to multi-agent Reinforcement Learning to help our agent efficiently adapted to new coming opponents.", "abstract": "To gain high rewards in muti-agent scenes, it is sometimes necessary to understand other agents and make corresponding optimal decisions. We can solve these tasks by first building models for other agents and then finding the optimal policy with these models. To get an accurate model, many observations are needed and this can be sample-inefficient. What's more, the learned model and policy can overfit to current agents and cannot generalize if the other agents are replaced by new agents. In many practical situations, each agent we face can be considered as a sample from a population with a fixed but unknown distribution. Thus we can treat the task against some specific agents as a task sampled from a task distribution. We apply meta-learning method to build models and learn policies. Therefore when new agents come, we can adapt to them efficiently. 
Experiments on grid games show that our method can quickly get high rewards.", "keywords": "Multi-agent;Reinforcement Learning;Meta-learning", "primary_area": "", "supplementary_material": "", "author": "Jialian Li;Hang Su;Jun Zhu", "authorids": "lijialian7@163.com;suhangss@mail.tsinghua.edu.cn;dcszj@mail.tsinghua.edu.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nli2019sampleefficient,\ntitle={Sample-efficient policy learning in multi-agent Reinforcement Learning via meta-learning},\nauthor={Jialian Li and Hang Su and Jun Zhu},\nyear={2019},\nurl={https://openreview.net/forum?id=r1fiFs09YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1fiFs09YX", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;3", "wc_review": "356;468;1236", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 686.6666666666666, 391.1191918357142 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9EnXLT6lDxQJ:scholar.google.com/&scioq=Sample-efficient+policy+learning+in+multi-agent+Reinforcement+Learning+via+meta-learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "r1g-TiC9FX", "title": "Neural Collobrative Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This paper presents a conceptually general and modularized neural collaborative network (NCN), which overcomes the limitations of the traditional convolutional neural networks (CNNs) in several aspects. Firstly, our NCN can directly handle non-Euclidean data without any pre-processing (e.g., graph normalizations) by defining a simple yet basic unit named neuron array for feature representation. Secondly, our NCN is capable of achieving both rotational equivariance and invariance properties via a simple yet powerful neuron collaboration mechanism, which imposes a ``glocal'' operation to capture both global and local information among neuron arrays within each layer. Thirdly, compared to the state-of-the-art networks that using large CNN kernels, our NCN with considerably fewer parameters can also achieve their strengths in feature learning by only exploiting highly efficient 1x1 convolution operations. Extensive experimental analyses on learning feature representation, handling novel viewpoints, and handling non-euclidean data demonstrate that our NCN can not only achieve state-of-the-art performance but also overcome the limitation of the conventional CNNs. 
The source codes will be released to facilite future researches after the review period for ensuring the anonymity.", "keywords": "deep learning;neural architecture search;collaboration representation learning", "primary_area": "", "supplementary_material": "", "author": "Guangrun Wang;Keze Wang;Liang Lin", "authorids": "wanggrun@mail2.sysu.edu.cn;kezewang@gmail.com;linliang@ieee.org", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=r1g-TiC9FX", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "r1g1LoAcFm", "title": "Using Ontologies To Improve Performance In Massively Multi-label Prediction", "track": "main", "status": "Reject", "tldr": " We propose a new method for using ontology information to improve performance on massively multi-label prediction/classification problems.", "abstract": "Massively multi-label prediction/classification problems arise in environments like health-care or biology where it is useful to make very precise predictions. One challenge with massively multi-label problems is that there is often a long-tailed frequency distribution for the labels, resulting in few positive examples for the rare labels. We propose a solution to this problem by modifying the output layer of a neural network to create a Bayesian network of sigmoids which takes advantage of ontology relationships between the labels to help share information between the rare and the more common labels. We apply this method to the two massively multi-label tasks of disease prediction (ICD-9 codes) and protein function prediction (Gene Ontology terms) and obtain significant improvements in per-label AUROC and average precision.", "keywords": "multi-label;Bayesian network;ontology", "primary_area": "", "supplementary_material": "", "author": "Ethan Steinberg;Peter J. Liu", "authorids": "ethan.steinberg@gmail.com;peterjliu@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsteinberg2019using,\ntitle={Using Ontologies To Improve Performance In Massively Multi-label Prediction},\nauthor={Ethan Steinberg and Peter J. 
Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=r1g1LoAcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1g1LoAcFm", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;3", "wc_review": "227;398;269", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "207;262;152", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 298.0, 72.75987905432498 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 207.0, 44.90731195102493 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CIzTcyWmeysJ:scholar.google.com/&scioq=Using+Ontologies+To+Improve+Performance+In+Massively+Multi-label+Prediction&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Characterizing Audio Adversarial Examples Using Temporal Dependency", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1113", "id": "r1g4E3C9t7", "author_site": "Zhuolin Yang, Bo Li, Pin-Yu Chen, Dawn Song", "tldr": "Adversarial audio discrimination using temporal dependency", "abstract": "Recent studies have highlighted adversarial examples as a ubiquitous threat to different neural network models and many downstream applications. Nonetheless, as unique data properties have inspired distinct and powerful learning principles, this paper aims to explore their potentials towards mitigating adversarial inputs. In particular, our results reveal the importance of using the temporal dependency in audio data to gain discriminate power against adversarial examples. Tested on the automatic speech recognition (ASR) tasks and three recent audio adversarial attacks, we find that (i) input transformation developed from image adversarial defense provides limited robustness improvement and is subtle to advanced attacks; (ii) temporal dependency can be exploited to gain discriminative power against audio adversarial examples and is resistant to adaptive attacks considered in our experiments. 
Our results not only show promising means of improving the robustness of ASR systems, but also offer novel insights in exploiting domain-specific data properties to mitigate negative effects of adversarial examples.", "keywords": "audio adversarial example;mitigation;detection;machine learning", "primary_area": "", "supplementary_material": "", "author": "Zhuolin Yang;Bo Li;Pin-Yu Chen;Dawn Song", "authorids": "lucas110550@sjtu.edu.cn;lxbosky@gmail.com;pin-yu.chen@ibm.com;dawnsong@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nyang2018characterizing,\ntitle={Characterizing Audio Adversarial Examples Using Temporal Dependency},\nauthor={Zhuolin Yang and Bo Li and Pin-Yu Chen and Dawn Song},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1g4E3C9t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;3", "wc_review": "332;236;531", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "88;281;213", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 366.3333333333333, 122.85583240349462 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 194.0, 79.92913528036361 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 217, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13178017338053441718&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=r1g4E3C9t7", "pdf": "https://openreview.net/pdf?id=r1g4E3C9t7", "email": ";;;", "author_num": 4 }, { "id": "r1g5Gh05KQ", "title": "Advanced Neuroevolution: A gradient-free algorithm to train Deep Neural Networks", "track": "main", "status": "Withdraw", "tldr": "A new algorithm to train deep neural networks. Tested on optimization functions and MNIST.", "abstract": "In this paper we present a novel optimization algorithm called Advanced Neuroevolution. The aim for this algorithm is to train deep neural networks, and eventually act as an alternative to Stochastic Gradient Descent (SGD) and its variants as needed.We evaluated our algorithm on the MNIST dataset, as well as on several global optimization problems such as the Ackley function. 
We find the algorithm performing relatively well for both cases, overtaking other global optimization algorithms such as Particle Swarm Optimization (PSO) and Evolution Strategies (ES).\n", "keywords": "Evolutionary Algorithm;Optimization;MNIST", "primary_area": "", "supplementary_material": "", "author": "Ahmed Aly;David Weikersdorfer;Claire Delaunay", "authorids": "aaa2cn@virginia.edu;dweikersdorfer@nvidia.com;cdelaunay@nvidia.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1g5Gh05KQ", "pdf_size": 0, "rating": "1;1;5", "confidence": "5;5;4", "wc_review": "298;697;1074", "wc_reply_reviewers": "308;407;0", "wc_reply_authors": "1040;813;1089", "reply_reviewers": "1;1;0", "reply_authors": "2;1;2", "rating_avg": [ 2.3333333333333335, 1.8856180831641267 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 689.6666666666666, 316.8431087392553 ], "wc_reply_reviewers_avg": [ 238.33333333333334, 173.3057670387483 ], "wc_reply_authors_avg": [ 980.6666666666666, 120.23403104871025 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:OoFu66TVeS0J:scholar.google.com/&scioq=Advanced+Neuroevolution:+A+gradient-free+algorithm+to+train+Deep+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "r1g5b2RcKm", "title": "MLPrune: Multi-Layer Pruning for Automated Neural Network Compression", "track": "main", "status": "Reject", "tldr": "MLPrune: an automated pruning method that doesn't require any tuning for per-layer compression ratio, achieves state-of-the-art pruning results on AlexNet and VGG16.", "abstract": "Model compression can significantly reduce the computation and memory footprint of large neural networks. To achieve a good trade-off between model size and accuracy, popular compression techniques usually rely on hand-crafted heuristics and\nrequire manually setting the compression ratio of each layer. This process is typically costly and suboptimal. In this paper, we propose a Multi-Layer Pruning method (MLPrune), which is theoretically sound, and can automatically decide appropriate compression ratios for all layers. Towards this goal, we use an efficient approximation of the Hessian as our pruning criterion, based on a Kronecker-factored Approximate Curvature method. We demonstrate the effectiveness of our method on several datasets and architectures, outperforming previous state-of-the-art by a large margin. Our experiments show that we can compress AlexNet and VGG16 by 25x without loss in accuracy on ImageNet. 
Furthermore, our method has much fewer hyper-parameters and requires no expert knowledge.", "keywords": "Automated Model Compression;Neural Network Pruning", "primary_area": "", "supplementary_material": "", "author": "Wenyuan Zeng;Raquel Urtasun", "authorids": "zengwenyuan1995@gmail.com;urtasun@uber.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzeng2019mlprune,\ntitle={{MLP}rune: Multi-Layer Pruning for Automated Neural Network Compression},\nauthor={Wenyuan Zeng and Raquel Urtasun},\nyear={2019},\nurl={https://openreview.net/forum?id=r1g5b2RcKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1g5b2RcKm", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;4", "wc_review": "448;369;239", "wc_reply_reviewers": "0;0;14", "wc_reply_authors": "443;775;236", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 352.0, 86.16650548018451 ], "wc_reply_reviewers_avg": [ 4.666666666666667, 6.599663291074443 ], "wc_reply_authors_avg": [ 484.6666666666667, 222.00950930584534 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5448543023361852467&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1g7y2RqYX", "title": "Label Propagation Networks", "track": "main", "status": "Reject", "tldr": "Neural net for graph-based semi-supervised learning; revisits the classics and propagates *labels* rather than feature representations", "abstract": "Graph networks have recently attracted considerable interest, and in particular in the context of semi-supervised learning. These methods typically work by generating node representations that are propagated throughout a given weighted graph.\n\nHere we argue that for semi-supervised learning, it is more natural to consider propagating labels in the graph instead. Towards this end, we propose a differentiable neural version of the classic Label Propagation (LP) algorithm. This formulation can be used for learning edge weights, unlike other methods where weights are set heuristically. 
Starting from a layer implementing a single iteration of LP, we proceed by adding several important non-linear steps that significantly enhance the label-propagating mechanism.\n\nExperiments in two distinct settings demonstrate the utility of our approach.\n", "keywords": "semi supervised learning;graph networks;deep learning architectures", "primary_area": "", "supplementary_material": "", "author": "Kojin Oshiba;Nir Rosenfeld;Amir Globerson", "authorids": "kojinoshiba@college.harvard.edu;nirr@g.harvard.edu;amir.globerson@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\noshiba2019label,\ntitle={Label Propagation Networks},\nauthor={Kojin Oshiba and Nir Rosenfeld and Amir Globerson},\nyear={2019},\nurl={https://openreview.net/forum?id=r1g7y2RqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1g7y2RqYX", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;2;4", "wc_review": "572;216;340", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "740;265;593", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 376.0, 147.5488619633058 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 532.6666666666666, 198.5553379343457 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Equi-normalization of Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/941", "id": "r1gEqiC9FX", "author_site": "Pierre Stock, Benjamin Graham, R\u00e9mi Gribonval, Herv\u00e9 J\u00e9gou", "tldr": "Fast iterative algorithm to balance the energy of a network while staying in the same functional equivalence class", "abstract": "Modern neural networks are over-parametrized. In particular, each rectified linear hidden unit can be modified by a multiplicative factor by adjusting input and out- put weights, without changing the rest of the network. Inspired by the Sinkhorn-Knopp algorithm, we introduce a fast iterative method for minimizing the l2 norm of the weights, equivalently the weight decay regularizer. It provably converges to a unique solution. Interleaving our algorithm with SGD during training improves the test accuracy. 
For small batches, our approach offers an alternative to batch- and group- normalization on CIFAR-10 and ImageNet with a ResNet-18.", "keywords": "convolutional neural networks;Normalization;Sinkhorn;Regularization", "primary_area": "", "supplementary_material": "", "author": "Pierre Stock;Benjamin Graham;R\u00e9mi Gribonval;Herv\u00e9 J\u00e9gou", "authorids": "pstock@fb.com;benjamingraham@fb.com;remi.gribonval@inria.fr;rvj@fb.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nstock2018equinormalization,\ntitle={Equi-normalization of Neural Networks},\nauthor={Pierre Stock and Benjamin Graham and R\u00e9mi Gribonval and Herv\u00e9 J\u00e9gou},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gEqiC9FX},\n}", "github": "[![github](/images/github_icon.svg) facebookresearch/enorm](https://github.com/facebookresearch/enorm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;4;3", "wc_review": "486;913;268", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "571;713;610", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 555.6666666666666, 267.88845108033723 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 631.3333333333334, 59.901771444776344 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=r1gEqiC9FX", "pdf": "https://openreview.net/pdf?id=r1gEqiC9FX", "email": ";;;", "author_num": 4 }, { "id": "r1gGpjActQ", "title": "Hint-based Training for Non-Autoregressive Translation", "track": "main", "status": "Reject", "tldr": "We develop a training algorithm for non-autoregressive machine translation models, achieving comparable accuracy to strong autoregressive baselines, but one order of magnitude faster in inference. ", "abstract": "Machine translation is an important real-world application, and neural network-based AutoRegressive Translation (ART) models have achieved very promising accuracy. Due to the unparallelizable nature of the autoregressive factorization, ART models have to generate tokens one by one during decoding and thus suffer from high inference latency. Recently, Non-AutoRegressive Translation (NART) models were proposed to reduce the inference time. However, they could only achieve inferior accuracy compared with ART models. To improve the accuracy of NART models, in this paper, we propose to leverage the hints from a well-trained ART model to train the NART model. We define two hints for the machine translation task: hints from hidden states and hints from word alignments, and use such hints to regularize the optimization of NART models. Experimental results show that the NART model trained with hints could achieve significantly better translation performance than previous NART models on several tasks. In particular, for the WMT14 En-De and De-En task, we obtain BLEU scores of 25.20 and 29.52 respectively, which largely outperforms the previous non-autoregressive baselines. 
It is even comparable to a strong LSTM-based ART model (24.60 on WMT14 En-De), but one order of magnitude faster in inference.", "keywords": "Natural Language Processing;Machine Translation;Non-Autoregressive Model", "primary_area": "", "supplementary_material": "", "author": "Zhuohan Li;Di He;Fei Tian;Tao Qin;Liwei Wang;Tie-Yan Liu", "authorids": "lizhuohan@pku.edu.cn;dihe@microsoft.com;fetia@microsoft.com;taoqin@microsoft.com;wanglw@cis.pku.edu.cn;tyliu@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nli2019hintbased,\ntitle={Hint-based Training for Non-Autoregressive Translation},\nauthor={Zhuohan Li and Di He and Fei Tian and Tao Qin and Liwei Wang and Tie-Yan Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gGpjActQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1gGpjActQ", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;4", "wc_review": "725;228;254", "wc_reply_reviewers": "352;0;0", "wc_reply_authors": "1723;161;243", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 402.3333333333333, 228.40655760191203 ], "wc_reply_reviewers_avg": [ 117.33333333333333, 165.93439131844315 ], "wc_reply_authors_avg": [ 709.0, 717.7873408375677 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=855462924192062420&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1gKNs0qYX", "title": "Filter Training and Maximum Response: Classification via Discerning", "track": "main", "status": "Reject", "tldr": "The proposed scheme mimics the classification process mediated by a series of one component picking.", "abstract": "This report introduces a training and recognition scheme, in which classification is realized via class-wise discerning. Trained with datasets whose labels are randomly shuffled except for one class of interest, a neural network learns class-wise parameter values, and remolds itself from a feature sorter into feature filters, each of which discerns objects belonging to one of the classes only. Classification of an input can be inferred from the maximum response of the filters. A multiple check with multiple versions of filters can diminish fluctuation and yields better performance. This scheme of discerning, maximum response and multiple check is a method of general viability to improve performance of feedforward networks, and the filter training itself is a promising feature abstraction procedure. 
In contrast to the direct sorting, the scheme mimics the classification process mediated by a series of one component picking.", "keywords": "filter training;maximum response;multiple check;ensemble learning", "primary_area": "", "supplementary_material": "", "author": "Lei Gu", "authorids": "gul2@uci.edu", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ngu2019filter,\ntitle={Filter Training and Maximum Response: Classification via Discerning},\nauthor={Lei Gu},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gKNs0qYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1gKNs0qYX", "pdf_size": 0, "rating": "2;3;6", "confidence": "1;4;3", "wc_review": "14;257;119", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 1.699673171197595 ], "confidence_avg": [ 2.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 130.0, 99.508793581271 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.4193139346887672, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bh2sg4EgPD0J:scholar.google.com/&scioq=Filter+Training+and+Maximum+Response:+Classification+via+Discerning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Generalized Tensor Models for Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/748", "id": "r1gNni0qtm", "author_site": "Valentin Khrulkov, Oleksii Hrinchuk, Ivan Oseledets", "tldr": "Analysis of expressivity and generality of recurrent neural networks with ReLu nonlinearities using Tensor-Train decomposition.", "abstract": "Recurrent Neural Networks (RNNs) are very successful at solving challenging problems with sequential data. However, this observed efficiency is not yet entirely explained by theory. It is known that a certain class of multiplicative RNNs enjoys the property of depth efficiency --- a shallow network of exponentially large width is necessary to realize the same score function as computed by such an RNN. Such networks, however, are not very often applied to real life tasks. In this work, we attempt to reduce the gap between theory and practice by extending the theoretical analysis to RNNs which employ various nonlinearities, such as Rectified Linear Unit (ReLU), and show that they also benefit from properties of universality and depth efficiency. 
Our theoretical results are verified by a series of extensive computational experiments.", "keywords": "expressive power;recurrent neural networks;Tensor-Train decomposition", "primary_area": "", "supplementary_material": "", "author": "Valentin Khrulkov;Oleksii Hrinchuk;Ivan Oseledets", "authorids": "khrulkov.v@gmail.com;oleksii.hrinchuk@skoltech.ru;i.oseledets@skoltech.ru", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkhrulkov2018generalized,\ntitle={Generalized Tensor Models for Recurrent Neural Networks},\nauthor={Valentin Khrulkov and Oleksii Hrinchuk and Ivan Oseledets},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gNni0qtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;3", "wc_review": "250;665;273", "wc_reply_reviewers": "25;11;0", "wc_reply_authors": "425;421;297", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 396.0, 190.44334240573144 ], "wc_reply_reviewers_avg": [ 12.0, 10.23067283548187 ], "wc_reply_authors_avg": [ 381.0, 59.41941321375251 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16851245394902842047&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=r1gNni0qtm", "pdf": "https://openreview.net/pdf?id=r1gNni0qtm", "email": ";;", "author_num": 3 }, { "id": "r1gOe209t7", "title": "Reconciling Feature-Reuse and Overfitting in DenseNet with Specialized Dropout", "track": "main", "status": "Reject", "tldr": "Realizing the drawbacks when applying original dropout on DenseNet, we craft the design of dropout method from three aspects, the idea of which could also be applied on other CNN models.", "abstract": "Recently convolutional neural networks (CNNs) achieve great accuracy in visual recognition tasks. DenseNet becomes one of the most popular CNN models due to its effectiveness in feature-reuse. However, like other CNN models, DenseNets also face overfitting problem if not severer. Existing dropout method can be applied but not as effective due to the introduced nonlinear connections. In particular, the property of feature-reuse in DenseNet will be impeded, and the dropout effect will be weakened by the spatial correlation inside feature maps. To address these problems, we craft the design of a specialized dropout method from three aspects, dropout location, dropout granularity, and dropout probability. The insights attained here could potentially be applied as a general approach for boosting the accuracy of other CNN models with similar nonlinear connections. 
Experimental results show that DenseNets with our specialized dropout method yield better accuracy compared to vanilla DenseNet and state-of-the-art CNN models, and such accuracy boost increases with the model depth.", "keywords": "Specialized dropout;computer vision", "primary_area": "", "supplementary_material": "", "author": "Kun Wan;Boyuan Feng;Lingwei Xie;Yufei Ding", "authorids": "kun@cs.ucsb.edu;boyuan@cs.ucsb.edu;xielingwei@stu.xmu.edu.cn;yufeiding@cs.ucsb.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwan2019reconciling,\ntitle={Reconciling Feature-Reuse and Overfitting in DenseNet with Specialized Dropout},\nauthor={Kun Wan and Boyuan Feng and Lingwei Xie and Yufei Ding},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gOe209t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1gOe209t7", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;4;4", "wc_review": "267;375;638", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 426.6666666666667, 155.80400793589646 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16673580188578351083&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "r1gR2sC9FX", "title": "On the Spectral Bias of Neural Networks", "track": "main", "status": "Reject", "tldr": "We investigate ReLU networks in the Fourier domain and demonstrate peculiar behaviour.", "abstract": "Neural networks are known to be a class of highly expressive functions able to fit even random input-output mappings with 100% accuracy. In this work we present properties of neural networks that complement this aspect of expressivity. By using tools from Fourier analysis, we show that deep ReLU networks are biased towards low frequency functions, meaning that they cannot have local fluctuations without affecting their global behavior. Intuitively, this property is in line with the observation that over-parameterized networks find simple patterns that generalize across data samples. We also investigate how the shape of the data manifold affects expressivity by showing evidence that learning high frequencies gets easier with increasing manifold complexity, and present a theoretical understanding of this behavior. 
Finally, we study the robustness of the frequency components with respect to parameter perturbation, to develop the intuition that the parameters must be finely tuned to express high frequency functions.", "keywords": "deep learning theory;fourier analysis", "primary_area": "", "supplementary_material": "", "author": "Nasim Rahaman;Aristide Baratin;Devansh Arpit;Felix Draxler;Min Lin;Fred Hamprecht;Yoshua Bengio;Aaron Courville", "authorids": ";aristidebaratin@hotmail.com;devansharpit@gmail.com;;;;yoshua.umontreal@gmail.com;aaron.courville@umontreal.ca", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nrahaman2019on,\ntitle={On the Spectral Bias of Neural Networks},\nauthor={Nasim Rahaman and Aristide Baratin and Devansh Arpit and Felix Draxler and Min Lin and Fred Hamprecht and Yoshua Bengio and Aaron Courville},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gR2sC9FX},\n}", "github": "[![github](/images/github_icon.svg) nasimrahaman/SpectralBias](https://github.com/nasimrahaman/SpectralBias) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=r1gR2sC9FX)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1gR2sC9FX", "pdf_size": 0, "rating": "4;5;6;6", "confidence": "4;3;3;3", "wc_review": "698;276;443;520", "wc_reply_reviewers": "291;0;59;92", "wc_reply_authors": "2266;683;365;490", "reply_reviewers": "2;0;1;1", "reply_authors": "4;1;1;1", "rating_avg": [ 5.25, 0.82915619758885 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "wc_review_avg": [ 484.25, 151.6877961472181 ], "wc_reply_reviewers_avg": [ 110.5, 109.29890209878597 ], "wc_reply_authors_avg": [ 951.0, 767.6206745522165 ], "reply_reviewers_avg": [ 1.0, 0.7071067811865476 ], "reply_authors_avg": [ 1.75, 1.299038105676658 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": -0.8703882797784891, "gs_citation": 1815, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6023723620228240592&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11 }, { "id": "r1gRCiA5Ym", "title": "Jumpout: Improved Dropout for Deep Neural Networks with Rectified Linear Units", "track": "main", "status": "Reject", "tldr": "Jumpout applies three simple yet effective modifications to dropout, based on novel understandings about the generalization performance of DNN with ReLU in local regions.", "abstract": "Dropout is a simple yet effective technique to improve generalization performance and prevent overfitting in deep neural networks (DNNs). In this paper, we discuss three novel observations about dropout to better understand the generalization of DNNs with rectified linear unit (ReLU) activations: 1) dropout is a smoothing technique that encourages each local linear model of a DNN to be trained on data points from nearby regions; 2) a constant dropout rate can result in effective neural-deactivation rates that are significantly different for layers with different fractions of activated neurons; and 3) the rescaling factor of dropout causes an inconsistency to occur between the normalization during training and testing conditions when batch normalization is also used. 
The above leads to three simple but nontrivial improvements to dropout resulting in our proposed method \"Jumpout.\" Jumpout samples the dropout rate using a monotone decreasing distribution (such as the right part of a truncated Gaussian), so the local linear model at each data point is trained, with high probability, to work better for data points from nearby than from more distant regions. Instead of tuning a dropout rate for each layer and applying it to all samples, jumpout moreover adaptively normalizes the dropout rate at each layer and every training sample/batch, so the effective dropout rate applied to the activated neurons are kept the same. Moreover, we rescale the outputs of jumpout for a better trade-off that keeps both the variance and mean of neurons more consistent between training and test phases, which mitigates the incompatibility between dropout and batch normalization. Compared to the original dropout, jumpout shows significantly improved performance on CIFAR10, CIFAR100, Fashion- MNIST, STL10, SVHN, ImageNet-1k, etc., while introducing negligible additional memory and computation costs.", "keywords": "Dropout;deep neural networks with ReLU;local linear model", "primary_area": "", "supplementary_material": "", "author": "Shengjie Wang;Tianyi Zhou;Jeff Bilmes", "authorids": "tianyi.david.zhou@gmail.com;tianyi.david.zhou@gmail.com;tianyi.david.zhou@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwang2019jumpout,\ntitle={Jumpout: Improved Dropout for Deep Neural Networks with Rectified Linear Units},\nauthor={Shengjie Wang and Tianyi Zhou and Jeff Bilmes},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gRCiA5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1gRCiA5Ym", "pdf_size": 0, "rating": "4;4;5", "confidence": "5;3;4", "wc_review": "427;164;297", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "520;295;850", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 296.0, 107.37162877905256 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 555.0, 227.92542640082962 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11469935724562706978&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "r1gVqsA9tQ", "title": "ChainGAN: A sequential approach to GANs", "track": "main", "status": "Reject", "tldr": "Multistep generation process for GANs", "abstract": "We propose a new architecture and training methodology for generative adversarial networks. Current approaches attempt to learn the transformation from a noise sample to a generated data sample in one shot. Our proposed generator architecture, called ChainGAN, uses a two-step process. It first attempts to transform a noise vector into a crude sample, similar to a traditional generator. Next, a chain of networks, called editors, attempt to sequentially enhance this sample. We train each of these units independently, instead of with end-to-end backpropagation on the entire chain. 
Our model is robust, efficient, and flexible as we can apply it to various network architectures. We provide rationale for our choices and experimentally evaluate our model, achieving competitive results on several datasets.", "keywords": "Machine Learning;Sequential Models;GANs", "primary_area": "", "supplementary_material": "", "author": "Safwan Hossain;Kiarash Jamali;Yuchen Li;Frank Rudzicz", "authorids": "safwan.hossain@mail.utoronto.ca;kiarash.jamali@mail.utoronto.ca;ychnlgy.li@utoronto.ca;frank@spoclab.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhossain2019chaingan,\ntitle={Chain{GAN}: A sequential approach to {GAN}s},\nauthor={Safwan Hossain and Kiarash Jamali and Yuchen Li and Frank Rudzicz},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gVqsA9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1gVqsA9tQ", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "wc_review": "899;184;307", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 463.3333333333333, 312.12853477729686 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2205034357175300383&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "r1ge8sCqFX", "title": "An Exhaustive Analysis of Lazy vs. Eager Learning Methods for Real-Estate Property Investment", "track": "main", "status": "Reject", "tldr": "", "abstract": "Accurate rent prediction in real estate investment can help in generating capital gains and guaranty a financial success. In this paper, we carry out a comprehensive analysis and study of eleven machine learning algorithms for rent prediction, including Linear Regression, Multilayer Perceptron, Random Forest, KNN, ML-KNN, Locally Weighted Learning, SMO, SVM, J48, lazy Decision Tree (i.e., lazy DT), and KStar algorithms. \nOur contribution in this paper is twofold: (1) We present a comprehensive analysis of internal and external attributes of a real-estate housing dataset and their correlation with rental prices. (2) We use rental prediction as a platform to study and compare the performance of eager vs. lazy machine learning methods using myriad of ML algorithms. \nWe train our rent prediction models using a Zillow data set of 4K real estate properties in Virginia State of the US, including three house types of single-family, townhouse, and condo. Each data instance in the dataset has 21 internal attributes (e.g., area space, price, number of bed/bath, rent, school rating, so forth). In addition to Zillow data, external attributes like walk/transit score, and crime rate are collected from online data sources. A subset of the collected features - determined by the PCA technique- are selected to tune the parameters of the prediction models. We employ a hierarchical clustering approach to cluster the data based on two factors of house type, and average rent estimate of zip codes. 
We evaluate and compare the efficacy of the tuned prediction models based on two metrics of R-squared and Mean Absolute Error, applied on unseen data. Based on our study, lazy models like KStar lead to higher accuracy and lower prediction error compared to eager methods like J48 and LR. However, it is not necessarily found to be an overarching conclusion drawn from the comparison between all the lazy and eager methods in this work. ", "keywords": "applied machine learning;housing analytics;eager learning;lazy learning;rent prediction", "primary_area": "", "supplementary_material": "", "author": "Setareh Rafatirad;Maryam Heidari", "authorids": "srafatir@gmu.edu;mheidari@gmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nrafatirad2019an,\ntitle={An Exhaustive Analysis of Lazy vs. Eager Learning Methods for Real-Estate Property Investment},\nauthor={Setareh Rafatirad and Maryam Heidari},\nyear={2019},\nurl={https://openreview.net/forum?id=r1ge8sCqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1ge8sCqFX", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;5;4", "wc_review": "219;177;213", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 203.0, 18.547236990991408 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4454820190242835882&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "r1gkAoA5FQ", "title": "A bird's eye view on coherence, and a worm's eye view on cohesion", "track": "main", "status": "Reject", "tldr": "We encode linguistic properties, such as, coherence and cohesion, into expert discriminators and improve text generation.", "abstract": "Generating coherent and cohesive long-form texts is a challenging problem in natural language generation. Previous works relied on a large amount of human-generated texts to train neural language models, however, few attempted to explicitly model the desired linguistic properties of natural language text, such as coherence and cohesion using neural networks. In this work, we train two expert discriminators for coherence and cohesion to provide hierarchical feedback for text generation. We also propose a simple variant of policy gradient, called 'negative-critical sequence training' in which the reward 'baseline' is constructed from randomly generated negative samples. We demonstrate the effectiveness of our approach through empirical studies, showing improvements over the strong baseline -- attention-based bidirectional MLE-trained neural language model -- in a number of automated metrics. 
The proposed model can serve as baseline architectures to promote further research in modeling additional linguistic properties for downstream NLP tasks.", "keywords": "text generation;natural language processing;neural language model", "primary_area": "", "supplementary_material": "", "author": "Woon Sang Cho;Pengchuan Zhang;Yizhe Zhang;Xiujun Li;Mengdi Wang;Jianfeng Gao", "authorids": "woonsang@princeton.edu;penzhan@microsoft.com;yizhe.zhang@microsoft.com;xiul@microsoft.com;mengdiw@princeton.edu;jfgao@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ncho2019a,\ntitle={A bird's eye view on coherence, and a worm's eye view on cohesion},\nauthor={Woon Sang Cho and Pengchuan Zhang and Yizhe Zhang and Xiujun Li and Mengdi Wang and Jianfeng Gao},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gkAoA5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1gkAoA5FQ", "pdf_size": 0, "rating": "2;2;4", "confidence": "4;4;4", "wc_review": "766;419;447", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1138;353;437", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 2.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 544.0, 157.3933501348347 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 642.6666666666666, 351.9283386650691 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7842660773591531555&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1gl7hC5Km", "title": "Adapting Auxiliary Losses Using Gradient Similarity", "track": "main", "status": "Reject", "tldr": "Auxiliary tasks need to match the main task to improve learning; we propose to use cosine distance between gradients of an unknown auxiliary task to protect from negative interference with learning the main task.", "abstract": "One approach to deal with the statistical inefficiency of neural networks is to rely on auxiliary losses that help to build useful representations. However, it is not always trivial to know if an auxiliary task will be helpful for the main task and when it could start hurting. We propose to use the cosine similarity between gradients of tasks as an adaptive weight to detect when an auxiliary loss is helpful to the main loss. We show that our approach is guaranteed to converge to critical points of the main task and demonstrate the practical usefulness of the proposed algorithm in a few domains: multi-task supervised learning on subsets of ImageNet, reinforcement learning on gridworld, and reinforcement learning on Atari games.", "keywords": "auxiliary losses;transfer learning;task similarity;deep learning;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Yunshu Du;Wojciech M. Czarnecki;Siddhant M. 
Jayakumar;Razvan Pascanu;Balaji Lakshminarayanan", "authorids": "yunshu.du@wsu.edu;lejlot@google.com;sidmj@google.com;razp@google.com;balajiln@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ndu2019adapting,\ntitle={Adapting Auxiliary Losses Using Gradient Similarity},\nauthor={Yunshu Du and Wojciech M. Czarnecki and Siddhant M. Jayakumar and Razvan Pascanu and Balaji Lakshminarayanan},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gl7hC5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1gl7hC5Km", "pdf_size": 0, "rating": "4;6;6", "confidence": "5;3;4", "wc_review": "826;407;458", "wc_reply_reviewers": "396;0;0", "wc_reply_authors": "2862;318;574", "reply_reviewers": "3;0;0", "reply_authors": "5;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 563.6666666666666, 186.662499953496 ], "wc_reply_reviewers_avg": [ 132.0, 186.67619023324855 ], "wc_reply_authors_avg": [ 1251.3333333333333, 1143.698483964293 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 180, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6599446382169807720&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "r1glehC5tQ", "title": "Distinguishability of Adversarial Examples", "track": "main", "status": "Reject", "tldr": "We propose a defensive distinction protection approach and demonstrate the strong distinguishability of adversarial examples.", "abstract": "Machine learning models including traditional models and neural networks can be easily fooled by adversarial examples which are generated from the natural examples with small perturbations. This poses a critical challenge to machine learning security, and impedes the wide application of machine learning in many important domains such as computer vision and malware detection. Unfortunately, even state-of-the-art defense approaches such as adversarial training and defensive distillation still suffer from major limitations and can be circumvented. From a unique angle, we propose to investigate two important research questions in this paper: Are adversarial examples distinguishable from natural examples? Are adversarial examples generated by different methods distinguishable from each other? These two questions concern the distinguishability of adversarial examples. Answering them will potentially lead to a simple yet effective approach, termed as defensive distinction in this paper under the formulation of multi-label classification, for protecting against adversarial examples. We design and perform experiments using the MNIST dataset to investigate these two questions, and obtain highly positive results demonstrating the strong distinguishability of adversarial examples. 
We recommend that this unique defensive distinction approach should be seriously considered to complement other defense approaches.", "keywords": "Adversarial Examples;Machine Learning;Neural Networks;Distinguishability;Defense", "primary_area": "", "supplementary_material": "", "author": "Yi Qin;Ryan Hunt;Chuan Yue", "authorids": "yiqin@mines.edu;ryhunt@mines.edu;chuanyue@mines.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nqin2019distinguishability,\ntitle={Distinguishability of Adversarial Examples},\nauthor={Yi Qin and Ryan Hunt and Chuan Yue},\nyear={2019},\nurl={https://openreview.net/forum?id=r1glehC5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=r1glehC5tQ", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;4", "wc_review": "177;281;560", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 339.3333333333333, 161.70824207675858 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7592219893912787305&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "r1gnQ20qYX", "title": "Pearl: Prototype lEArning via Rule Lists", "track": "main", "status": "Reject", "tldr": "a method combining rule list learning and prototype learning ", "abstract": "Deep neural networks have demonstrated promising prediction and classification performance on many healthcare applications. However, the interpretability of those models are often lacking. On the other hand, classical interpretable models such as rule lists or decision trees do not lead to the same level of accuracy as deep neural networks and can often be too complex to interpret (due to the potentially large depth of rule lists). In this work, we present PEARL, Prototype lEArning via Rule Lists, which iteratively uses rule lists to guide a neural network to learn representative data prototypes. The resulting prototype neural network provides accurate prediction, and the prediction can be easily explained by prototype and its guiding rule lists. Thanks to the prediction power of neural networks, the rule lists from\t\t\t\t prototypes are more concise and hence provide better interpretability. On two real-world electronic healthcare records (EHR) datasets, PEARL consistently outperforms all baselines across both datasets, especially achieving performance improvement over conventional rule learning by up to 28% and over prototype learning by up to 3%. 
Experimental results also show the resulting interpretation of PEARL is simpler than the standard rule learning.", "keywords": "rule list learning;prototype learning;interpretability;healthcare", "primary_area": "", "supplementary_material": "", "author": "Tianfan Fu*;Tian Gao*;Cao Xiao*;Tengfei Ma*;Jimeng Sun", "authorids": "tfu42@gatech.edu;tgao@us.ibm.com;cxiao@us.ibm.com;tengfei.ma1@ibm.com;jsun@cc.gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nfu*2019pearl,\ntitle={Pearl: Prototype l{EA}rning via Rule Lists},\nauthor={Tianfan Fu* and Tian Gao* and Cao Xiao* and Tengfei Ma* and Jimeng Sun},\nyear={2019},\nurl={https://openreview.net/forum?id=r1gnQ20qYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1gnQ20qYX", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;4;4", "wc_review": "332;1653;375", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "258;634;379", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 786.6666666666666, 612.8416507458422 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 423.6666666666667, 156.71701318689756 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Lmbcnc5WF-EJ:scholar.google.com/&scioq=Pearl:+Prototype+lEArning+via+Rule+Lists&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "r1l-e3Cqtm", "title": "Deep Probabilistic Video Compression", "track": "main", "status": "Reject", "tldr": "Deep Probabilistic Video Compression Via Sequential Variational Autoencoders", "abstract": "We propose a variational inference approach to deep probabilistic video compression. Our model uses advances in variational autoencoders (VAEs) for sequential data and combines it with recent work on neural image compression. The approach jointly learns to transform the original video into a lower-dimensional representation as well as to entropy code this representation according to a temporally-conditioned probabilistic model. We split the latent space into local (per frame) and global (per segment) variables, and show that training the VAE to utilize both representations leads to an improved rate-distortion performance. Evaluation on small videos from public data sets with varying complexity and diversity show that our model yields competitive results when trained on generic video content. 
Extreme compression performance is achieved for videos with specialized content if the model is trained on similar videos.", "keywords": "variational inference;video compression;deep generative models", "primary_area": "", "supplementary_material": "", "author": "Jun Han;Salvator Lombardo;Christopher Schroers;Stephan Mandt", "authorids": "jun.han.gr@dartmouth.edu;sal.lombardo@disneyresearch.com;christopher.schroers@disneyresearch.com;stephan.mandt@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhan2019deep,\ntitle={Deep Probabilistic Video Compression},\nauthor={Jun Han and Salvator Lombardo and Christopher Schroers and Stephan Mandt},\nyear={2019},\nurl={https://openreview.net/forum?id=r1l-e3Cqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1l-e3Cqtm", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;5;5", "wc_review": "569;465;270", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "915;868;806", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 434.6666666666667, 123.93636359931746 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 863.0, 44.63929509598765 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=852539572587072629&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "r1l3NiCqY7", "title": "Lipschitz regularized Deep Neural Networks generalize", "track": "main", "status": "Reject", "tldr": "We prove generalization of DNNs by adding a Lipschitz regularization term to the training loss. We resolve a question posed in Zhang et al. (2016).", "abstract": "We show that if the usual training loss is augmented by a Lipschitz regularization term, then the networks generalize. We prove generalization by first establishing a stronger convergence result, along with a rate of convergence. A second result resolves a question posed in Zhang et al. (2016): how can a model distinguish between the case of clean labels, and randomized labels? Our answer is that Lipschitz regularization using the Lipschitz constant of the clean data makes this distinction. In this case, the model learns a different function which we hypothesize correctly fails to learn the dirty labels. ", "keywords": "Deep Neural Networks;Regularization;Generalization;Convergence;Lipschitz;Stability", "primary_area": "", "supplementary_material": "", "author": "Adam M. Oberman;Jeff Calder", "authorids": "adam.oberman@mcgill.ca;jcalder@umn.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\noberman2019lipschitz,\ntitle={Lipschitz regularized Deep Neural Networks generalize},\nauthor={Adam M. 
Oberman and Jeff Calder},\nyear={2019},\nurl={https://openreview.net/forum?id=r1l3NiCqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1l3NiCqY7", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;2;4", "wc_review": "574;418;840", "wc_reply_reviewers": "250;0;0", "wc_reply_authors": "1204;730;929", "reply_reviewers": "1;0;0", "reply_authors": "3;1;2", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 610.6666666666666, 174.22080498289776 ], "wc_reply_reviewers_avg": [ 83.33333333333333, 117.85113019775793 ], "wc_reply_authors_avg": [ 954.3333333333334, 194.33704970717469 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.3273268353539886, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10501481411228791702&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Wizard of Wikipedia: Knowledge-Powered Conversational Agents", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/739", "id": "r1l73iRqKm", "author_site": "Emily Dinan, Stephen Roller, Kurt Shuster, Angela Fan, Michael Auli, Jason Weston", "tldr": "We build knowledgeable conversational agents by conditioning on Wikipedia + a new supervised task.", "abstract": "In open-domain dialogue intelligent agents should exhibit the use of knowledge, however there are few convincing demonstrations of this to date. The most popular sequence to sequence models typically \u201cgenerate and hope\u201d generic utterances that can be memorized in the weights of the model when mapping from input utterance(s) to output, rather than employing recalled knowledge as context. Use of knowledge has so far proved difficult, in part because of the lack of a supervised learning benchmark task which exhibits knowledgeable open dialogue with clear grounding. To that end we collect and release a large dataset with conversations directly grounded with knowledge retrieved from Wikipedia. We then design architectures capable of retrieving knowledge, reading and conditioning on it, and finally generating natural responses. 
Our best performing dialogue models are able to conduct knowledgeable discussions on open-domain topics as evaluated by automatic metrics and human evaluations, while our new benchmark allows for measuring further improvements in this important research direction.", "keywords": "dialogue;knowledge;language;conversation", "primary_area": "", "supplementary_material": "", "author": "Emily Dinan;Stephen Roller;Kurt Shuster;Angela Fan;Michael Auli;Jason Weston", "authorids": "edinan@fb.com;roller@fb.com;kshuster@fb.com;angelafan@fb.com;michaelauli@fb.com;jase@fb.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ndinan2018wizard,\ntitle={Wizard of Wikipedia: Knowledge-Powered Conversational Agents},\nauthor={Emily Dinan and Stephen Roller and Kurt Shuster and Angela Fan and Michael Auli and Jason Weston},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1l73iRqKm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=r1l73iRqKm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "5;4;4", "wc_review": "710;228;710", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1709;255;1771", "reply_reviewers": "0;0;0", "reply_authors": "3;1;3", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 549.3333333333334, 227.2169790212773 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1245.0, 700.4931596144723 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1074, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10143779580305681961&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=r1l73iRqKm", "pdf": "https://openreview.net/pdf?id=r1l73iRqKm", "email": ";;;;;", "author_num": 6 }, { "id": "r1l9Nj09YQ", "title": "Towards Language Agnostic Universal Representations", "track": "main", "status": "Reject", "tldr": "By taking inspiration from linguistics, specifically the Universal Grammar hypothesis, we learn language agnostic universal representations which we can utilize to do zero-shot learning across languages.", "abstract": "When a bilingual student learns to solve word problems in math, we expect the student to be able to solve these problem in both languages the student is fluent in, even if the math lessons were only taught in one language. However, current representations in machine learning are language dependent. In this work, we present a method to decouple the language from the problem by learning language agnostic representations and therefore allowing training a model in one language and applying to a different one in a zero shot fashion. We learn these representations by taking inspiration from linguistics, specifically the Universal Grammar hypothesis and learn universal latent representations that are language agnostic (Chomsky, 2014; Montague, 1970). 
We demonstrate the capabilities of these representations by showing that the models trained on a single language using language agnostic representations achieve very similar accuracies in other languages.", "keywords": "universal representations;language agnostic representations;NLP;GAN", "primary_area": "", "supplementary_material": "", "author": "Armen Aghajanyan;Xia Song;Saurabh Tiwary", "authorids": "araghaja@microsoft.com;xiaso@microsoft.com;satiwary@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\naghajanyan2019towards,\ntitle={Towards Language Agnostic Universal Representations},\nauthor={Armen Aghajanyan and Xia Song and Saurabh Tiwary},\nyear={2019},\nurl={https://openreview.net/forum?id=r1l9Nj09YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1l9Nj09YQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "wc_review": "401;449;324", "wc_reply_reviewers": "0;189;149", "wc_reply_authors": "49;362;551", "reply_reviewers": "0;1;1", "reply_authors": "1;2;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 391.3333333333333, 51.48678363317026 ], "wc_reply_reviewers_avg": [ 112.66666666666667, 81.32376992955722 ], "wc_reply_authors_avg": [ 320.6666666666667, 207.0142238806041 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3501693987514731386&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "r1lFIiR9tQ", "title": "Training generative latent models by variational f-divergence minimization", "track": "main", "status": "Reject", "tldr": "Training generative models using an upper bound of the f divergence.", "abstract": "Probabilistic models are often trained by maximum likelihood, which corresponds to minimizing a specific form of f-divergence between the model and data distribution. We derive an upper bound that holds for all f-divergences, showing the intuitive result that the divergence between two joint distributions is at least as great as the divergence between their corresponding marginals. Additionally, the f-divergence is not formally defined when two distributions have different supports. We thus propose a noisy version of f-divergence which is well defined in such situations. 
We demonstrate how the bound and the new version of f-divergence can be readily used to train complex probabilistic generative models of data and that the fitted model can depend significantly on the particular divergence used.", "keywords": "variational inference;generative model;f divergence", "primary_area": "", "supplementary_material": "", "author": "Mingtian Zhang;Thomas Bird;Raza Habib;Tianlin Xu;David Barber", "authorids": "mingtian.zhang.17@ucl.ac.uk;thomas.bird@cs.ucl.ac.uk;raza.habib@cs.ucl.ac.uk;t.xu12@lse.ac.uk;david.barber@ucl.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nzhang2019training,\ntitle={Training generative latent models by variational f-divergence minimization},\nauthor={Mingtian Zhang and Thomas Bird and Raza Habib and Tianlin Xu and David Barber},\nyear={2019},\nurl={https://openreview.net/forum?id=r1lFIiR9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1lFIiR9tQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;3", "wc_review": "361;247;330", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "495;429;416", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 312.6666666666667, 48.12714641678044 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 446.6666666666667, 34.58644564308715 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11852530288463840768&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "r1lFYoRcFm", "title": "Quantile Regression Reinforcement Learning with State Aligned Vector Rewards", "track": "main", "status": "Withdraw", "tldr": "We train with state aligned vector rewards an agent predicting state changes from action distributions, using a new reinforcement learning technique inspired by quantile regression.", "abstract": "Learning from a scalar reward in continuous action space environments is difficult and often requires millions if not billions of interactions. We introduce state aligned vector rewards, which are easily defined in metric state spaces and allow our deep reinforcement learning agent to tackle the curse of dimensionality. Our agent learns to map from action distributions to state change distributions implicitly defined in a quantile function neural network. We further introduce a new reinforcement learning technique inspired by quantile regression which does not limit agents to explicitly parameterized action distributions. 
Our results in high dimensional state spaces show that training with vector rewards allows our agent to learn multiple times faster than an agent training with scalar rewards.", "keywords": "deep reinforcement learning;quantile regression;vector reward", "primary_area": "", "supplementary_material": "", "author": "Oliver Richter;Roger Wattenhofer", "authorids": "richtero@ethz.ch;wattenhofer@ethz.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1lFYoRcFm", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;4;4", "wc_review": "705;319;1227", "wc_reply_reviewers": "74;0;72", "wc_reply_authors": "831;33;592", "reply_reviewers": "1;0;1", "reply_authors": "2;1;1", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 750.3333333333334, 372.0728721933678 ], "wc_reply_reviewers_avg": [ 48.666666666666664, 34.4222150491349 ], "wc_reply_authors_avg": [ 485.3333333333333, 334.39929558671156 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1469788955052093417&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "r1lM_sA5Fm", "title": "Assumption Questioning: Latent Copying and Reward Exploitation in Question Generation", "track": "main", "status": "Reject", "tldr": "An investigation into latent copy mechanisms for question generation and correlations of external reward models with human evaluation.", "abstract": "Question generation is an important task for improving our ability to process natural language data, with additional challenges over other sequence transformation tasks. Recent approaches use modifications to a Seq2Seq architecture inspired by advances in machine translation, but unlike translation the input and output vocabularies overlap significantly, and there are many different valid questions for each input. Approaches using copy mechanisms and reinforcement learning have shown promising results, but there are ambiguities in the exact implementation that have not yet been investigated. We show that by removing inductive bias from the model and allowing the choice of generation path to become latent, we achieve substantial improvements over implementations biased with both naive and smart heuristics. We perform a human evaluation to confirm these findings. We show that although policy gradient methods may be used to decouple training from the ground truth and optimise directly for quality metrics that have previously been assumed to be good choices, these objectives are poorly aligned with human judgement and the model simply learns to exploit the weaknesses of the reward source. 
Finally, we show that an adversarial objective learned directly from the ground truth data is not able to generate a useful training signal.", "keywords": "question generation;answer questioning;pointer networks;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Tom Hosking;Sebastian Riedel", "authorids": "thomas.hosking.17@ucl.ac.uk;sebastian.riedel@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhosking2019assumption,\ntitle={Assumption Questioning: Latent Copying and Reward Exploitation in Question Generation},\nauthor={Tom Hosking and Sebastian Riedel},\nyear={2019},\nurl={https://openreview.net/forum?id=r1lM_sA5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1lM_sA5Fm", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "wc_review": "317;139;170", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "520;143;286", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 208.66666666666666, 77.64162686486046 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 316.3333333333333, 155.3969826676896 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:CksUQSMDKcgJ:scholar.google.com/&scioq=Assumption+Questioning:+Latent+Copying+and+Reward+Exploitation+in+Question+Generation&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Are adversarial examples inevitable?", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/885", "id": "r1lWUoA9FQ", "author_site": "Ali Shafahi, Ronny Huang, Christoph Studer, Soheil Feizi, Tom Goldstein", "tldr": "This paper identifies classes of problems for which adversarial examples are inescapable, and derives fundamental bounds on the susceptibility of any classifier to adversarial examples. ", "abstract": "A wide range of defenses have been proposed to harden neural networks against adversarial attacks. However, a pattern has emerged in which the majority of adversarial defenses are quickly broken by new attacks. Given the lack of success at generating robust defenses, we are led to ask a fundamental question: Are adversarial attacks inevitable?\nThis paper analyzes adversarial examples from a theoretical perspective, and identifies fundamental bounds on the susceptibility of a classifier to adversarial attacks. We show that, for certain classes of problems, adversarial examples are inescapable. Using experiments, we explore the implications of theoretical guarantees for real-world problems and discuss how factors such as dimensionality and image complexity limit a classifier's robustness against adversarial examples.\n\n", "keywords": "adversarial examples;neural networks;security", "primary_area": "", "supplementary_material": "", "author": "Ali Shafahi;W. 
Ronny Huang;Christoph Studer;Soheil Feizi;Tom Goldstein", "authorids": "ashafahi@gmail.com;w.ronny.huang@gmail.com;studer@cornell.edu;feizi.soheil@gmail.com;tomg@cs.umd.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nshafahi2018are,\ntitle={Are adversarial examples inevitable?},\nauthor={Ali Shafahi and W. Ronny Huang and Christoph Studer and Soheil Feizi and Tom Goldstein},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1lWUoA9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "wc_review": "797;334;494", "wc_reply_reviewers": "188;57;152", "wc_reply_authors": "1137;294;510", "reply_reviewers": "2;1;1", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 541.6666666666666, 192.00057870283158 ], "wc_reply_reviewers_avg": [ 132.33333333333334, 55.25898619731957 ], "wc_reply_authors_avg": [ 647.0, 357.5276213105779 ], "reply_reviewers_avg": [ 1.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 392, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1886933684850079420&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=r1lWUoA9FQ", "pdf": "https://openreview.net/pdf?id=r1lWUoA9FQ", "email": ";;;;", "author_num": 5 }, { "title": "Enabling Factorized Piano Music Modeling and Generation with the MAESTRO Dataset", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/672", "id": "r1lYRjC9F7", "author_site": "Curtis Hawthorne, Andriy Stasyuk, Adam Roberts, Ian Simon, Anna Huang, Sander Dieleman, Erich K Elsen, Jesse Engel, Douglas Eck", "tldr": "We train a suite of models capable of transcribing, composing, and synthesizing audio waveforms with coherent musical structure, enabled by the new MAESTRO dataset.", "abstract": "Generating musical audio directly with neural networks is notoriously difficult because it requires coherently modeling structure at many different timescales. Fortunately, most music is also highly structured and can be represented as discrete note events played on musical instruments. Herein, we show that by using notes as an intermediate representation, we can train a suite of models capable of transcribing, composing, and synthesizing audio waveforms with coherent musical structure on timescales spanning six orders of magnitude (~0.1 ms to ~100 s), a process we call Wave2Midi2Wave. This large advance in the state of the art is enabled by our release of the new MAESTRO (MIDI and Audio Edited for Synchronous TRacks and Organization) dataset, composed of over 172 hours of virtuosic piano performances captured with fine alignment (~3 ms) between note labels and audio waveforms. 
The networks and the dataset together present a promising approach toward creating new expressive and interpretable neural models of music.", "keywords": "music;piano transcription;transformer;wavnet;audio synthesis;dataset;midi", "primary_area": "", "supplementary_material": "", "author": "Curtis Hawthorne;Andriy Stasyuk;Adam Roberts;Ian Simon;Cheng-Zhi Anna Huang;Sander Dieleman;Erich Elsen;Jesse Engel;Douglas Eck", "authorids": "fjord@google.com;astas@google.com;adarob@google.com;iansimon@google.com;annahuang@google.com;sedielem@google.com;eriche@google.com;jesseengel@google.com;deck@google.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nhawthorne2018enabling,\ntitle={Enabling Factorized Piano Music Modeling and Generation with the {MAESTRO} Dataset},\nauthor={Curtis Hawthorne and Andriy Stasyuk and Adam Roberts and Ian Simon and Cheng-Zhi Anna Huang and Sander Dieleman and Erich Elsen and Jesse Engel and Douglas Eck},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1lYRjC9F7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 4 community implementations](https://paperswithcode.com/paper/?openreview=r1lYRjC9F7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "8;8;8", "confidence": "4;2;5", "wc_review": "324;275;300", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "190;645;397", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 8.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 299.6666666666667, 20.00555478416488 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 410.6666666666667, 186.00418155395204 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 629, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13653431951208907586&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=r1lYRjC9F7", "pdf": "https://openreview.net/pdf?id=r1lYRjC9F7", "email": ";;;;;;;;", "author_num": 9 }, { "title": "A Variational Inequality Perspective on Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1129", "id": "r1laEnA5Ym", "author_site": "Gauthier Gidel, Hugo Berard, Ga\u00ebtan Vignoud, Pascal Vincent, Simon Lacoste-Julien", "tldr": "We cast GANs in the variational inequality framework and import techniques from this literature to optimize GANs better; we give algorithmic extensions and empirically test their performance for training GANs.", "abstract": "Generative adversarial networks (GANs) form a generative modeling approach known for producing appealing samples, but they are notably difficult to train. One common way to tackle this issue has been to propose new formulations of the GAN objective. Yet, surprisingly few studies have looked at optimization methods designed for this adversarial training. In this work, we cast GAN optimization problems in the general variational inequality framework. 
Tapping into the mathematical programming literature, we counter some common misconceptions about the difficulties of saddle point optimization and propose to extend methods designed for variational inequalities to the training of GANs. We apply averaging, extrapolation and a computationally cheaper variant that we call extrapolation from the past to the stochastic gradient method (SGD) and Adam.", "keywords": "optimization;variational inequality;games;saddle point;extrapolation;averaging;extragradient;generative modeling;generative adversarial network", "primary_area": "", "supplementary_material": "", "author": "Gauthier Gidel;Hugo Berard;Ga\u00ebtan Vignoud;Pascal Vincent;Simon Lacoste-Julien", "authorids": "gauthier.gidel@umontreal.ca;hugo.berard@gmail.com;gaetan.vignoud@gmail.com;vincentp@iro.umontreal.ca;slacoste@iro.umontreal.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngidel2018a,\ntitle={A Variational Inequality Perspective on Generative Adversarial Networks},\nauthor={Gauthier Gidel and Hugo Berard and Ga\u00ebtan Vignoud and Pascal Vincent and Simon Lacoste-Julien},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1laEnA5Ym},\n}", "github": "[![github](/images/github_icon.svg) GauthierGidel/Variational-Inequality-GAN](https://github.com/GauthierGidel/Variational-Inequality-GAN)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;3;3", "wc_review": "314;381;168", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "598;312;154", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 287.6666666666667, 88.92818575807235 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 354.6666666666667, 183.75587670118804 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 487, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6445881932716952872&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=r1laEnA5Ym", "pdf": "https://openreview.net/pdf?id=r1laEnA5Ym", "email": ";;;;", "author_num": 5 }, { "id": "r1lcM3AcKm", "title": "RNNs with Private and Shared Representations for Semi-Supervised Sequence Learning", "track": "main", "status": "Reject", "tldr": "This paper focuses upon a traditionally overlooked mechanism -- an architecture with explicitly designed private and shared hidden units designed to mitigate the detrimental influence of the auxiliary unsupervised loss over the main supervised task.", "abstract": "Training recurrent neural networks (RNNs) on long sequences using backpropagation through time (BPTT) remains a fundamental challenge. \nIt has been shown that adding a local unsupervised loss term into the optimization objective makes the training of RNNs on long sequences more effective. 
\nWhile the importance of an unsupervised task can in principle be controlled by a coefficient in the objective function, the gradients with respect to the unsupervised loss term still influence all the hidden state dimensions, which might cause important information about the supervised task to be degraded or erased. \nCompared to existing semi-supervised sequence learning methods, this paper focuses upon a traditionally overlooked mechanism -- an architecture with explicitly designed private and shared hidden units designed to mitigate the detrimental influence of the auxiliary unsupervised loss over the main supervised task.\nWe achieve this by dividing RNN hidden space into a private space for the supervised task and a shared space for both the supervised and unsupervised tasks. We present extensive experiments with the proposed framework on several long sequence modeling benchmark datasets. Results indicate that the proposed framework can yield performance gains in RNN models where long term dependencies are notoriously challenging to deal with. ", "keywords": "recurrent neural network;semi-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Ge Ya Luo;Jie Fu;Pengfei Liu;Zhi Hao Luo;Chris Pal", "authorids": "olga.xu@umontreal.ca;jie.fu@polymtl.ca;pfliu14@fudan.edu.cn;zhi-hao.luo@polymtl.ca;christopher.pal@polymtl.ca", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1lcM3AcKm", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;5", "wc_review": "323;193;412", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "270;218;155", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 309.3333333333333, 89.92713099442732 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 214.33333333333334, 47.020090268829655 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:tNv5qjJXMtsJ:scholar.google.com/&scioq=RNNs+with+Private+and+Shared+Representations+for+Semi-Supervised+Sequence+Learning&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "r1ledo0ctX", "title": "Consistency-based anomaly detection with adaptive multiple-hypotheses predictions", "track": "main", "status": "Reject", "tldr": "We propose an anomaly-detection approach that combines modeling the foreground class via multiple local densities with adversarial training.", "abstract": "In one-class-learning tasks, only the normal case can be modeled with data, whereas the variation of all possible anomalies is too large to be described sufficiently by samples. Thus, due to the lack of representative data, the wide-spread discriminative approaches cannot cover such learning tasks, and rather generative models, which attempt to learn the input density of the normal cases, are used. However, generative models suffer from a large input dimensionality (as in images) and are typically inefficient learners. We propose to learn the data distribution more efficiently with a multi-hypotheses autoencoder. 
Moreover, the model is criticized by a discriminator, which prevents artificial data modes not supported by data, and which enforces diversity across hypotheses. This consistency-based anomaly detection (ConAD) framework allows the reliable identification of outof- distribution samples. For anomaly detection on CIFAR-10, it yields up to 3.9% points improvement over previously reported results. On a real anomaly detection task, the approach reduces the error of the baseline models from 6.8% to 1.5%.", "keywords": "Anomaly detection;outlier detection;generative models;VAE;GAN", "primary_area": "", "supplementary_material": "", "author": "Duc Tam Nguyen;Zhongyu Lou;Michael Klar;Thomas Brox", "authorids": "nguyen@cs.uni-freiburg.de;zhongyu.lou@de.bosch.com;michael.klar2@de.bosch.com;brox@cs.uni-freiburg.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nnguyen2019consistencybased,\ntitle={Consistency-based anomaly detection with adaptive multiple-hypotheses predictions},\nauthor={Duc Tam Nguyen and Zhongyu Lou and Michael Klar and Thomas Brox},\nyear={2019},\nurl={https://openreview.net/forum?id=r1ledo0ctX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=r1ledo0ctX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1ledo0ctX", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "wc_review": "302;298;522", "wc_reply_reviewers": "102;0;0", "wc_reply_authors": "876;402;1002", "reply_reviewers": "1;0;0", "reply_authors": "2;1;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 374.0, 104.66454350288194 ], "wc_reply_reviewers_avg": [ 34.0, 48.08326112068523 ], "wc_reply_authors_avg": [ 760.0, 258.3176339315611 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:3EQS4U4GFm4J:scholar.google.com/&scioq=Consistency-based+anomaly+detection+with+adaptive+multiple-hypotheses+predictions&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "r1lgm3C5t7", "title": "Universal discriminative quantum neural networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Quantum mechanics fundamentally forbids deterministic discrimination of quantum states and processes. However, the ability to optimally distinguish various classes of quantum data is an important primitive in quantum information science. In this work, we trained near-term quantum circuits to classify data represented by quantum states using the Adam stochastic optimization algorithm. This is achieved by iterative interactions of a classical device with a quantum processor to discover the parameters of an unknown non-unitary quantum circuit. This circuit learns to simulate the unknown structure of a generalized quantum measurement, or positive-operator valued measure (POVM), that is required to optimally distinguish possible distributions of quantum inputs. 
Notably we used universal circuit topologies, with a theoretically motivated circuit design which guaranteed that our circuits can perform arbitrary input-output mappings. Our numerical simulations showed that quantum circuits could be trained to discriminate among various pure and mixed quantum states, exhibiting a trade-off between minimizing erroneous and inconclusive outcomes with comparable performance to theoretically optimal POVMs. We trained the circuit on different classes of quantum data and evaluated the generalization error on unseen quantum data. This generalization power hence distinguishes our work from standard circuit optimization and provides an example of quantum machine learning for a task that has inherently no classical analogue. ", "keywords": "quantum machine learning;quantum data classification", "primary_area": "", "supplementary_material": "", "author": "Hongxiang Chen;Leonard Wossnig;Hartmut Neven;Simone Severini;Masoud Mohseni", "authorids": "we.taper@gmail.com;leonard.wossnig.17@ucl.ac.uk;neven@google.com;s.severini@ucl.ac.uk;mohseni@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nchen2019universal,\ntitle={Universal discriminative quantum neural networks},\nauthor={Hongxiang Chen and Leonard Wossnig and Hartmut Neven and Simone Severini and Masoud Mohseni},\nyear={2019},\nurl={https://openreview.net/forum?id=r1lgm3C5t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1lgm3C5t7", "pdf_size": 0, "rating": "2;5;5", "confidence": "2;2;3", "wc_review": "159;161;147", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 155.66666666666666, 6.182412330330469 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 92, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15521869808297856971&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "title": "Learning-Based Frequency Estimation Algorithms", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/874", "id": "r1lohoCqY7", "author_site": "Chen-Yu Hsu, Piotr Indyk, Dina Katabi, Ali Vakilian", "tldr": "Data stream algorithms can be improved using deep learning, while retaining performance guarantees.", "abstract": "Estimating the frequencies of elements in a data stream is a fundamental task in data analysis and machine learning. The problem is typically addressed using streaming algorithms which can process very large data using limited storage. Today's streaming algorithms, however, cannot exploit patterns in their input to improve performance. We propose a new class of algorithms that automatically learn relevant patterns in the input data and use them to improve its frequency estimates. The proposed algorithms combine the benefits of machine learning with the formal guarantees available through algorithm theory. We prove that our learning-based algorithms have lower estimation errors than their non-learning counterparts. 
We also evaluate our algorithms on two real-world datasets and demonstrate empirically their performance gains.", "keywords": "streaming algorithms;heavy-hitters;Count-Min;Count-Sketch", "primary_area": "", "supplementary_material": "", "author": "Chen-Yu Hsu;Piotr Indyk;Dina Katabi;Ali Vakilian", "authorids": "cyhsu@mit.edu;indyk@mit.edu;dina@csail.mit.edu;vakilian@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nhsu2018learningbased,\ntitle={Learning-Based Frequency Estimation Algorithms},\nauthor={Chen-Yu Hsu and Piotr Indyk and Dina Katabi and Ali Vakilian},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1lohoCqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;1;4", "wc_review": "356;664;166", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "664;859;98", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "wc_review_avg": [ 395.3333333333333, 205.20125622314197 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 540.3333333333334, 322.7489564489541 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.49999999999999994, "gs_citation": 191, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5052939190574413817&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=r1lohoCqY7", "pdf": "https://openreview.net/pdf?id=r1lohoCqY7", "email": ";;;", "author_num": 4 }, { "id": "r1lpx3A9K7", "title": "Featurized Bidirectional GAN: Adversarial Defense via Adversarially Learned Semantic Inference", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks have been demonstrated to be vulnerable to adversarial attacks, where small perturbations intentionally added to the original inputs can fool the classifier. In this paper, we propose a defense method, Featurized Bidirectional Generative Adversarial Networks (FBGAN), to extract the semantic features of the input and filter the non-semantic perturbation. FBGAN is pre-trained on the clean dataset in an unsupervised manner, adversarially learning a bidirectional mapping between a high-dimensional data space and a low-dimensional semantic space; also mutual information is applied to disentangle the semantically meaningful features. After the bidirectional mapping, the adversarial data can be reconstructed to denoised data, which could be fed into any pre-trained classifier. 
We empirically show the quality of reconstruction images and the effectiveness of defense.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ruying Bao;Sihang Liang;Qingcan Wang", "authorids": "rbao@princeton.edu;sihangl@princeton.edu;qingcanw@princeton.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbao2019featurized,\ntitle={Featurized Bidirectional {GAN}: Adversarial Defense via Adversarially Learned Semantic Inference},\nauthor={Ruying Bao and Sihang Liang and Qingcan Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=r1lpx3A9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1lpx3A9K7", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;5", "wc_review": "280;348;367", "wc_reply_reviewers": "0;0;138", "wc_reply_authors": "211;563;529", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 331.6666666666667, 37.3482113211448 ], "wc_reply_reviewers_avg": [ 46.0, 65.05382386916237 ], "wc_reply_authors_avg": [ 434.3333333333333, 158.52935234698825 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11152122357872287037&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "From Language to Goals: Inverse Reinforcement Learning for Vision-Based Instruction Following", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1040", "id": "r1lq1hRqYQ", "author_site": "Justin Fu, Anoop Korattikara Balan, Sergey Levine, Sergio Guadarrama", "tldr": "We ground language commands in a high-dimensional visual environment by learning language-conditioned rewards using inverse reinforcement learning.", "abstract": "Reinforcement learning is a promising framework for solving control problems, but its use in practical situations is hampered by the fact that reward functions are often difficult to engineer. Specifying goals and tasks for autonomous machines, such as robots, is a significant challenge: conventionally, reward functions and goal states have been used to communicate objectives. But people can communicate objectives to each other simply by describing or demonstrating them. How can we build learning algorithms that will allow us to tell machines what we want them to do? In this work, we investigate the problem of grounding language commands as reward functions using inverse reinforcement learning, and argue that language-conditioned rewards are more transferable than language-conditioned policies to new environments. We propose language-conditioned reward learning (LC-RL), which grounds language commands as a reward function represented by a deep neural network. 
We demonstrate that our model learns rewards that transfer to novel tasks and environments on realistic, high-dimensional visual environments with natural language commands, whereas directly learning a language-conditioned policy leads to poor performance.", "keywords": "inverse reinforcement learning;language grounding;instruction following;language-based learning", "primary_area": "", "supplementary_material": "", "author": "Justin Fu;Anoop Korattikara;Sergey Levine;Sergio Guadarrama", "authorids": "justinjfu@eecs.berkeley.edu;kbanoop@google.com;svlevine@eecs.berkeley.edu;sguada@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nfu2018from,\ntitle={From Language to Goals: Inverse Reinforcement Learning for Vision-Based Instruction Following},\nauthor={Justin Fu and Anoop Korattikara and Sergey Levine and Sergio Guadarrama},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1lq1hRqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "5;5;9", "confidence": "4;4;5", "wc_review": "320;268;579", "wc_reply_reviewers": "0;0;28", "wc_reply_authors": "528;528;281", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 389.0, 136.0171557806833 ], "wc_reply_reviewers_avg": [ 9.333333333333334, 13.199326582148887 ], "wc_reply_authors_avg": [ 445.6666666666667, 116.43691663538482 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 153, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9128320307925997063&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=r1lq1hRqYQ", "pdf": "https://openreview.net/pdf?id=r1lq1hRqYQ", "email": ";;;", "author_num": 4 }, { "title": "Backpropamine: training self-modifying neural networks with differentiable neuromodulated plasticity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/926", "id": "r1lrAiA5Ym", "author_site": "Thomas Miconi, Aditya Rawal, Jeff Clune, Kenneth O. Stanley", "tldr": "Neural networks can be trained to modify their own connectivity, improving their online learning performance on challenging tasks.", "abstract": "The impressive lifelong learning in animal brains is primarily enabled by plastic changes in synaptic connectivity. Importantly, these changes are not passive, but are actively controlled by neuromodulation, which is itself under the control of the brain. The resulting self-modifying abilities of the brain play an important role in learning and adaptation, and are a major basis for biological reinforcement learning. Here we show for the first time that artificial neural networks with such neuromodulated plasticity can be trained with gradient descent. Extending previous work on differentiable Hebbian plasticity, we propose a differentiable formulation for the neuromodulation of plasticity. We show that neuromodulated plasticity improves the performance of neural networks on both reinforcement learning and supervised learning tasks. 
In one task, neuromodulated plastic LSTMs with millions of parameters outperform standard LSTMs on a benchmark language modeling task (controlling for the number of parameters). We conclude that differentiable neuromodulation of plasticity offers a powerful new framework for training neural networks.", "keywords": "meta-learning;reinforcement learning;plasticity;neuromodulation;Hebbian learning;recurrent neural networks", "primary_area": "", "supplementary_material": "", "author": "Thomas Miconi;Aditya Rawal;Jeff Clune;Kenneth O. Stanley", "authorids": "tmiconi@uber.com;aditya.rawal@uber.com;jeffclune@uber.com;kstanley@uber.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nmiconi2018backpropamine,\ntitle={Backpropamine: training self-modifying neural networks with differentiable neuromodulated plasticity},\nauthor={Thomas Miconi and Aditya Rawal and Jeff Clune and Kenneth O. Stanley},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1lrAiA5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;5;9", "confidence": "4;4;4", "wc_review": "507;384;429", "wc_reply_reviewers": "355;272;0", "wc_reply_authors": "1302;421;417", "reply_reviewers": "1;2;0", "reply_authors": "2;2;1", "rating_avg": [ 6.0, 2.160246899469287 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 440.0, 50.813384063650005 ], "wc_reply_reviewers_avg": [ 209.0, 151.62013938348252 ], "wc_reply_authors_avg": [ 713.3333333333334, 416.2533950478829 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3700434839782890310&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=r1lrAiA5Ym", "pdf": "https://openreview.net/pdf?id=r1lrAiA5Ym", "email": ";;;", "author_num": 4 }, { "id": "r1luCsCqFm", "title": "Learn From Neighbour: A Curriculum That Train Low Weighted Samples By Imitating", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep neural networks, which gain great success in a wide spectrum of applications, are often time, compute and storage hungry. Curriculum learning proposed to boost training of network by a syllabus from easy to hard. However, the relationship between data complexity and network training is unclear: why hard example harm the performance at beginning but helps at end. In this paper, we aim to investigate on this problem. Similar to internal covariate shift in network forward pass, the distribution changes in weight of top layers also affects training of preceding layers during the backward pass. We call this phenomenon inverse \"internal covariate shift\". Training hard examples aggravates the distribution shifting and damages the training. To address this problem, we introduce a curriculum loss that consists of two parts: a) an adaptive weight that mitigates large early punishment; b) an additional representation loss for low weighted samples. The intuition of the loss is very simple. We train top layers on \"good\" samples to reduce large shifting, and encourage \"bad\" samples to learn from \"good\" sample. 
In detail, the adaptive weight assigns small values to hard examples, reducing the influence of noisy gradients. On the other hand, the less-weighted hard sample receives the proposed representation loss. Low-weighted data gets nearly no training signal and can stuck in embedding space for a long time. The proposed representation loss aims to encourage their training. This is done by letting them learn a better representation from its superior neighbours but not participate in learning of top layers. In this way, the fluctuation of top layers is reduced and hard samples also received signals for training. We found in this paper that curriculum learning needs random sampling between tasks for better training. Our curriculum loss is easy to combine with existing stochastic algorithms like SGD. Experimental result shows an consistent improvement over several benchmark datasets.", "keywords": "Curriculum Learning;Internal Covariate Shift", "primary_area": "", "supplementary_material": "", "author": "Benyuan Sun;Yizhou Wang", "authorids": "sunbenyuan@pku.edu.cn;yizhou.wang@pku.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsun2019learn,\ntitle={Learn From Neighbour: A Curriculum That Train Low Weighted Samples By Imitating},\nauthor={Benyuan Sun and Yizhou Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=r1luCsCqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1luCsCqFm", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;3;4", "wc_review": "836;577;322", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 578.3333333333334, 209.84173930740175 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dTKpjEXlIvQJ:scholar.google.com/&scioq=Learn+From+Neighbour:+A+Curriculum+That+Train+Low+Weighted+Samples+By+Imitating&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Recurrent Experience Replay in Distributed Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/648", "id": "r1lyTjAqYX", "author_site": "Steven Kapturowski, Georg Ostrovski, John Quan, Remi Munos, Will Dabney", "tldr": "Investigation on combining recurrent neural networks and experience replay leading to state-of-the-art agent on both Atari-57 and DMLab-30 using single set of hyper-parameters.", "abstract": "Building on the recent successes of distributed training of RL agents, in this paper we investigate the training of RNN-based RL agents from distributed prioritized experience replay. We study the effects of parameter lag resulting in representational drift and recurrent state staleness and empirically derive an improved training strategy. Using a single network architecture and fixed set of hyper-parameters, the resulting agent, Recurrent Replay Distributed DQN, quadruples the previous state of the art on Atari-57, and matches the state of the art on DMLab-30. 
It is the first agent to exceed human-level performance in 52 of the 57 Atari games.", "keywords": "RNN;LSTM;experience replay;distributed training;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Steven Kapturowski;Georg Ostrovski;John Quan;Remi Munos;Will Dabney", "authorids": "skapturowski@google.com;ostrovski@google.com;johnquan@google.com;munos@google.com;wdabney@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nkapturowski2018recurrent,\ntitle={Recurrent Experience Replay in Distributed Reinforcement Learning},\nauthor={Steven Kapturowski and Georg Ostrovski and Will Dabney and John Quan and Remi Munos},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1lyTjAqYX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=r1lyTjAqYX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "2;3;4", "wc_review": "327;297;976", "wc_reply_reviewers": "23;0;327", "wc_reply_authors": "435;290;1039", "reply_reviewers": "1;0;2", "reply_authors": "1;1;2", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 533.3333333333334, 313.2521171339292 ], "wc_reply_reviewers_avg": [ 116.66666666666667, 149.02423367433303 ], "wc_reply_authors_avg": [ 588.0, 324.35268869961084 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 629, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9232121321169897083&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=r1lyTjAqYX", "pdf": "https://openreview.net/pdf?id=r1lyTjAqYX", "email": ";;;;", "author_num": 5 }, { "title": "A Generative Model For Electron Paths", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1083", "id": "r1x4BnCqKX", "author_site": "John Bradshaw, Matt Kusner, Brooks Paige, Marwin Segler, Jos\u00e9 Miguel Hern\u00e1ndez Lobato", "tldr": "A generative model for reaction prediction that learns the mechanistic electron steps of a reaction directly from raw reaction data.", "abstract": "Chemical reactions can be described as the stepwise redistribution of electrons in molecules. As such, reactions are often depicted using \"arrow-pushing\" diagrams which show this movement as a sequence of arrows. We propose an electron path prediction model (ELECTRO) to learn these sequences directly from raw reaction data. Instead of predicting product molecules directly from reactant molecules in one shot, learning a model of electron movement has the benefits of (a) being easy for chemists to interpret, (b) incorporating constraints of chemistry, such as balanced atom counts before and after the reaction, and (c) naturally encoding the sparsity of chemical reactions, which usually involve changes in only a small number of atoms in the reactants. We design a method to extract approximate reaction paths from any dataset of atom-mapped reaction SMILES strings. 
Our model achieves excellent performance on an important subset of the USPTO reaction dataset, comparing favorably to the strongest baselines. Furthermore, we show that our model recovers a basic knowledge of chemistry without being explicitly trained to do so.", "keywords": "Molecules;Reaction Prediction;Graph Neural Networks;Deep Generative Models", "primary_area": "", "supplementary_material": "", "author": "John Bradshaw;Matt J. Kusner;Brooks Paige;Marwin H. S. Segler;Jos\u00e9 Miguel Hern\u00e1ndez-Lobato", "authorids": "jab255@cam.ac.uk;mkusner@turing.ac.uk;bpaige@turing.ac.uk;marwin.segler@benevolent.ai;jmh233@cam.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nbradshaw2018a,\ntitle={A Generative Model For Electron Paths},\nauthor={John Bradshaw and Matt J. Kusner and Brooks Paige and Marwin H. S. Segler and Jos\u00e9 Miguel Hern\u00e1ndez-Lobato},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1x4BnCqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;8;8", "confidence": "4;4;4", "wc_review": "537;630;604", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1243;962;602", "reply_reviewers": "0;0;0", "reply_authors": "2;3;1", "rating_avg": [ 6.666666666666667, 1.8856180831641267 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 590.3333333333334, 39.177658032211276 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 935.6666666666666, 262.3487924288749 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 90, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16966077960923717171&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=r1x4BnCqKX", "pdf": "https://openreview.net/pdf?id=r1x4BnCqKX", "email": ";;;;", "author_num": 5 }, { "id": "r1xFE3Rqt7", "title": "Adaptive Mixture of Low-Rank Factorizations for Compact Neural Modeling", "track": "main", "status": "Reject", "tldr": "We propose a simple modification to low-rank factorization that improves performances (in both image and language tasks) while still being compact.", "abstract": "Modern deep neural networks have a large amount of weights, which make them difficult to deploy on computation constrained devices such as mobile phones. One common approach to reduce the model size and computational cost is to use low-rank factorization to approximate a weight matrix. However, performing standard low-rank factorization with a small rank can hurt the model expressiveness and significantly decrease the performance. In this work, we propose to use a mixture of multiple low-rank factorizations to model a large weight matrix, and the mixture coefficients are computed dynamically depending on its input. We demonstrate the effectiveness of the proposed approach on both language modeling and image classification tasks. 
Experiments show that our method not only improves the computation efficiency but also maintains (sometimes outperforms) its accuracy compared with the full-rank counterparts.", "keywords": "Low-Rank Factorization;Compact Neural Nets;Efficient Modeling;Mixture models", "primary_area": "", "supplementary_material": "", "author": "Ting Chen;Ji Lin;Tian Lin;Song Han;Chong Wang;Denny Zhou", "authorids": "iamtingchen@gmail.com;lin-j14@mails.tsinghua.edu.cn;tianlin@google.com;chongw@google.com;dennyzhou@google.com;hansong8811@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nchen2019adaptive,\ntitle={Adaptive Mixture of Low-Rank Factorizations for Compact Neural Modeling},\nauthor={Ting Chen and Ji Lin and Tian Lin and Song Han and Chong Wang and Denny Zhou},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xFE3Rqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1xFE3Rqt7", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;4;5", "wc_review": "613;217;170", "wc_reply_reviewers": "484;0;0", "wc_reply_authors": "1618;581;58", "reply_reviewers": "2;0;0", "reply_authors": "4;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 333.3333333333333, 198.68288524402118 ], "wc_reply_reviewers_avg": [ 161.33333333333334, 228.15978806285932 ], "wc_reply_authors_avg": [ 752.3333333333334, 648.2881732343693 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.18898223650461357, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=544378124247842398&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "r1xN5oA5tm", "title": "Phrase-Based Attentions", "track": "main", "status": "Reject", "tldr": "Phrase-based attention mechanisms to assign attention on phrases, achieving token-to-phrase, phrase-to-token, phrase-to-phrase attention alignments, in addition to existing token-to-token attentions.", "abstract": "Most state-of-the-art neural machine translation systems, despite being different\nin architectural skeletons (e.g., recurrence, convolutional), share an indispensable\nfeature: the Attention. However, most existing attention methods are token-based\nand ignore the importance of phrasal alignments, the key ingredient for the success\nof phrase-based statistical machine translation. In this paper, we propose\nnovel phrase-based attention methods to model n-grams of tokens as attention\nentities. 
We incorporate our phrase-based attentions into the recently proposed\nTransformer network, and demonstrate that our approach yields improvements of\n1.3 BLEU for English-to-German and 0.5 BLEU for German-to-English translation\ntasks, and 1.75 and 1.35 BLEU points in English-to-Russian and Russian-to-English translation tasks \non WMT newstest2014 using WMT\u201916 training data.\n", "keywords": "neural machine translation;natural language processing;attention;transformer;seq2seq;phrase-based;phrase;n-gram", "primary_area": "", "supplementary_material": "", "author": "Phi Xuan Nguyen;Shafiq Joty", "authorids": "xuanphi001@e.ntu.edu.sg;srjoty@ntu.edu.sg", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnguyen2019phrasebased,\ntitle={Phrase-Based Attentions},\nauthor={Phi Xuan Nguyen and Shafiq Joty},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xN5oA5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1xN5oA5tm", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;5;4", "wc_review": "916;418;306", "wc_reply_reviewers": "98;0;0", "wc_reply_authors": "2406;1718;1223", "reply_reviewers": "1;0;0", "reply_authors": "5;4;3", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 546.6666666666666, 265.13057579657277 ], "wc_reply_reviewers_avg": [ 32.666666666666664, 46.19764303752111 ], "wc_reply_authors_avg": [ 1782.3333333333333, 485.09540871415754 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 4.0, 0.816496580927726 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17060283836413569578&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Modeling Uncertainty with Hedged Instance Embeddings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/776", "id": "r1xQQhAqKX", "author_site": "Seong Joon Oh, Andrew Gallagher, Kevin Murphy, Florian Schroff, Jiyan Pan, Joseph Roth", "tldr": "The paper proposes using probability distributions instead of points for instance embeddings tasks such as recognition and verification.", "abstract": "Instance embeddings are an efficient and versatile image representation that facilitates applications like recognition, verification, retrieval, and clustering. Many metric learning methods represent the input as a single point in the embedding space. Often the distance between points is used as a proxy for match confidence. However, this can fail to represent uncertainty which can arise when the input is ambiguous, e.g., due to occlusion or blurriness. This work addresses this issue and explicitly models the uncertainty by \u201chedging\u201d the location of each input in the embedding space. We introduce the hedged instance embedding (HIB) in which embeddings are modeled as random variables and the model is trained under the variational information bottleneck principle (Alemi et al., 2016; Achille & Soatto, 2018). Empirical results on our new N-digit MNIST dataset show that our method leads to the desired behavior of \u201chedging its bets\u201d across the embedding space upon encountering ambiguous inputs. 
This results in improved performance for image matching and classification tasks, more structure in the learned embedding space, and an ability to compute a per-exemplar uncertainty measure which is correlated with downstream performance.", "keywords": "uncertainty;instance embedding;metric learning;probabilistic embedding", "primary_area": "", "supplementary_material": "", "author": "Seong Joon Oh;Kevin P. Murphy;Jiyan Pan;Joseph Roth;Florian Schroff;Andrew C. Gallagher", "authorids": "coallaoh@linecorp.com;agallagher@google.com;kpmurphy@google.com;fschroff@google.com;jiyanpan@google.com;josephroth@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\noh2018modeling,\ntitle={Modeling Uncertainty with Hedged Instance Embeddings},\nauthor={Seong Joon Oh and Andrew C. Gallagher and Kevin P. Murphy and Florian Schroff and Jiyan Pan and Joseph Roth},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xQQhAqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "5;3;3", "wc_review": "449;224;297", "wc_reply_reviewers": "0;226;0", "wc_reply_authors": "836;836;836", "reply_reviewers": "0;1;0", "reply_authors": "2;2;2", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 323.3333333333333, 93.72418163004798 ], "wc_reply_reviewers_avg": [ 75.33333333333333, 106.53742169877317 ], "wc_reply_authors_avg": [ 836.0, 0.0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 133, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15448440055420606601&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=r1xQQhAqKX", "pdf": "https://openreview.net/pdf?id=r1xQQhAqKX", "email": ";;;;;", "author_num": 6 }, { "id": "r1xRW3A9YX", "title": "Riemannian TransE: Multi-relational Graph Embedding in Non-Euclidean Space", "track": "main", "status": "Reject", "tldr": "Multi-relational graph embedding with Riemannian manifolds and TransE-like loss function. ", "abstract": "Multi-relational graph embedding which aims at achieving effective representations with reduced low-dimensional parameters, has been widely used in knowledge base completion. Although knowledge base data usually contains tree-like or cyclic structure, none of existing approaches can embed these data into a compatible space that in line with the structure. To overcome this problem, a novel framework, called Riemannian TransE, is proposed in this paper to embed the entities in a Riemannian manifold. Riemannian TransE models each relation as a move to a point and defines specific novel distance dissimilarity for each relation, so that all the relations are naturally embedded in correspondence to the structure of data. 
Experiments on several knowledge base completion tasks have shown that, based on an appropriate choice of manifold, Riemannian TransE achieves good performance even with a significantly reduced parameters.", "keywords": "Riemannian TransE;graph embedding;multi-relational graph;Riemannian manifold;TransE;hyperbolic space;sphere;knowledge base", "primary_area": "", "supplementary_material": "", "author": "Atsushi Suzuki;Yosuke Enokida;Kenji Yamanishi", "authorids": "atsushi-suzuki@g.ecc.u-tokyo.ac.jp;xenolay@g.ecc.u-tokyo.ac.jp;yamanishi@mist.i.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsuzuki2019riemannian,\ntitle={Riemannian TransE: Multi-relational Graph Embedding in Non-Euclidean Space},\nauthor={Atsushi Suzuki and Yosuke Enokida and Kenji Yamanishi},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xRW3A9YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1xRW3A9YX", "pdf_size": 0, "rating": "5;5;5", "confidence": "2;3;5", "wc_review": "201;300;418", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "508;502;642", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 306.3333333333333, 88.70300007453086 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 550.6666666666666, 64.62885492478486 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15259745465611274857&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Beyond Greedy Ranking: Slate Optimization via List-CVAE", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/827", "id": "r1xX42R5Fm", "author_site": "Ray Jiang, Sven Gowal, Yuqiu Qian, Timothy A Mann, Danilo Jimenez Rezende", "tldr": "We used a CVAE type model structure to learn to directly generate slates/whole pages for recommendation systems.", "abstract": "The conventional approach to solving the recommendation problem greedily ranks\nindividual document candidates by prediction scores. However, this method fails to\noptimize the slate as a whole, and hence, often struggles to capture biases caused\nby the page layout and document interdepedencies. The slate recommendation\nproblem aims to directly find the optimally ordered subset of documents (i.e.\nslates) that best serve users\u2019 interests. Solving this problem is hard due to the\ncombinatorial explosion of document candidates and their display positions on the\npage. Therefore we propose a paradigm shift from the traditional viewpoint of solving a ranking problem to a direct slate generation framework. In this paper, we introduce List Conditional Variational Auto-Encoders (ListCVAE),\nwhich learn the joint distribution of documents on the slate conditioned\non user responses, and directly generate full slates. 
Experiments on simulated\nand real-world data show that List-CVAE outperforms greedy ranking methods\nconsistently on various scales of documents corpora.", "keywords": "CVAE;VAE;recommendation system;slate optimization;whole page optimization", "primary_area": "", "supplementary_material": "", "author": "Ray Jiang;Sven Gowal;Yuqiu Qian;Timothy Mann;Danilo J. Rezende", "authorids": "rayjiang@google.com;sgowal@google.com;yqqian@cs.hku.hk;timothymann@google.com;danilor@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\njiang2018beyond,\ntitle={Beyond Greedy Ranking: Slate Optimization via List-{CVAE}},\nauthor={Ray Jiang and Sven Gowal and Yuqiu Qian and Timothy Mann and Danilo J. Rezende},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xX42R5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;4", "wc_review": "623;537;172", "wc_reply_reviewers": "0;410;0", "wc_reply_authors": "693;1121;112", "reply_reviewers": "0;2;0", "reply_authors": "2;2;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 444.0, 195.51129549636426 ], "wc_reply_reviewers_avg": [ 136.66666666666666, 193.275853524323 ], "wc_reply_authors_avg": [ 642.0, 413.49808544498325 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8102727489122848940&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=r1xX42R5Fm", "pdf": "https://openreview.net/pdf?id=r1xX42R5Fm", "email": ";;;;", "author_num": 5 }, { "id": "r1xYr3C5t7", "title": "Neural Message Passing for Multi-Label Classification", "track": "main", "status": "Reject", "tldr": "We propose Message Passing Encoder-Decode networks for a fast and accurate way of modelling label dependencies for multi-label classification.", "abstract": "Multi-label classification (MLC) is the task of assigning a set of target labels for a given sample. Modeling the combinatorial label interactions in MLC has been a long-haul challenge. Recurrent neural network (RNN) based encoder-decoder models have shown state-of-the-art performance for solving MLC. However, the sequential nature of modeling label dependencies through an RNN limits its ability in parallel computation, predicting dense labels, and providing interpretable results. In this paper, we propose Message Passing Encoder-Decoder (MPED) Networks, aiming to provide fast, accurate, and interpretable MLC. MPED networks model the joint prediction of labels by replacing all RNNs in the encoder-decoder architecture with message passing mechanisms and dispense with autoregressive inference entirely. The proposed models are simple, fast, accurate, interpretable, and structure-agnostic (can be used on known or unknown structured data). 
Experiments on seven real-world MLC datasets show the proposed models outperform autoregressive RNN models across five different metrics with a significant speedup during training and testing time.", "keywords": "Multi-label Classification;Graph Neural Networks;Attention;Graph Attention", "primary_area": "", "supplementary_material": "", "author": "Jack Lanchantin;Arshdeep Sekhon;Yanjun Qi", "authorids": "jjl5sw@virginia.edu;as5cu@virginia.edu;yq2h@virginia.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlanchantin2019neuralmessage,\ntitle={Neural Message Passing for Multi-Label Classification},\nauthor={Jack Lanchantin and Arshdeep Sekhon and Yanjun Qi},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xYr3C5t7},\n}", "github": "[![github](/images/github_icon.svg) QData/LaMP](https://github.com/QData/LaMP)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1xYr3C5t7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;2;4", "wc_review": "415;451;868", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "460;944;533", "reply_reviewers": "0;0;0", "reply_authors": "4;6;6", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 578.0, 205.58696456730908 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 645.6666666666666, 213.04824701357097 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 5.333333333333333, 0.9428090415820634 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4902041650695374421&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "id": "r1xce3ActX", "title": "N/A", "track": "main", "status": "Withdraw", "tldr": "N/A", "abstract": "N/A", "keywords": "N/A", "primary_area": "", "supplementary_material": "", "author": "N/A;N/A;N/A", "authorids": "samitha.herath@data61.csiro.au;u5505348@anu.edu.au;mehrtash.harandi@monash.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1xce3ActX", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;5", "wc_review": "522;548;1283", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "27;27;27", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 784.3333333333334, 352.7703061326008 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 27.0, 0.0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Stochastic Prediction of Multi-Agent Interactions from Partial Observations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/893", "id": "r1xdH3CcKX", "author_site": "Chen Sun, Per Karlsson, Jiajun Wu, Joshua B Tenenbaum, Kevin Murphy", "tldr": "We present a method which learns to integrate temporal information and ambiguous visual information in the context of interacting agents.", "abstract": "We present a method which learns to integrate temporal information, from a learned dynamics model, with ambiguous visual information, from a learned vision model, in the context of interacting agents. Our method is based on a graph-structured variational recurrent neural network, which is trained end-to-end to infer the current state of the (partially observed) world, as well as to forecast future states. We show that our method outperforms various baselines on two sports datasets, one based on real basketball trajectories, and one generated by a soccer game engine.", "keywords": "Dynamics modeling;partial observations;multi-agent interactions;predictive models", "primary_area": "", "supplementary_material": "", "author": "Chen Sun;Per Karlsson;Jiajun Wu;Joshua B Tenenbaum;Kevin Murphy", "authorids": "chensun@google.com;perk@google.com;jiajunwu@mit.edu;jbt@mit.edu;kpmurphy@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nsun2018predicting,\ntitle={Predicting the Present and Future States of Multi-agent Systems from Partially-observed Visual Data},\nauthor={Chen Sun and Per Karlsson and Jiajun Wu and Joshua B Tenenbaum and Kevin Murphy},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xdH3CcKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "wc_review": "378;374;337", "wc_reply_reviewers": "79;0;0", "wc_reply_authors": "251;238;156", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 363.0, 18.457157599876172 ], "wc_reply_reviewers_avg": [ 26.333333333333332, 37.2409571424915 ], "wc_reply_authors_avg": [ 215.0, 42.05551886098502 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11679184736370359376&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=r1xdH3CcKX", "pdf": "https://openreview.net/pdf?id=r1xdH3CcKX", "email": ";;;;", "author_num": 5 }, { "id": "r1xkIjA9tX", "title": "q-Neurons: Neuron Activations based on Stochastic Jackson's Derivative Operators", "track": "main", "status": "Reject", "tldr": "q-calculus helps build simple and scalable neural activation functions", "abstract": "We propose a new generic type of stochastic neurons, called $q$-neurons, that considers activation functions based on Jackson's $q$-derivatives, with stochastic parameters $q$. Our generalization of neural network architectures with $q$-neurons is shown to be both scalable and very easy to implement.
We demonstrate experimentally consistently improved performances over state-of-the-art standard activation functions, both on training and testing loss functions.\n", "keywords": "q-calculus;neural activation function", "primary_area": "", "supplementary_material": "", "author": "Frank Nielsen;Ke Sun", "authorids": "frank.nielsen@acm.org;sunk.edu@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nnielsen2019qneurons,\ntitle={q-Neurons: Neuron Activations based on Stochastic Jackson's Derivative Operators},\nauthor={Frank Nielsen and Ke Sun},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xkIjA9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1xkIjA9tX", "pdf_size": 0, "rating": "2;5;6", "confidence": "5;3;3", "wc_review": "390;191;96", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.699673171197595 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 225.66666666666666, 122.50260768199544 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9707253433941512, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13781560033307484308&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8 }, { "title": "Learning to Remember More with Less Memorization", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/666", "id": "r1xlvi0qYm", "author_site": "Hung Le, Truyen Tran, Svetha Venkatesh", "tldr": "", "abstract": "Memory-augmented neural networks consisting of a neural controller and an external memory have shown potentials in long-term sequential learning. Current RAM-like memory models maintain memory accessing every timesteps, thus they do not effectively leverage the short-term memory held in the controller. We hypothesize that this scheme of writing is suboptimal in memory utilization and introduces redundant computation. To validate our hypothesis, we derive a theoretical bound on the amount of information stored in a RAM-like system and formulate an optimization problem that maximizes the bound. The proposed solution dubbed Uniform Writing is proved to be optimal under the assumption of equal timestep contributions. To relax this assumption, we introduce modifications to the original solution, resulting in a solution termed Cached Uniform Writing. This method aims to balance between maximizing memorization and forgetting via overwriting mechanisms. Through an extensive set of experiments, we empirically demonstrate the advantages of our solutions over other recurrent architectures, claiming the state-of-the-arts in various sequential modeling tasks. 
", "keywords": "memory-augmented neural networks;writing optimization", "primary_area": "", "supplementary_material": "", "author": "Hung Le;Truyen Tran;Svetha Venkatesh", "authorids": "lethai@deakin.edu.au;truyen.tran@deakin.edu.au;svetha.venkatesh@deakin.edu.au", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nle2018learning,\ntitle={Learning to Remember More with Less Memorization},\nauthor={Hung Le and Truyen Tran and Svetha Venkatesh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xlvi0qYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;3;4", "wc_review": "951;367;423", "wc_reply_reviewers": "0;0;168", "wc_reply_authors": "818;200;431", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 580.3333333333334, 263.0960956676392 ], "wc_reply_reviewers_avg": [ 56.0, 79.19595949289332 ], "wc_reply_authors_avg": [ 483.0, 254.96274237621463 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10822310561181575410&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=r1xlvi0qYm", "pdf": "https://openreview.net/pdf?id=r1xlvi0qYm", "email": ";;", "author_num": 3 }, { "id": "r1xrb3CqtQ", "title": "Latent Domain Transfer: Crossing modalities with Bridging Autoencoders", "track": "main", "status": "Reject", "tldr": "Conditional VAE on top of latent spaces of pre-trained generative models that enables transfer between drastically different domains while preserving locality and semantic alignment.", "abstract": "Domain transfer is a exciting and challenging branch of machine learning because models must learn to smoothly transfer between domains, preserving local variations and capturing many aspects of variation without labels. \nHowever, most successful applications to date require the two domains to be closely related (ex. image-to-image, video-video), \nutilizing similar or shared networks to transform domain specific properties like texture, coloring, and line shapes. \nHere, we demonstrate that it is possible to transfer across modalities (ex. image-to-audio) by first abstracting the data with latent generative models and then learning transformations between latent spaces. \nWe find that a simple variational autoencoder is able to learn a shared latent space to bridge between two generative models in an unsupervised fashion, and even between different types of models (ex. variational autoencoder and a generative adversarial network). \nWe can further impose desired semantic alignment of attributes with a linear classifier in the shared latent space. 
\nThe proposed variation autoencoder enables preserving both locality and semantic alignment through the transfer process, as shown in the qualitative and quantitative evaluations.\nFinally, the hierarchical structure decouples the cost of training the base generative models and semantic alignments, enabling computationally efficient and data efficient retraining of personalized mapping functions. ", "keywords": "Generative Model;Latent Space;Domain Transfer", "primary_area": "", "supplementary_material": "", "author": "Yingtao Tian;Jesse Engel", "authorids": "yittian@cs.stonybrook.edu;jesseengel@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntian2019latent,\ntitle={Latent Domain Transfer: Crossing modalities with Bridging Autoencoders},\nauthor={Yingtao Tian and Jesse Engel},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xrb3CqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=r1xrb3CqtQ", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "wc_review": "184;156;160", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "687;425;366", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 166.66666666666666, 12.36482466066094 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 492.6666666666667, 139.50945806248725 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13496613189697872073&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1xurn0cKQ", "title": "Correction Networks: Meta-Learning for Zero-Shot Learning", "track": "main", "status": "Reject", "tldr": "A model learns to perform zero-shot classification using a meta-learner that is trained to update predictions based on the learner's training data.", "abstract": "We propose a model that learns to perform zero-shot classification using a meta-learner that is trained to produce a correction to the output of a previously trained learner. The model consists of two modules: a task module that supplies an initial prediction, and a correction module that updates the initial prediction. The task module is the learner and the correction module is the meta-learner. The correction module is trained in an episodic approach whereby many different task modules are trained on various subsets of the total training data, with the rest being used as unseen data for the correction module. The correction module takes as input a representation of the task module's training data so that the predicted correction is a function of the task module's training data. The correction module is trained to update the task module's prediction to be closer to the target value. This approach leads to state-of-the-art performance for zero-shot classification on natural language class descriptions on the CUB and NAB datasets. ", "keywords": "zero-shot learning;image classification;fine-grained classification;meta-learning", "primary_area": "", "supplementary_material": "", "author": "R. Lily Hu;Caiming Xiong;Richard Socher", "authorids": "rlilyhu@gmail.com;cxiong@salesforce.com;rsocher@salesforce.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhu2019correction,\ntitle={Correction Networks: Meta-Learning for Zero-Shot Learning},\nauthor={R. Lily Hu and Caiming Xiong and Richard Socher},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xurn0cKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1xurn0cKQ", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;5;4", "wc_review": "368;549;524", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "738;745;775", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 480.3333333333333, 80.08467740807573 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 752.6666666666666, 16.048537489614297 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1498515563955830327&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "GamePad: A Learning Environment for Theorem Proving", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/781", "id": "r1xwKoR9Y7", "author_site": "Daniel Huang, Prafulla Dhariwal, Dawn Song, Ilya Sutskever", "tldr": "We introduce a system called GamePad to explore the application of machine learning methods to theorem proving in the Coq proof assistant.", "abstract": "In this paper, we introduce a system called GamePad that can be used to explore the application of machine learning methods to theorem proving in the Coq proof assistant. Interactive theorem provers such as Coq enable users to construct machine-checkable proofs in a step-by-step manner. Hence, they provide an opportunity to explore theorem proving with human supervision. We use GamePad to synthesize proofs for a simple algebraic rewrite problem and train baseline models for a formalization of the Feit-Thompson theorem.
We address position evaluation (i.e., predict the number of proof steps left) and tactic prediction (i.e., predict the next proof step) tasks, which arise naturally in tactic-based theorem proving.", "keywords": "Theorem proving;ITP;systems;neural embeddings", "primary_area": "", "supplementary_material": "", "author": "Daniel Huang;Prafulla Dhariwal;Dawn Song;Ilya Sutskever", "authorids": "dehuang@berkeley.edu;prafulla@openai.com;dawnsong@cs.berkeley.edu;ilyasu@openai.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nhuang2018gamepad,\ntitle={GamePad: A Learning Environment for Theorem Proving},\nauthor={Daniel Huang and Prafulla Dhariwal and Dawn Song and Ilya Sutskever},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xwKoR9Y7},\n}", "github": "[![github](/images/github_icon.svg) ml4tp/gamepad](https://github.com/ml4tp/gamepad)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;2;3", "wc_review": "203;600;336", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "223;180;185", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 379.6666666666667, 164.9895619594026 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 196.0, 19.200694431886227 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 127, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10460600857870546205&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=r1xwKoR9Y7", "pdf": "https://openreview.net/pdf?id=r1xwKoR9Y7", "email": ";;;", "author_num": 4 }, { "id": "r1xwS3RqKQ", "title": "Differential Equation Networks", "track": "main", "status": "Reject", "tldr": "We introduce a method to learn the nonlinear activation function for each neuron in the network.", "abstract": "Most deep neural networks use simple, fixed activation functions, such\nas sigmoids or rectified linear units, regardless of domain or\nnetwork structure. We introduce differential equation networks, an\nimprovement to modern neural networks in which each neuron learns the\nparticular nonlinear activation function that it requires. We show\nthat enabling each neuron with the ability to learn its own activation\nfunction results in a more compact network capable of achieving\ncomperable, if not superior performance when compared to much larger\nnetworks. We\nalso showcase the capability of a differential equation neuron to\nlearn behaviors, such as oscillation, currently only obtainable by a\nlarge group of neurons. The ability of\ndifferential equation networks to essentially compress a large neural network, without loss of overall performance\nmakes them suitable for on-device applications, where predictions must\nbe computed locally. 
Our experimental evaluation of real-world and toy\ndatasets show that differential equation networks outperform fixed activatoin networks in several areas.", "keywords": "deep learning;activation function;differential equations", "primary_area": "", "supplementary_material": "", "author": "MohamadAli Torkamani;Phillip Wallis", "authorids": "torkamani@gmail.com;wallis.phillip@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntorkamani2019differential,\ntitle={Differential Equation Networks},\nauthor={MohamadAli Torkamani and Phillip Wallis},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xwS3RqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1xwS3RqKQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "wc_review": "1099;336;661", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 698.6666666666666, 312.63006182316434 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "r1xwqjRcY7", "title": "Probabilistic Semantic Embedding", "track": "main", "status": "Reject", "tldr": "", "abstract": "We present an extension of a variational auto-encoder that creates semantically richcoupled probabilistic latent representations that capture the semantics of multiplemodalities of data. We demonstrate this model through experiments using imagesand textual descriptors as inputs and images as outputs. Our latent representationsare not only capable of driving a decoder to generate novel data, but can also be useddirectly for annotation or classification. 
Using the MNIST and Fashion-MNISTdatasets we show that the embedding not only provides better reconstruction andclassification performance than the current state-of-the-art, but it also allows us toexploit the semantic content of the pretrained word embedding spaces to do taskssuch as image generation from labels outside of those seen during training.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yue Jiao;Jonathon Hare;Adam Pr\u00fcgel-Bennett", "authorids": "yj5y15@ecs.soton.ac.uk;jsh2@ecs.soton.ac.uk;apb@ecs.soton.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\njiao2019probabilistic,\ntitle={Probabilistic Semantic Embedding},\nauthor={Yue Jiao and Jonathon Hare and Adam Pr\u00fcgel-Bennett},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xwqjRcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=r1xwqjRcY7", "pdf_size": 0, "rating": "4;4;7", "confidence": "4;4;3", "wc_review": "163;256;517", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "923;752;1147", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 312.0, 149.84658821608184 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 940.6666666666666, 161.74121992312973 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -1.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "r1xywsC9tQ", "title": "Mapping the hyponymy relation of wordnet onto vector Spaces", "track": "main", "status": "Reject", "tldr": "We investigate mapping the hyponymy relation of wordnet to feature vectors", "abstract": " In this paper, we investigate mapping the hyponymy relation of\n wordnet to feature vectors.\n We aim to model lexical knowledge in such a way that it can be used as\n input in generic machine-learning models, such as phrase entailment\n predictors.\n We propose two models. The first one leverages an existing mapping of\n words to feature vectors (fasttext), and attempts to classify\n such vectors as within or outside of each class. The second model is fully supervised,\n using solely wordnet as a ground truth. It maps each concept to an\n interval or a disjunction thereof.\n On the first model, we approach, but not quite attain state of the\n art performance. 
The second model can achieve near-perfect accuracy.\n", "keywords": "fasttext;hyponymy;wordnet", "primary_area": "", "supplementary_material": "", "author": "Jean-Philippe Bernardy;Aleksandre Maskharashvili", "authorids": "jean-philippe.bernardy@gu.se;aleksandre.maskharashvili@gu.se", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbernardy2019mapping,\ntitle={Mapping the hyponymy relation of wordnet onto vector Spaces},\nauthor={Jean-Philippe Bernardy and Aleksandre Maskharashvili},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xywsC9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=r1xywsC9tQ", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;3;4", "wc_review": "931;307;593", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 610.3333333333334, 255.04160880574412 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:8Kkw01_GTcsJ:scholar.google.com/&scioq=Mapping+the+hyponymy+relation+of+wordnet+onto+vector+Spaces&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "r1xyx3R9tQ", "title": "Prototypical Examples in Deep Learning: Metrics, Characteristics, and Utility", "track": "main", "status": "Reject", "tldr": "We can identify prototypical and outlier examples in machine learning that are quantifiably very different, and make use of them to improve many aspects of neural networks.", "abstract": "Machine learning (ML) research has investigated prototypes: examples that are representative of the behavior to be learned. We systematically evaluate five methods for identifying prototypes, both ones previously introduced as well as new ones we propose, finding all of them to provide meaningful but different interpretations. Through a human study, we confirm that all five metrics are well matched to human intuition. Examining cases where the metrics disagree offers an informative perspective on the properties of data and algorithms used in learning, with implications for data-corpus construction, efficiency, adversarial robustness, interpretability, and other ML aspects. 
In particular, we confirm that the \"train on hard\" curriculum approach can improve accuracy on many datasets and tasks, but that it is strictly worse when there are many mislabeled or ambiguous examples.", "keywords": "prototypes;curriculum learning;interpretability;differential privacy;adversarial robustness", "primary_area": "", "supplementary_material": "", "author": "Nicholas Carlini;Ulfar Erlingsson;Nicolas Papernot", "authorids": "nicholas@carlini.com;ulfar@google.com;papernot@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ncarlini2019prototypical,\ntitle={Prototypical Examples in Deep Learning: Metrics, Characteristics, and Utility},\nauthor={Nicholas Carlini and Ulfar Erlingsson and Nicolas Papernot},\nyear={2019},\nurl={https://openreview.net/forum?id=r1xyx3R9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1xyx3R9tQ", "pdf_size": 0, "rating": "3;5;5", "confidence": "3;4;4", "wc_review": "959;189;381", "wc_reply_reviewers": "1121;283;0", "wc_reply_authors": "3089;987;1126", "reply_reviewers": "2;3;0", "reply_authors": "5;2;2", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 509.6666666666667, 327.2525358530049 ], "wc_reply_reviewers_avg": [ 468.0, 475.9754895650265 ], "wc_reply_authors_avg": [ 1734.0, 959.8086614876252 ], "reply_reviewers_avg": [ 1.6666666666666667, 1.247219128924647 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12746769823056024417&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "r1z1UjA5FX", "title": "Adversarial Defense Via Data Dependent Activation Function and Total Variation Minimization", "track": "main", "status": "Withdraw", "tldr": "We proposal strategies for adversarial defense based on data dependent activation function, total variation minimization, and training data augmentation", "abstract": "We improve the robustness of deep neural nets to adversarial attacks by using an interpolating function as the output activation. This data-dependent activation function remarkably improves both classification accuracy and stability to adversarial perturbations. Together with the total variation minimization of adversarial images and augmented training, under the strongest attack, we achieve up to 20.6%, 50.7%, and 68.7% accuracy improvement w.r.t. the fast gradient sign method, iterative fast gradient sign method, and Carlini-WagnerL2attacks, respectively. Our defense strategy is additive to many of the existing methods. We give an intuitive explanation of our defense strategy via analyzing the geometry of the feature space. For reproducibility, the code will be available on GitHub.", "keywords": "Adversarial Attack;Adversarial Defense;Data Dependent Activation Function;Total Variation Minimization", "primary_area": "", "supplementary_material": "", "author": "Bao Wang;Alex T. Lin;Zuoqiang Shi;Wei Zhu;Penghang Yin;Andrea L. Bertozzi;Stanley J. 
Osher", "authorids": "wangbaonj@gmail.com;atlin@math.ucla.edu;zqshi@mail.tsinghua.edu.cn;zhu@math.duke.edu;yph@g.ucla.edu;bertozzi@math.ucla.edu;sjo@math.ucla.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=r1z1UjA5FX", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8804375157549319639&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "r1zOg309tX", "title": "Understanding the Effectiveness of Lipschitz-Continuity in Generative Adversarial Nets", "track": "main", "status": "Reject", "tldr": "We disclose the fundamental cause of failure in training of GANs, and demonstrate that Lipschitz-continuity is a general solution to this issue.", "abstract": "In this paper, we investigate the underlying factor that leads to the failure and success in training of GANs. Specifically, we study the property of the optimal discriminative function $f^*(x)$ and show that $f^*(x)$ in most GANs can only reflect the local densities at $x$, which means the value of $f^*(x)$ for points in the fake distribution ($P_g$) does not contain any information useful about the location of other points in the real distribution ($P_r$). Given that the supports of the real and fake distributions are usually disjoint, we argue that such a $f^*(x)$ and its gradient tell nothing about \"how to pull $P_g$ to $P_r$\", which turns out to be the fundamental cause of failure in training of GANs. We further demonstrate that a well-defined distance metric (including the dual form of Wasserstein distance with a compacted constraint) does not necessarily ensure the convergence of GANs. 
Finally, we propose Lipschitz-continuity condition as a general solution and show that in a large family of GAN objectives, Lipschitz condition is capable of connecting $P_g$ and $P_r$ through $f^*(x)$ such that the gradient $\\nabla_{\\!x}f^*(x)$ at each sample $x \\sim P_g$ points towards some real sample $y \\sim P_r$.", "keywords": "GANs;Lipschitz-continuity;convergence", "primary_area": "", "supplementary_material": "", "author": "Zhiming Zhou;Yuxuan Song;Lantao Yu;Hongwei Wang;Weinan Zhang;Zhihua Zhang;Yong Yu", "authorids": "heyohai@apex.sjtu.edu.cn;songyuxuan@apex.sjtu.edu.cn;yulantao@apex.sjtu.edu.cn;wanghongwei55@gmail.com;wnzhang@sjtu.edu.cn;zhzhang@math.pku.edu.cn;yyu@apex.sjtu.edu.cn", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nzhou2019understanding,\ntitle={Understanding the Effectiveness of Lipschitz-Continuity in Generative Adversarial Nets},\nauthor={Zhiming Zhou and Yuxuan Song and Lantao Yu and Hongwei Wang and Weinan Zhang and Zhihua Zhang and Yong Yu},\nyear={2019},\nurl={https://openreview.net/forum?id=r1zOg309tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=r1zOg309tX", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "wc_review": "260;747;535", "wc_reply_reviewers": "1187;0;0", "wc_reply_authors": "3727;653;442", "reply_reviewers": "3;0;0", "reply_authors": "9;1;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 514.0, 199.3706765466443 ], "wc_reply_reviewers_avg": [ 395.6666666666667, 559.5571661789546 ], "wc_reply_authors_avg": [ 1607.3333333333333, 1501.3039517551254 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 4.0, 3.559026084010437 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3099369960395076011&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4 }, { "id": "r1zmVhCqKm", "title": "Text Infilling", "track": "main", "status": "Reject", "tldr": "We study a general task of text infilling that fills missing portions of given text; an self-attention model is developed.", "abstract": "Recent years have seen remarkable progress of text generation in different contexts, including the most common setting of generating text from scratch, the increasingly popular paradigm of retrieval and editing, and others. Text infilling, which fills missing text portions of a sentence or paragraph, is also of numerous use in real life. Previous work has focused on restricted settings, by either assuming single word per missing portion, or limiting to single missing portion to the end of text. This paper studies the general task of text infilling, where the input text can have an arbitrary number of portions to be filled, each of which may require an arbitrary unknown number of tokens. \nWe develop a self-attention model with segment-aware position encoding for precise global context modeling.\nWe further create a variety of supervised data by masking out text in different domains with varying missing ratios and mask strategies. 
Extensive experiments show the proposed model performs significantly better than other methods, and generates meaningful text patches.", "keywords": "text generation;text infilling;self attention;sequence to sequence", "primary_area": "", "supplementary_material": "", "author": "Wanrong Zhu;Zhiting Hu;Eric P. Xing", "authorids": "zhuwr56@gmail.com;zhitinghu@gmail.com;epxing@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhu2019text,\ntitle={Text Infilling},\nauthor={Wanrong Zhu and Zhiting Hu and Eric P. Xing},\nyear={2019},\nurl={https://openreview.net/forum?id=r1zmVhCqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=r1zmVhCqKm", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "wc_review": "322;375;255", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 317.3333333333333, 49.100803342602134 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17875953644453449350&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "r1znKiAcY7", "title": "Few-shot Classification on Graphs with Structural Regularized GCNs", "track": "main", "status": "Reject", "tldr": "", "abstract": "We consider the fundamental problem of semi-supervised node classification in attributed graphs with a focus on \\emph{few-shot} learning. Here, we propose Structural Regularized Graph Convolutional Networks (SRGCN), novel neural network architectures extending the well-known GCN structures by stacking transposed convolutional layers for reconstruction of input features. We add a reconstruction error term in the loss function as a regularizer. Unlike standard regularization such as $L_1$ or $L_2$, which controls the model complexity by including a penalty term depends solely on parameters, our regularization function is parameterized by a trainable neural network whose structure depends on the topology of the underlying graph. The new approach effectively addresses the shortcomings of previous graph convolution-based techniques for learning classifiers in the few-shot regime and significantly improves generalization performance over original GCNs when the number of labeled samples is insufficient. 
Experimental studies on three challenging benchmarks demonstrate that the proposed approach has matched state-of-the-art results and can improve classification accuracies by a notable margin when there are very few examples from each class.", "keywords": "Graph Convolutional Networks;Few-shot;Classification", "primary_area": "", "supplementary_material": "", "author": "Shengzhong Zhang;Ziang Zhou;Zengfeng Huang;Zhongyu Wei", "authorids": "17210980007@fudan.edu.cn;15300180085@fudan.edu.cn;huangzf@fudan.edu.cn;zywei@fudan.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nzhang2019fewshot,\ntitle={Few-shot Classification on Graphs with Structural Regularized {GCN}s},\nauthor={Shengzhong Zhang and Ziang Zhou and Zengfeng Huang and Zhongyu Wei},\nyear={2019},\nurl={https://openreview.net/forum?id=r1znKiAcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1znKiAcY7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "wc_review": "459;247;202", "wc_reply_reviewers": "0;119;0", "wc_reply_authors": "1047;418;397", "reply_reviewers": "0;1;0", "reply_authors": "2;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 302.6666666666667, 112.06049953286643 ], "wc_reply_reviewers_avg": [ 39.666666666666664, 56.09713797413277 ], "wc_reply_authors_avg": [ 620.6666666666666, 301.5850718380397 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=326069690134614704&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "r1ztwiCcYQ", "title": "VARIATIONAL SGD: DROPOUT , GENERALIZATION AND CRITICAL POINT AT THE END OF CONVEXITY", "track": "main", "status": "Reject", "tldr": "Proposed method for finding the most generalizable solution that is stable w.r.t. perturbations of trainig data.", "abstract": "The goal of the paper is to propose an algorithm for learning the most generalizable solution from given training data. It is shown that Bayesian approach leads to a solution that dependent on statistics of training data and not on particular\nsamples. The solution is stable under perturbations of training data because it is defined by an integral contribution of multiple maxima of the likelihood and not by a single global maximum. Specifically, the Bayesian probability distribution\nof parameters (weights) of a probabilistic model given by a neural network is estimated via recurrent variational approximations. Derived recurrent update rules correspond to SGD-type rules for finding a minimum of an effective loss that is an average of an original negative log-likelihood over the Gaussian distributions of weights, which makes it a function of means and variances. The effective loss is convex for large variances and non-convex in the limit of small variances. 
Among stationary solutions of the update rules there are trivial solutions with zero variances at local minima of the original loss and a single non-trivial solution with finite variances that is a critical point at the end of convexity of the effective loss\nin the mean-variance space. At the critical point both first- and second-order gradients of the effective loss w.r.t. means are zero. The empirical study confirms that the critical point represents the most generalizable solution. While the location of\nthe critical point in the weight space depends on specifics of the used probabilistic model some properties at the critical point are universal and model independent.", "keywords": "Bayesian inference;neural networks;generalization;critical point solution", "primary_area": "", "supplementary_material": "", "author": "Michael Tetelman", "authorids": "michael.tetelman@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ntetelman2019variational,\ntitle={{VARIATIONAL} {SGD}: {DROPOUT} , {GENERALIZATION} {AND} {CRITICAL} {POINT} {AT} {THE} {END} {OF} {CONVEXITY}},\nauthor={Michael Tetelman},\nyear={2019},\nurl={https://openreview.net/forum?id=r1ztwiCcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=r1ztwiCcYQ", "pdf_size": 0, "rating": "2;2;4", "confidence": "5;4;3", "wc_review": "51;320;715", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;49;695", "reply_reviewers": "0;0;0", "reply_authors": "0;1;1", "rating_avg": [ 2.6666666666666665, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 362.0, 272.6988571055381 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 248.0, 316.7091199613088 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:09JScqdOESoJ:scholar.google.com/&scioq=VARIATIONAL+SGD:+DROPOUT+,+GENERALIZATION+AND+CRITICAL+POINT+AT+THE+END+OF+CONVEXITY&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/845", "id": "rJ4km2R5t7", "author_site": "Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, Sam Bowman", "tldr": "We present a multi-task benchmark and analysis platform for evaluating generalization in natural language understanding systems.", "abstract": "For natural language understanding (NLU) technology to be maximally useful, it must be able to process language in a way that is not exclusive to a single task, genre, or dataset. In pursuit of this objective, we introduce the General Language Understanding Evaluation (GLUE) benchmark, a collection of tools for evaluating the performance of models across a diverse set of existing NLU tasks. By including tasks with limited training data, GLUE is designed to favor and encourage models that share general linguistic knowledge across tasks. GLUE also includes a hand-crafted diagnostic test suite that enables detailed linguistic analysis of models. 
We evaluate baselines based on current methods for transfer and representation learning and find that multi-task training on all tasks performs better than training a separate model per task. However, the low absolute performance of our best model indicates the need for improved general NLU systems.", "keywords": "natural language understanding;multi-task learning;evaluation", "primary_area": "", "supplementary_material": "", "author": "Alex Wang;Amanpreet Singh;Julian Michael;Felix Hill;Omer Levy;Samuel R. Bowman", "authorids": "alexwang@nyu.edu;amanpreet@nyu.edu;julianjm@cs.washington.edu;felixhill@google.com;omerlevy@cs.washington.edu;bowman@nyu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nwang2018glue,\ntitle={{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},\nauthor={Alex Wang and Amanpreet Singh and Julian Michael and Felix Hill and Omer Levy and Samuel R. Bowman},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJ4km2R5t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;7;8", "confidence": "2;1;4", "wc_review": "402;194;212", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "221;120;193", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 1.247219128924647 ], "confidence_avg": [ 2.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 269.3333333333333, 94.0968767931339 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 178.0, 42.57542327055207 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 8516, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17443412968683100072&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=rJ4km2R5t7", "pdf": "https://openreview.net/pdf?id=rJ4km2R5t7", "email": ";;;;;", "author_num": 6 }, { "id": "rJ4qXnCqFX", "title": "Probabilistic Knowledge Graph Embeddings", "track": "main", "status": "Reject", "tldr": "Scalable hyperparameter learning for knowledge graph embedding models using variational EM", "abstract": "We develop a probabilistic extension of state-of-the-art embedding models for link prediction in relational knowledge graphs. Knowledge graphs are collections of relational facts, where each fact states that a certain relation holds between two entities, such as people, places, or objects. We argue that knowledge graphs should be treated within a Bayesian framework because even large knowledge graphs typically contain only few facts per entity, leading effectively to a small data problem where parameter uncertainty matters. We introduce a probabilistic reinterpretation of the DistMult (Yang et al., 2015) and ComplEx (Trouillon et al., 2016) models and employ variational inference to estimate a lower bound on the marginal likelihood of the data. We find that the main benefit of the Bayesian approach is that it allows for efficient, gradient based optimization over hyperparameters, which would lead to divergences in a non-Bayesian treatment. 
Models with such learned hyperparameters improve over the state-of-the-art by a significant margin, as we demonstrate on several benchmarks.", "keywords": "knowledge graph;variational inference;probabilistic models;representation learning", "primary_area": "", "supplementary_material": "", "author": "Farnood Salehi;Robert Bamler;Stephan Mandt", "authorids": "farnood.salehi@epfl.ch;robert.bamler@gmail.com;stephan.mandt@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsalehi2019probabilistic,\ntitle={Probabilistic Knowledge Graph Embeddings},\nauthor={Farnood Salehi and Robert Bamler and Stephan Mandt},\nyear={2019},\nurl={https://openreview.net/forum?id=rJ4qXnCqFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJ4qXnCqFX", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;2;3", "wc_review": "352;517;240", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "761;753;476", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 369.6666666666667, 113.7726778370898 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 663.3333333333334, 132.5049265331503 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14622247052318845588&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "rJ4vlh0qtm", "title": "SSoC: Learning Spontaneous and Self-Organizing Communication for Multi-Agent Collaboration", "track": "main", "status": "Reject", "tldr": "This paper proposes a spontaneous and self-organizing communication (SSoC) learning scheme for multi-agent RL tasks.", "abstract": "Multi-agent collaboration is required by numerous real-world problems. Although distributed setting is usually adopted by practical systems, local range communication and information aggregation still matter in fulfilling complex tasks. For multi-agent reinforcement learning, many previous studies have been dedicated to design an effective communication architecture. However, existing models usually suffer from an ossified communication structure, e.g., most of them predefine a particular communication mode by specifying a fixed time frequency and spatial scope for agents to communicate regardless of necessity. Such design is incapable of dealing with multi-agent scenarios that are capricious and complicated, especially when only partial information is available. Motivated by this, we argue that the solution is to build a spontaneous and self-organizing communication (SSoC) learning scheme. By treating the communication behaviour as an explicit action, SSoC learns to organize communication in an effective and efficient way. Particularly, it enables each agent to spontaneously decide when and who to send messages based on its observed states. In this way, a dynamic inter-agent communication channel is established in an online and self-organizing manner. The agents also learn how to adaptively aggregate the received messages and its own hidden states to execute actions. 
Various experiments have been conducted to demonstrate that SSoC really learns intelligent message passing among agents located far apart. With such agile communications, we observe that effective collaboration tactics emerge which have not been mastered by the compared baselines.", "keywords": "reinforcement learning;multi-agent learning;multi-agent communication;deep learning", "primary_area": "", "supplementary_material": "", "author": "Xiangyu Kong;Jing Li;Bo Xin;Yizhou Wang", "authorids": "kong@pku.edu.cn;lijingg@pku.edu.cn;jimxinbo@gmail.com;yizhou.wang@pku.edu.cn", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkong2019ssoc,\ntitle={{SS}oC: Learning Spontaneous and Self-Organizing Communication for Multi-Agent Collaboration},\nauthor={Xiangyu Kong and Jing Li and Bo Xin and Yizhou Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=rJ4vlh0qtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJ4vlh0qtm", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;4;3", "wc_review": "93;508;309", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 303.3333333333333, 169.47041695299572 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sJnHDsDiDX0J:scholar.google.com/&scioq=SSoC:+Learning+Spontaneous+and+Self-Organizing+Communication+for+Multi-Agent+Collaboration&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJEGwo0cFX", "title": "An Attention-Based Model for Learning Dynamic Interaction Networks", "track": "main", "status": "Withdraw", "tldr": "A graph neural network able to automatically learn and leverage a dynamic interactive graph structure", "abstract": "While machine learning models achieve human-comparable performance on sequential data, exploiting structured knowledge is still a challenging problem. Spatio-temporal graphs have been proved to be a useful tool to abstract interaction graphs and previous works exploits carefully designed feed-forward architecture to preserve such structure. We argue to scale such network design to real-world problem, a model needs to automatically learn a meaningful representation of the possible relations. Learning such interaction structure is not trivial: on the one hand, a model has to discover the hidden relations between different problem factors in an unsupervised way; on the other hand, the mined relations have to be interpretable. \n\nIn this paper, we propose an attention module able to project a graph sub-structure in a fixed size embedding, preserving the influence that the neighbours exert on a given vertex. 
On a comprehensive evaluation done on real-world as well as toy task, we found our model competitive against strong baselines.", "keywords": "dynamic networks;interaction graphs;attention model", "primary_area": "", "supplementary_material": "", "author": "Sandro Cavallari;Vincent W Zheng;Hongyun Cai;Erik Cambria", "authorids": "sandro001@e.ntu.edu.sg;vincent.zheng@adsc-create.edu.sg;hongyun.c@adsc.com.sg;cambria@ntu.edu.sg", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJEGwo0cFX", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;3", "wc_review": "275;655;217", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 382.3333333333333, 194.2529851050486 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15510324768300724269&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Learning Robust Representations by Projecting Superficial Statistics Out", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/793", "id": "rJEjjoR9K7", "author_site": "Haohan Wang, Zexue He, Zachary Lipton, Eric P Xing", "tldr": "Building on previous work on domain generalization, we hope to produce a classifier that will generalize to previously unseen domains, even when domain identifiers are not available during training.", "abstract": "Despite impressive performance as evaluated on i.i.d. holdout data, deep neural networks depend heavily on superficial statistics of the training data and are liable to break under distribution shift. For example, subtle changes to the background or texture of an image can break a seemingly powerful classifier. Building on previous work on domain generalization, we hope to produce a classifier that will generalize to previously unseen domains, even when domain identifiers are not available during training. This setting is challenging because the model may extract many distribution-specific (superficial) signals together with distribution-agnostic (semantic) signals. To overcome this challenge, we incorporate the gray-level co-occurrence matrix (GLCM) to extract patterns that our prior knowledge suggests are superficial: they are sensitive to the texture but unable to capture the gestalt of an image. Then we introduce two techniques for improving our networks' out-of-sample performance. The first method is built on the reverse gradient method that pushes our model to learn representations from which the GLCM representation is not predictable. 
The second method is built on the independence introduced by projecting the model's representation onto the subspace orthogonal to GLCM representation's.\nWe test our method on the battery of standard domain generalization data sets and, interestingly, achieve comparable or better performance as compared to other domain generalization methods that explicitly require samples from the target distribution for training.", "keywords": "domain generalization;robustness", "primary_area": "", "supplementary_material": "", "author": "Haohan Wang;Zexue He;Zachary C. Lipton;Eric P. Xing", "authorids": "haohanw@cs.cmu.edu;zexueh@mail.bnu.edu.cn;zlipton@cmu.edu;epxing@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nwang2018learning,\ntitle={Learning Robust Representations by Projecting Superficial Statistics Out},\nauthor={Haohan Wang and Zexue He and Eric P. Xing},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJEjjoR9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;9", "confidence": "4;4;3", "wc_review": "563;158;104", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "150;145;43", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 275.0, 204.83652018133876 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 112.66666666666667, 49.30404536028346 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 270, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10744255442611850331&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rJEjjoR9K7", "pdf": "https://openreview.net/pdf?id=rJEjjoR9K7", "email": ";;;", "author_num": 4 }, { "id": "rJEyrjRqYX", "title": "Reduced-Gate Convolutional LSTM Design Using Predictive Coding for Next-Frame Video Prediction", "track": "main", "status": "Reject", "tldr": "A novel reduced-gate convolutional LSTM design using predictive coding for next-frame video prediction", "abstract": "Spatiotemporal sequence prediction is an important problem in deep learning. We\nstudy next-frame video prediction using a deep-learning-based predictive coding\nframework that uses convolutional, long short-term memory (convLSTM) modules.\nWe introduce a novel reduced-gate convolutional LSTM architecture. Our\nreduced-gate model achieves better next-frame prediction accuracy than the original\nconvolutional LSTM while using a smaller parameter budget, thereby reducing\ntraining time. We tested our reduced gate modules within a predictive coding architecture\non the moving MNIST and KITTI datasets. 
We found that our reduced-gate\nmodel has a significant reduction of approximately 40 percent of the total\nnumber of training parameters and training time in comparison with the standard\nLSTM model which makes it attractive for hardware implementation especially\non small devices.", "keywords": "rgcLSTM;convolutional LSTM;unsupervised learning;predictive coding;video prediction;moving MNIST;KITTI datasets;deep learning", "primary_area": "", "supplementary_material": "", "author": "Nelly Elsayed;Anthony S. Maida;Magdy Bayoumi", "authorids": "nelly.elsayed5@gmail.com;maida@louisiana.edu;mab0778@louisiana.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nelsayed2019reducedgate,\ntitle={Reduced-Gate Convolutional {LSTM} Design Using Predictive Coding for Next-Frame Video Prediction},\nauthor={Nelly Elsayed and Anthony S. Maida and Magdy Bayoumi},\nyear={2019},\nurl={https://openreview.net/forum?id=rJEyrjRqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJEyrjRqYX", "pdf_size": 0, "rating": "3;5;7", "confidence": "5;4;4", "wc_review": "403;356;139", "wc_reply_reviewers": "598;196;0", "wc_reply_authors": "685;818;37", "reply_reviewers": "1;2;0", "reply_authors": "1;2;1", "rating_avg": [ 5.0, 1.632993161855452 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 299.3333333333333, 114.98502317934955 ], "wc_reply_reviewers_avg": [ 264.6666666666667, 248.91408602612714 ], "wc_reply_authors_avg": [ 513.3333333333334, 341.16695163642225 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LTGV1o73wDYJ:scholar.google.com/&scioq=Reduced-Gate+Convolutional+LSTM+Design+Using+Predictive+Coding+for+Next-Frame+Video+Prediction&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJG8asRqKX", "title": "A Deep Learning Approach for Dynamic Survival Analysis with Competing Risks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Currently available survival analysis methods are limited in their ability to deal with complex, heterogeneous, and longitudinal data such as that available in primary care records, or in their ability to deal with multiple competing risks. This paper develops a novel deep learning architecture that flexibly incorporates the available longitudinal data comprising various repeated measurements (rather than only the last available measurements) in order to issue dynamically updated survival predictions for one or multiple competing risk(s). Unlike existing works in the survival analysis on the basis of longitudinal data, the proposed method learns the time-to-event distributions without specifying underlying stochastic assumptions of the longitudinal or the time-to-event processes. Thus, our method is able to learn associations between the longitudinal data and the various associated risks in a fully data-driven fashion. We demonstrate the power of our method by applying it to real-world longitudinal datasets and show a drastic improvement over state-of-the-art methods in discriminative performance. 
Furthermore, our analysis of the variable importance and dynamic survival predictions will yield a better understanding of the predicted risks which will result in more effective health care.", "keywords": "dynamic survival analysis;survival analysis;longitudinal measurements;competing risks", "primary_area": "", "supplementary_material": "", "author": "Changhee Lee;Mihaela van der Schaar", "authorids": "chl8856@gmail.com;mihaela@ee.ucla.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlee2019a,\ntitle={A Deep Learning Approach for Dynamic Survival Analysis with Competing Risks},\nauthor={Changhee Lee and Mihaela van der Schaar},\nyear={2019},\nurl={https://openreview.net/forum?id=rJG8asRqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJG8asRqKX", "pdf_size": 0, "rating": "4;4;8", "confidence": "3;4;4", "wc_review": "422;299;227", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1053;581;192", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 316.0, 80.51086883148137 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 608.6666666666666, 352.04576722668065 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9851677714225749751&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJGgFjA9FQ", "title": "Explaining AlphaGo: Interpreting Contextual Effects in Neural Networks", "track": "main", "status": "Withdraw", "tldr": "This paper presents methods to disentangle and interpret contextual effects that are encoded in a deep neural network.", "abstract": "This paper presents two methods to disentangle and interpret contextual effects that are encoded in a pre-trained deep neural network. Unlike convolutional studies that visualize image appearances corresponding to the network output or a neural activation from a global perspective, our research aims to clarify how a certain input unit (dimension) collaborates with other units (dimensions) to constitute inference patterns of the neural network and thus contribute to the network output. The analysis of local contextual effects w.r.t. certain input units is of special values in real applications. In particular, we used our methods to explain the gaming strategy of the alphaGo Zero model in experiments, and our method successfully disentangled the rationale of each move during the game.", "keywords": "Interpretability;Deep learning;alphaGo", "primary_area": "", "supplementary_material": "", "author": "Zenan Ling;Haotian Ma;Yu Yang;Robert C. 
Qiu;Song-Chun Zhu;Quanshi Zhang", "authorids": "lingzenan@sjtu.edu.cn;11612807@mail.sustc.edu.cn;yy19970901@ucla.edu;rqiu@tntech.edu;sczhu@stat.ucla.edu;zqs1022@sjtu.edu.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJGgFjA9FQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;5;4", "wc_review": "310;654;321", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 428.3333333333333, 159.63360826871292 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15524243405461609514&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rJMcdsA5FX", "title": "On Accurate Evaluation of GANs for Language Generation", "track": "main", "status": "Reject", "tldr": "We discuss how to evaluate GANs for language generation, propose a protocol and show that simple Language Models achieve results as good as GANs.", "abstract": "Generative Adversarial Networks (GANs) are a promising approach to language generation. The latest works introducing novel GAN models for language generation use n-gram based metrics for evaluation and only report single scores of the best run. In this paper, we argue that this often misrepresents the true picture and does not tell the full story, as GAN models can be extremely sensitive to the random initialization and small deviations from the best hyperparameter choice. In particular, we demonstrate that the previously used BLEU score is not sensitive to semantic deterioration of generated texts and propose alternative metrics that better capture the quality and diversity of the generated samples. 
We also conduct a set of experiments comparing a number of GAN models for text with a conventional Language Model (LM) and find that none of the considered models performs convincingly better than the LM.", "keywords": "GANs;Evaluation;Generative Models", "primary_area": "", "supplementary_material": "", "author": "Stanislau Semeniuta;Aliaksei Severyn;Sylvain Gelly", "authorids": "stas@inb.uni-luebeck.de;severyn@google.com;sylvaingelly@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsemeniuta2019on,\ntitle={On Accurate Evaluation of {GAN}s for Language Generation},\nauthor={Stanislau Semeniuta and Aliaksei Severyn and Sylvain Gelly},\nyear={2019},\nurl={https://openreview.net/forum?id=rJMcdsA5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJMcdsA5FX", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "wc_review": "208;425;384", "wc_reply_reviewers": "32;0;0", "wc_reply_authors": "69;50;0", "reply_reviewers": "1;0;0", "reply_authors": "1;1;0", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 339.0, 94.1311142325781 ], "wc_reply_reviewers_avg": [ 10.666666666666666, 15.084944665313014 ], "wc_reply_authors_avg": [ 39.666666666666664, 29.101355447622865 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 102, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12672682286833885635&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "On Computation and Generalization of Generative Adversarial Networks under Spectrum Control", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/866", "id": "rJNH6sAqY7", "author_site": "Haoming Jiang, Zhehui Chen, Minshuo Chen, Feng Liu, Dingding Wang, Tuo Zhao", "tldr": "", "abstract": "Generative Adversarial Networks (GANs), though powerful, are hard to train. Several recent works (Brock et al., 2016; Miyato et al., 2018) suggest that controlling the spectra of weight matrices in the discriminator can significantly improve the training of GANs. Motivated by their discovery, we propose a new framework for training GANs, which allows more flexible spectrum control (e.g., making the weight matrices of the discriminator have slow singular value decays). Specifically, we propose a new reparameterization approach for the weight matrices of the discriminator in GANs, which allows us to directly manipulate the spectra of the weight matrices through various regularizers and constraints, without intensively computing singular value decompositions. Theoretically, we further show that the spectrum control improves the generalization ability of GANs.
Our experiments on CIFAR-10, STL-10, and ImageNet datasets confirm that compared to other competitors, our proposed method is capable of generating images with better or equal quality by utilizing spectral normalization and encouraging the slow singular value decay.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Haoming Jiang;Zhehui Chen;Minshuo Chen;Feng Liu;Dingding Wang;Tuo Zhao", "authorids": "jianghm@gatech.edu;zhchen@gatech.edu;mchen393@gatech.edu;fliu2016@fau.edu;wangd@fau.edu;tourzhao@gatech.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\njiang2018on,\ntitle={On Computation and Generalization of Generative Adversarial Networks under Spectrum Control},\nauthor={Haoming Jiang and Zhehui Chen and Minshuo Chen and Feng Liu and Dingding Wang and Tuo Zhao},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJNH6sAqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "2;4;4", "wc_review": "206;1296;382", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "191;538;272", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 628.0, 477.78098190140076 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 333.6666666666667, 148.22130601082813 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.8660254037844387, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2465017104621578394&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rJNH6sAqY7", "pdf": "https://openreview.net/pdf?id=rJNH6sAqY7", "email": ";;;;;", "author_num": 6 }, { "title": "Large-Scale Study of Curiosity-Driven Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/930", "id": "rJNwDjAqYX", "author_site": "Yuri Burda, Harrison Edwards, Deepak Pathak, Amos Storkey, Trevor Darrell, Alexei Efros", "tldr": "An agent trained only with curiosity, and no extrinsic reward, does surprisingly well on 54 popular environments, including the suite of Atari games, Mario etc.", "abstract": "Reinforcement learning algorithms rely on carefully engineered rewards from the environment that are extrinsic to the agent. However, annotating each environment with hand-designed, dense rewards is difficult and not scalable, motivating the need for developing reward functions that are intrinsic to the agent. \nCuriosity is such intrinsic reward function which uses prediction error as a reward signal. In this paper: (a) We perform the first large-scale study of purely curiosity-driven learning, i.e. {\em without any extrinsic rewards}, across $54$ standard benchmark environments, including the Atari game suite. Our results show surprisingly good performance as well as a high degree of alignment between the intrinsic curiosity objective and the hand-designed extrinsic rewards of many games.
(b) We investigate the effect of using different feature spaces for computing prediction error and show that random features are sufficient for many popular RL game benchmarks, but learned features appear to generalize better (e.g. to novel game levels in Super Mario Bros.). (c) We demonstrate limitations of the prediction-based rewards in stochastic setups. Game-play videos and code are at https://doubleblindsupplementary.github.io/large-curiosity/.", "keywords": "exploration;curiosity;intrinsic reward;no extrinsic reward;unsupervised;no-reward;skills", "primary_area": "", "supplementary_material": "", "author": "Yuri Burda;Harri Edwards;Deepak Pathak;Amos Storkey;Trevor Darrell;Alexei A. Efros", "authorids": "yburda@openai.com;harri@openai.com;pathak@berkeley.edu;a.storkey@ed.ac.uk;trevor@eecs.berkeley.edu;efros@eecs.berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nburda2018largescale,\ntitle={Large-Scale Study of Curiosity-Driven Learning},\nauthor={Yuri Burda and Harri Edwards and Deepak Pathak and Amos Storkey and Trevor Darrell and Alexei A. Efros},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJNwDjAqYX},\n}", "github": "[![github](/images/github_icon.svg) openai/large-scale-curiosity](https://github.com/openai/large-scale-curiosity) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=rJNwDjAqYX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;9", "confidence": "4;3;5", "wc_review": "1131;226;267", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "664;320;416", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 541.3333333333334, 417.29312905385297 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 466.6666666666667, 144.93523457814604 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.6546536707079772, "gs_citation": 958, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6931272873542879959&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=rJNwDjAqYX", "pdf": "https://openreview.net/pdf?id=rJNwDjAqYX", "email": ";;;;;", "author_num": 6 }, { "id": "rJVoEiCqKQ", "title": "Deep Perm-Set Net: Learn to predict sets with unknown permutation and cardinality using deep neural networks", "track": "main", "status": "Reject", "tldr": "We present a novel approach for learning to predict sets with unknown permutation and cardinality using feed-forward deep neural networks.", "abstract": "Many real-world problems, e.g. object detection, have outputs that are naturally expressed as sets of entities. This creates a challenge for traditional deep neural networks which naturally deal with structured outputs such as vectors, matrices or tensors. We present a novel approach for learning to predict sets with unknown permutation and cardinality using deep neural networks. 
Specifically, in our formulation we incorporate the permutation as unobservable variable and estimate its distribution during the learning process using alternating optimization. We demonstrate the validity of this new formulation on two relevant vision problems: object detection, for which our formulation outperforms state-of-the-art detectors such as Faster R-CNN and YOLO, and a complex CAPTCHA test, where we observe that, surprisingly, our set based network acquired the ability of mimicking arithmetics without any rules being coded.", "keywords": "Set learning;Permutation invariant;Object detection;CAPTCHA test", "primary_area": "", "supplementary_material": "", "author": "S. Hamid Rezatofighi;Roman Kaskman;Farbod T. Motlagh;Qinfeng Shi;Daniel Cremers;Laura Leal-Taix\u00e9;Ian Reid", "authorids": "hamid.rezatofighi@adelaide.edu.au;roman.kaskman@tum.de;farbod.motlagh@student.adelaide.edu.au;javen.shi@adelaide.edu.au;cremers@tum.de;leal.taixe@tum.de;ian.reid@adelaide.edu.au", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nrezatofighi2019deep,\ntitle={Deep Perm-Set Net: Learn to predict sets with unknown permutation and cardinality using deep neural networks},\nauthor={S. Hamid Rezatofighi and Roman Kaskman and Farbod T. Motlagh and Qinfeng Shi and Daniel Cremers and Laura Leal-Taix\u00e9 and Ian Reid},\nyear={2019},\nurl={https://openreview.net/forum?id=rJVoEiCqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJVoEiCqKQ", "pdf_size": 0, "rating": "3;3;7", "confidence": "4;3;3", "wc_review": "520;727;181", "wc_reply_reviewers": "221;324;0", "wc_reply_authors": "432;853;134", "reply_reviewers": "1;1;0", "reply_authors": "2;1;1", "rating_avg": [ 4.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 476.0, 225.06443521800597 ], "wc_reply_reviewers_avg": [ 181.66666666666666, 135.164919840747 ], "wc_reply_authors_avg": [ 473.0, 294.95875417872696 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5539818345421952330&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "title": "Slalom: Fast, Verifiable and Private Execution of Neural Networks in Trusted Hardware", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/655", "id": "rJVorjCcKQ", "author_site": "Florian Tramer, Dan Boneh", "tldr": "We accelerate secure DNN inference in trusted execution environments (by a factor 4x-20x) by selectively outsourcing the computation of linear layers to a faster yet untrusted co-processor.", "abstract": "As Machine Learning (ML) gets applied to security-critical or sensitive domains, there is a growing need for integrity and privacy for outsourced ML computations. A pragmatic solution comes from Trusted Execution Environments (TEEs), which use hardware and software protections to isolate sensitive computations from the untrusted software stack. However, these isolation guarantees come at a price in performance, compared to untrusted alternatives. 
This paper initiates the study of high performance execution of Deep Neural Networks (DNNs) in TEEs by efficiently partitioning DNN computations between trusted and untrusted devices. Building upon an efficient outsourcing scheme for matrix multiplication, we propose Slalom, a framework that securely delegates execution of all linear layers in a DNN from a TEE (e.g., Intel SGX or Sanctum) to a faster, yet untrusted, co-located processor. We evaluate Slalom by running DNNs in an Intel SGX enclave, which selectively delegates work to an untrusted GPU. For canonical DNNs (VGG16, MobileNet and ResNet variants) we obtain 6x to 20x increases in throughput for verifiable inference, and 4x to 11x for verifiable and private inference.", "keywords": "Trusted hardware;integrity;privacy;secure inference;SGX", "primary_area": "", "supplementary_material": "", "author": "Florian Tramer;Dan Boneh", "authorids": "tramer@cs.stanford.edu;dabo@cs.stanford.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\ntramer2018slalom,\ntitle={Slalom: Fast, Verifiable and Private Execution of Neural Networks in Trusted Hardware},\nauthor={Florian Tramer and Dan Boneh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJVorjCcKQ},\n}", "github": "[![github](/images/github_icon.svg) ftramer/slalom](https://github.com/ftramer/slalom)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;9", "confidence": "2;3;4", "wc_review": "480;474;418", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "443;277;494", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 457.3333333333333, 27.920522121829233 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 404.6666666666667, 92.64388209098801 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 507, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7461531422951047390&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rJVorjCcKQ", "pdf": "https://openreview.net/pdf?id=rJVorjCcKQ", "email": ";", "author_num": 2 }, { "id": "rJe-LiA5YX", "title": "Exponentially Decaying Flows for Optimization in Deep Learning", "track": "main", "status": "Withdraw", "tldr": "Introduction of a new optimization method and its application to deep learning.", "abstract": "The field of deep learning has been craving for an optimization method that shows outstanding property for both optimization and generalization. We propose a method for mathematical optimization based on flows along geodesics, that is, the shortest paths between two points, with respect to the Riemannian metric induced by a non-linear function. In our method, the flows refer to Exponentially Decaying Flows (EDF), as they can be designed to converge on the local solutions exponentially. 
In this paper, we conduct experiments to show its high performance on optimization benchmarks (i.e., convergence properties), as well as its potential for producing good machine learning benchmarks (i.e., generalization properties).", "keywords": "optimization;deep learning", "primary_area": "", "supplementary_material": "", "author": "Mitsuharu Takeori;Kenta Nakamura", "authorids": "takeori.mitsuharu.d5s@jp.nssol.nssmc.com;nakamura.kenta.4n4@jp.nssol.nssmc.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJe-LiA5YX", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;3;5", "wc_review": "650;474;324", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 2.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 482.6666666666667, 133.22995992727095 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Qm-BNM-XPXAJ:scholar.google.com/&scioq=Exponentially+Decaying+Flows+for+Optimization+in+Deep+Learning&hl=en&as_sdt=0,44", "gs_version_total": 0 }, { "title": "Unsupervised Discovery of Parts, Structure, and Dynamics", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/768", "id": "rJe10iC5K7", "author_site": "Zhenjia Xu, Zhijian Liu, Chen Sun, Kevin Murphy, William Freeman, Joshua B Tenenbaum, Jiajun Wu", "tldr": "Learning object parts, hierarchical structure, and dynamics by watching how they move", "abstract": "Humans easily recognize object parts and their hierarchical structure by watching how they move; they can then predict how each part moves in the future. In this paper, we propose a novel formulation that simultaneously learns a hierarchical, disentangled object representation and a dynamics model for object parts from unlabeled videos. Our Parts, Structure, and Dynamics (PSD) model learns to, first, recognize the object parts via a layered image representation; second, predict hierarchy via a structural descriptor that composes low-level concepts into a hierarchical structure; and third, model the system dynamics by predicting the future. Experiments on multiple real and synthetic datasets demonstrate that our PSD model works well on all three tasks: segmenting object parts, building their hierarchical structure, and capturing their motion distributions.", "keywords": "Self-Supervised Learning;Visual Prediction;Hierarchical Models", "primary_area": "", "supplementary_material": "", "author": "Zhenjia Xu*;Zhijian Liu*;Chen Sun;Kevin Murphy;William T. Freeman;Joshua B. 
Tenenbaum;Jiajun Wu", "authorids": "xuzhenjia1997@gmail.com;zhijian@mit.edu;chensun@google.com;kpmurphy@google.com;billf@mit.edu;jbt@mit.edu;jiajunwu@mit.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nliu2018modeling,\ntitle={Modeling Parts, Structure, and System Dynamics via Predictive Learning},\nauthor={Zhijian Liu and Jiajun Wu and Zhenjia Xu and Chen Sun and Kevin Murphy and William T. Freeman and Joshua B. Tenenbaum},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJe10iC5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "5;6;6;7", "confidence": "3;3;3;4", "wc_review": "516;349;1082;671", "wc_reply_reviewers": "0;0;91;0", "wc_reply_authors": "525;681;820;305", "reply_reviewers": "0;0;1;0", "reply_authors": "2;2;3;2", "rating_avg": [ 6.0, 0.7071067811865476 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "wc_review_avg": [ 654.5, 271.818413651467 ], "wc_reply_reviewers_avg": [ 22.75, 39.40415587219196 ], "wc_reply_authors_avg": [ 582.75, 191.32482196515954 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 2.25, 0.4330127018922193 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.816496580927726, "gs_citation": 84, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6600322539946703070&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=rJe10iC5K7", "pdf": "https://openreview.net/pdf?id=rJe10iC5K7", "email": ";;;;;;", "author_num": 7 }, { "id": "rJe1y3CqtX", "title": "Deep Reinforcement Learning of Universal Policies with Diverse Environment Summaries", "track": "main", "status": "Reject", "tldr": "As an alternative to domain randomization, we summarize simulator configurations to ensure that the policy is trained on a diverse set of induced state-trajectories.", "abstract": "Deep reinforcement learning has enabled robots to complete complex tasks in simulation. However, the resulting policies do not transfer to real robots due to model errors in the simulator. One solution is to randomize the simulation environment, so that the resulting, trained policy achieves high performance in expectation over a variety of configurations that could represent the real-world. However, the distribution over simulator configurations must be carefully selected to represent the relevant dynamic modes of the system, as otherwise it can be unlikely to sample challenging configurations frequently enough. Moreover, the ideal distribution to improve the policy changes as the policy (un)learns to solve tasks in certain configurations. In this paper, we propose to use an inexpensive, kernel-based summarization method that identifies configurations that lead to diverse behaviors. Since failure modes for the given task are naturally diverse, the policy trains on a mixture of representative and challenging configurations, which leads to more robust policies.
In experiments, we show that the proposed method achieves the same performance as domain randomization in simple cases, but performs better when domain randomization does not lead to diverse dynamic modes.", "keywords": "Domain Randomization;Diverse Summaries;Reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Felix Berkenkamp;Debadeepta Dey;Ashish Kapoor", "authorids": "befelix@inf.ethz.ch;dedey@microsoft.com;akapoor@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nberkenkamp2019deep,\ntitle={Deep Reinforcement Learning of Universal Policies with Diverse Environment Summaries},\nauthor={Felix Berkenkamp and Debadeepta Dey and Ashish Kapoor},\nyear={2019},\nurl={https://openreview.net/forum?id=rJe1y3CqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJe1y3CqtX", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;4", "wc_review": "542;326;537", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 468.3333333333333, 100.66556290785621 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z9WGdvFUwVsJ:scholar.google.com/&scioq=Deep+Reinforcement+Learning+of+Universal+Policies+with+Diverse+Environment+Summaries&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Music Transformer: Generating Music with Long-Term Structure", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1008", "id": "rJe4ShAcF7", "author_site": "Anna Huang, Ashish Vaswani, Jakob Uszkoreit, Ian Simon, Curtis Hawthorne, Noam Shazeer, Andrew Dai, Matthew D Hoffman, Monica Dinculescu, Douglas Eck", "tldr": "We show the first successful use of Transformer in generating music that exhibits long-term structure. ", "abstract": "Music relies heavily on repetition to build structure and meaning. Self-reference occurs on multiple timescales, from motifs to phrases to reusing of entire sections of music, such as in pieces with ABA structure. The Transformer (Vaswani et al., 2017), a sequence model based on self-attention, has achieved compelling results in many generation tasks that require maintaining long-range coherence. This suggests that self-attention might also be well-suited to modeling music. In musical composition and performance, however, relative timing is critically important. Existing approaches for representing relative positional information in the Transformer modulate attention based on pairwise distance (Shaw et al., 2018). This is impractical for long sequences such as musical compositions since their memory complexity is quadratic in the sequence length. We propose an algorithm that reduces the intermediate memory requirements to linear in the sequence length. 
This enables us to demonstrate that a Transformer with our modified relative attention mechanism can generate minute-long (thousands of steps) compositions with compelling structure, generate continuations that coherently elaborate on a given motif, and in a seq2seq setup generate accompaniments conditioned on melodies. We evaluate the Transformer with our relative attention mechanism on two datasets, JSB Chorales and Piano-e-competition, and obtain state-of-the-art results on the latter.", "keywords": "music generation", "primary_area": "", "supplementary_material": "", "author": "Cheng-Zhi Anna Huang;Ashish Vaswani;Jakob Uszkoreit;Ian Simon;Curtis Hawthorne;Noam Shazeer;Andrew M. Dai;Matthew D. Hoffman;Monica Dinculescu;Douglas Eck", "authorids": "chengzhiannahuang@gmail.com;avaswani@google.com;uszkoreit@google.com;iansimon@google.com;fjord@google.com;noam@google.com;adai@google.com;mhoffman@google.com;noms@google.com;deck@google.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@inproceedings{\nhuang2018music,\ntitle={Music Transformer},\nauthor={Cheng-Zhi Anna Huang and Ashish Vaswani and Jakob Uszkoreit and Ian Simon and Curtis Hawthorne and Noam Shazeer and Andrew M. Dai and Matthew D. Hoffman and Monica Dinculescu and Douglas Eck},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJe4ShAcF7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 12 community implementations](https://paperswithcode.com/paper/?openreview=rJe4ShAcF7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4;AnonReviewer2", "pdf_size": 0, "rating": "4;5;6;7", "confidence": "4;3;4;3", "wc_review": "258;414;537;221", "wc_reply_reviewers": "138;147;0;127", "wc_reply_authors": "1106;908;491;356", "reply_reviewers": "1;1;0;1", "reply_authors": "2;2;2;2", "rating_avg": [ 5.5, 1.118033988749895 ], "confidence_avg": [ 3.5, 0.5 ], "wc_review_avg": [ 357.5, 126.43674307731911 ], "wc_reply_reviewers_avg": [ 103.0, 59.88739433303139 ], "wc_reply_authors_avg": [ 715.25, 303.80369895707327 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 10, 0 ], "corr_rating_confidence": -0.4472135954999579, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=rJe4ShAcF7", "pdf": "https://openreview.net/pdf?id=rJe4ShAcF7", "email": ";;;;;;;;;", "author_num": 10 }, { "id": "rJeEqiC5KQ", "title": "ON THE USE OF CONVOLUTIONAL AUTO-ENCODER FOR INCREMENTAL CLASSIFIER LEARNING IN CONTEXT AWARE ADVERTISEMENT", "track": "main", "status": "Reject", "tldr": "Human brain inspired incremental learning system", "abstract": "Context Aware Advertisement (CAA) is a type of advertisement\nappearing on websites or mobile apps. The advertisement is targeted\non specific group of users and/or the content displayed on the\nwebsites or apps. This paper focuses on classifying images displayed\non the websites by incremental learning classifier with Deep\nConvolutional Neural Network (DCNN) especially for Context Aware\nAdvertisement (CAA) framework. Incrementally learning new knowledge\nwith DCNN leads to catastrophic forgetting as previously stored\ninformation is replaced with new information. 
To prevent\ncatastrophic forgetting, part of previously learned knowledge should\nbe stored for the life time of incremental classifier. Storing\ninformation for life time involves privacy and legal concerns\nespecially in context aware advertising framework. Here, we propose\nan incremental classifier learning method which addresses privacy\nand legal concerns while taking care of catastrophic forgetting\nproblem. We conduct experiments on different datasets including\nCIFAR-100. Experimental results show that proposed system achieves\nrelatively high performance compared to the state-of-the-art\nincremental learning methods.", "keywords": "Incremental learning;deep learning;autoencoder;privacy;convolutional neural network", "primary_area": "", "supplementary_material": "", "author": "Tin Lay Nwe;Shudong Xie;Balaji Nataraj;Yiqun Li;Joo-Hwee Lim", "authorids": "tlnma@i2r.a-star.edu.sg;xie_shudong@i2r.a-star.edu.sg;e0267605@u.nus.edu;yqli@i2r.a-star.edu.sg;joohwee@i2r.a-star.edu.sg", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nnwe2019on,\ntitle={{ON} {THE} {USE} {OF} {CONVOLUTIONAL} {AUTO}-{ENCODER} {FOR} {INCREMENTAL} {CLASSIFIER} {LEARNING} {IN} {CONTEXT} {AWARE} {ADVERTISEMENT}},\nauthor={Tin Lay Nwe and Shudong Xie and Balaji Nataraj and Yiqun Li and Joo-Hwee Lim},\nyear={2019},\nurl={https://openreview.net/forum?id=rJeEqiC5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJeEqiC5KQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;5", "wc_review": "409;162;201", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 257.3333333333333, 108.41996536103898 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:sNOZRomZghoJ:scholar.google.com/&scioq=ON+THE+USE+OF+CONVOLUTIONAL+AUTO-ENCODER+FOR+INCREMENTAL+CLASSIFIER+LEARNING+IN+CONTEXT+AWARE+ADVERTISEMENT&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rJeQYjRqYX", "title": "Effective Path: Know the Unknowns of Neural Network", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite their enormous success, there is still no solid understanding of deep neural network\u2019s working mechanism. As such, researchers have demonstrated DNNs are vulnerable to small input perturbation, i.e., adversarial attacks. This work proposes the effective path as a new approach to exploring DNNs' internal organization. The effective path is an ensemble of synapses and neurons, which is reconstructed from a trained DNN using our activation-based backward algorithm. The per-image effective path can be aggregated to the class-level effective path, through which we observe that adversarial images activate effective path different from normal images. 
We propose an effective path similarity-based method to detect adversarial images and demonstrate its high accuracy and broad applicability.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yuxian Qiu;Jingwen Leng;Yuhao Zhu;Quan Chen;Chao Li;Minyi Guo", "authorids": "qiuyuxian@sjtu.edu.cn;leng-jw@sjtu.edu.cn;yzhu@rochester.edu;chen-quan@sjtu.edu.cn;lichao@cs.sjtu.edu.cn;guo-my@cs.sjtu.edu.cn", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nqiu2019effective,\ntitle={Effective Path: Know the Unknowns of Neural Network},\nauthor={Yuxian Qiu and Jingwen Leng and Yuhao Zhu and Quan Chen and Chao Li and Minyi Guo},\nyear={2019},\nurl={https://openreview.net/forum?id=rJeQYjRqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJeQYjRqYX", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;4;5", "wc_review": "755;739;633", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 709.0, 54.135632135098106 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:atp3vd1bYjkJ:scholar.google.com/&scioq=Effective+Path:+Know+the+Unknowns+of+Neural+Network&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "BabyAI: A Platform to Study the Sample Efficiency of Grounded Language Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/733", "id": "rJeXCo0cYX", "author_site": "Maxime Chevalier-Boisvert, Dzmitry Bahdanau, Salem Lahlou, Lucas Willems, Chitwan Saharia, Thien H Nguyen, Yoshua Bengio", "tldr": "We present the BabyAI platform for studying data efficiency of language learning with a human in the loop", "abstract": "Allowing humans to interactively train artificial agents to understand language instructions is desirable for both practical and scientific reasons. Though, given the lack of sample efficiency in current learning methods, reaching this goal may require substantial research efforts. We introduce the BabyAI research platform, with the goal of supporting investigations towards including humans in the loop for grounded language learning. The BabyAI platform comprises an extensible suite of 19 levels of increasing difficulty. Each level gradually leads the agent towards acquiring a combinatorially rich synthetic language, which is a proper subset of English. The platform also provides a hand-crafted bot agent, which simulates a human teacher. We report estimated amount of supervision required for training neural reinforcement and behavioral-cloning agents on some BabyAI levels. 
We put forward strong evidence that current deep learning methods are not yet sufficiently sample-efficient in the context of learning a language with compositional properties.", "keywords": "language;learning;efficiency;imitation learning;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Maxime Chevalier-Boisvert;Dzmitry Bahdanau;Salem Lahlou;Lucas Willems;Chitwan Saharia;Thien Huu Nguyen;Yoshua Bengio", "authorids": "maximechevalierb@gmail.com;dimabgv@gmail.com;salemlahlou9@gmail.com;lcswillems@gmail.com;chitwaniit@gmail.com;thien@cs.uoregon.edu;yoshua.bengio@umontreal.ca", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nchevalier-boisvert2018babyai,\ntitle={Baby{AI}: First Steps Towards Grounded Language Learning With a Human In the Loop},\nauthor={Maxime Chevalier-Boisvert and Dzmitry Bahdanau and Salem Lahlou and Lucas Willems and Chitwan Saharia and Thien Huu Nguyen and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJeXCo0cYX},\n}", "github": "[![github](/images/github_icon.svg) mila-iqia/babyai](https://github.com/mila-iqia/babyai) + [![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=rJeXCo0cYX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;5", "wc_review": "268;454;356", "wc_reply_reviewers": "0;0;71", "wc_reply_authors": "325;931;677", "reply_reviewers": "0;0;1", "reply_authors": "1;2;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 359.3333333333333, 75.97075460699041 ], "wc_reply_reviewers_avg": [ 23.666666666666668, 33.469720976163245 ], "wc_reply_authors_avg": [ 644.3333333333334, 248.47445654544765 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.9999999999999998, "gs_citation": 259, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16615836502291630253&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rJeXCo0cYX", "pdf": "https://openreview.net/pdf?id=rJeXCo0cYX", "email": ";;;;;;", "author_num": 7 }, { "id": "rJeZS3RcYm", "title": "Simple Black-box Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The construction of adversarial images is a search problem in high dimensions within a small region around a target image. The goal is to find an imperceptibly modified image that is misclassified by a target model. In the black-box setting, only sporadic feedback is provided through occasional model evaluations. In this paper we provide a new algorithm whose search strategy is based on an intriguingly simple iterative principle: We randomly pick a low frequency component of the discrete cosine transform (DCT) and either add or subtract it to the target image. Model evaluations are only required to identify whether an operation decreases the adversarial loss. 
Despite its simplicity, the proposed method can be used for targeted and untargeted attacks --- resulting in previously unprecedented query efficiency in both settings. We require a median of 600 black-box model queries (ResNet-50) to produce an adversarial ImageNet image, and we successfully attack Google Cloud Vision with 2500 median queries, averaging to a cost of only $3 per image. We argue that our proposed algorithm should serve as a strong baseline for future adversarial black-box attacks, in particular because it is extremely fast and can be implemented in less than 20 lines of PyTorch code. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chuan Guo;Jacob R. Gardner;Yurong You;Andrew G. Wilson;Kilian Q. Weinberger", "authorids": "cg563@cornell.edu;jrg365@cornell.edu;yy785@cornell.edu;andrew@cornell.edu;kqw4@cornell.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nguo2019simple,\ntitle={Simple Black-box Adversarial Attacks},\nauthor={Chuan Guo and Jacob R. Gardner and Yurong You and Andrew G. Wilson and Kilian Q. Weinberger},\nyear={2019},\nurl={https://openreview.net/forum?id=rJeZS3RcYm},\n}", "github": "[![github](/images/github_icon.svg) cg563/simple-blackbox-attack](https://github.com/cg563/simple-blackbox-attack) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=rJeZS3RcYm)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJeZS3RcYm", "pdf_size": 0, "rating": "4;6;6", "confidence": "5;3;3", "wc_review": "834;212;185", "wc_reply_reviewers": "133;0;0", "wc_reply_authors": "718;157;29", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 410.3333333333333, 299.78028991616884 ], "wc_reply_reviewers_avg": [ 44.333333333333336, 62.69680126520721 ], "wc_reply_authors_avg": [ 301.3333333333333, 299.22603867682074 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 734, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14524309362525785070&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "title": "Analyzing Inverse Problems with Invertible Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1018", "id": "rJed6j0cKX", "author_site": "Lynton Ardizzone, Jakob Kruse, Carsten Rother, Ullrich Koethe", "tldr": "To analyze inverse problems with Invertible Neural Networks", "abstract": "For many applications, in particular in natural science, the task is to\ndetermine hidden system parameters from a set of measurements. Often,\nthe forward process from parameter- to measurement-space is well-defined,\nwhereas the inverse problem is ambiguous: multiple parameter sets can\nresult in the same measurement. To fully characterize this ambiguity, the full\nposterior parameter distribution, conditioned on an observed measurement,\nhas to be determined. 
We argue that a particular class of neural networks\nis well suited for this task \u2013 so-called Invertible Neural Networks (INNs).\nUnlike classical neural networks, which attempt to solve the ambiguous\ninverse problem directly, INNs focus on learning the forward process, using\nadditional latent output variables to capture the information otherwise\nlost. Due to invertibility, a model of the corresponding inverse process is\nlearned implicitly. Given a specific measurement and the distribution of\nthe latent variables, the inverse pass of the INN provides the full posterior\nover parameter space. We prove theoretically and verify experimentally, on\nartificial data and real-world problems from medicine and astrophysics, that\nINNs are a powerful analysis tool to find multi-modalities in parameter space,\nuncover parameter correlations, and identify unrecoverable parameters.", "keywords": "Inverse problems;Neural Networks;Uncertainty;Invertible Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Lynton Ardizzone;Jakob Kruse;Carsten Rother;Ullrich K\u00f6the", "authorids": "lynton.ardizzone@iwr.uni-heidelberg.de;jakob.kruse@iwr.uni-heidelberg.de;carsten.rother@iwr.uni-heidelberg.de;ullrich.koethe@iwr.uni-heidelberg.de", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nardizzone2018analyzing,\ntitle={Analyzing Inverse Problems with Invertible Neural Networks},\nauthor={Lynton Ardizzone and Jakob Kruse and Carsten Rother and Ullrich K\u00f6the},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJed6j0cKX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rJed6j0cKX)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;2;5", "wc_review": "382;267;699", "wc_reply_reviewers": "0;0;316", "wc_reply_authors": "753;352;2079", "reply_reviewers": "0;0;4", "reply_authors": "2;2;5", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 449.3333333333333, 182.67700700659861 ], "wc_reply_reviewers_avg": [ 105.33333333333333, 148.963828569966 ], "wc_reply_authors_avg": [ 1061.3333333333333, 737.9856969405904 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.8856180831641267 ], "reply_authors_avg": [ 3.0, 1.4142135623730951 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.18898223650461363, "gs_citation": 687, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11726167172639510448&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rJed6j0cKX", "pdf": "https://openreview.net/pdf?id=rJed6j0cKX", "email": ";;;", "author_num": 4 }, { "title": "RelGAN: Relational Generative Adversarial Networks for Text Generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/966", "id": "rJedV3R5tm", "author_site": "Weili Nie, Nina Narodytska, Ankit B Patel", "tldr": "", "abstract": "Generative adversarial networks (GANs) have achieved great success at generating realistic images. However, the text generation still remains a challenging task for modern GAN architectures. 
In this work, we propose RelGAN, a new GAN architecture for text generation, consisting of three main components: a relational memory based generator for the long-distance dependency modeling, the Gumbel-Softmax relaxation for training GANs on discrete data, and multiple embedded representations in the discriminator to provide a more informative signal for the generator updates. Our experiments show that RelGAN outperforms current state-of-the-art models in terms of sample quality and diversity, and we also reveal via ablation studies that each component of RelGAN contributes critically to its performance improvements. Moreover, a key advantage of our method, that distinguishes it from other GANs, is the ability to control the trade-off between sample quality and diversity via the use of a single adjustable parameter. Finally, RelGAN is the first architecture that makes GANs with Gumbel-Softmax relaxation succeed in generating realistic text.", "keywords": "RelGAN;text generation;relational memory;Gumbel-Softmax relaxation;multiple embedded representations", "primary_area": "", "supplementary_material": "", "author": "Weili Nie;Nina Narodytska;Ankit Patel", "authorids": "wn8@rice.edu;nnarodytska@vmware.com;abp4@rice.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nnie2018relgan,\ntitle={Rel{GAN}: Relational Generative Adversarial Networks for Text Generation},\nauthor={Weili Nie and Nina Narodytska and Ankit Patel},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJedV3R5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;8", "confidence": "4;4;4", "wc_review": "572;275;510", "wc_reply_reviewers": "0;0;35", "wc_reply_authors": "1015;555;823", "reply_reviewers": "0;0;1", "reply_authors": "2;1;2", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 452.3333333333333, 127.92271972649041 ], "wc_reply_reviewers_avg": [ 11.666666666666666, 16.49915822768611 ], "wc_reply_authors_avg": [ 797.6666666666666, 188.6466420468585 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 265, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8523757541722331979&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rJedV3R5tm", "pdf": "https://openreview.net/pdf?id=rJedV3R5tm", "email": ";;", "author_num": 3 }, { "id": "rJedbn0ctQ", "title": "Zero-training Sentence Embedding via Orthogonal Basis", "track": "main", "status": "Reject", "tldr": "A simple and training-free approach for sentence embeddings with competitive performance compared with sophisticated models requiring either large amount of training data or prolonged training time.", "abstract": "We propose a simple and robust training-free approach for building sentence representations. Inspired by the Gram-Schmidt Process in geometric theory, we build an orthogonal basis of the subspace spanned by a word and its surrounding context in a sentence. We model the semantic meaning of a word in a sentence based on two aspects. 
One is its relatedness to the word vector subspace already spanned by its contextual words. The other is its novel semantic meaning which shall be introduced as a new basis vector perpendicular to this existing subspace. Following this motivation, we develop an innovative method based on orthogonal basis to combine pre-trained word embeddings into sentence representation. This approach requires zero training and zero parameters, along with efficient inference performance. We evaluate our approach on 11 downstream NLP tasks. Experimental results show that our model outperforms all existing zero-training alternatives in all the tasks and it is competitive to other approaches relying on either large amounts of labelled data or prolonged training time.", "keywords": "Natural Language Processing;Sentence Embeddings", "primary_area": "", "supplementary_material": "", "author": "Ziyi Yang;Chenguang Zhu;Weizhu Chen", "authorids": "ziyi.yang@stanford.edu;chezhu@microsoft.com;wzchen@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyang2019zerotraining,\ntitle={Zero-training Sentence Embedding via Orthogonal Basis},\nauthor={Ziyi Yang and Chenguang Zhu and Weizhu Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=rJedbn0ctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJedbn0ctQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "wc_review": "393;251;356", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "592;732;726", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 333.3333333333333, 60.14611837480085 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 683.3333333333334, 64.62885492478487 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2091490425418041287&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJegl2C9K7", "title": "Feature Matters: A Stage-by-Stage Approach for Task Independent Knowledge Transfer", "track": "main", "status": "Withdraw", "tldr": "This paper proposes to transfer knowledge from deep model to shallow one by mimicking features stage by stage.", "abstract": "Convolutional Neural Networks (CNNs) become deeper and deeper in recent years, making the study of model acceleration imperative. It is a common practice to employ a shallow network, called student, to learn from a deep one, which is termed as teacher. Prior work made many attempts to transfer different types of knowledge from teacher to student, however, there are two problems remaining unsolved. Firstly, the knowledge used by existing methods is highly dependent on task and dataset, limiting their applications. Secondly, there lacks an effective training scheme for the transfer process, leading to degradation of performance. In this work, we argue that feature is the most important knowledge from teacher. It is sufficient for student to just learn good features regardless of the target task. From this discovery, we further present an efficient learning strategy to mimic features stage by stage. 
Extensive experiments demonstrate the importance of features and show that the proposed approach significantly narrows down the gap between student and teacher, outperforming the state-of-the-art methods.\n", "keywords": "knowledge transfer;task independent;feature transfer;stage-by-stage", "primary_area": "", "supplementary_material": "", "author": "Mengya Gao;Yujun Shen;Quanquan Li;Liang Wan;Xiaoou Tang", "authorids": "daisy@tju.edu.cn;sy116@ie.cuhk.edu.hk;liquanquan@sensetime.com;lwan@tju.edu.cn;xtang@ie.cuhk.edu.hk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJegl2C9K7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "wc_review": "142;578;296", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 338.6666666666667, 180.53500737776287 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6707512741744362570&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "The Singular Values of Convolutional Layers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/982", "id": "rJevYoA9Fm", "author_site": "Hanie Sedghi, Vineet Gupta, Phil Long", "tldr": "We characterize the singular values of the linear transformation associated with a standard 2D multi-channel convolutional layer, enabling their efficient computation. ", "abstract": "We characterize the singular values of the linear transformation associated with a standard 2D multi-channel convolutional layer, enabling their efficient computation. This characterization also leads to an algorithm for projecting a convolutional layer onto an operator-norm ball. We show that this is an effective regularizer; for example, it improves the test error of a deep residual network using batch normalization on CIFAR-10 from 6.2% to 5.3%. ", "keywords": "singular values;operator norm;convolutional layers;regularization", "primary_area": "", "supplementary_material": "", "author": "Hanie Sedghi;Vineet Gupta;Philip M. Long", "authorids": "hsedghi@google.com;vineet@google.com;plong@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsedghi2018the,\ntitle={The Singular Values of Convolutional Layers},\nauthor={Hanie Sedghi and Vineet Gupta and Philip M. 
Long},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJevYoA9Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "4;7;8", "confidence": "5;3;4", "wc_review": "543;163;248", "wc_reply_reviewers": "857;0;0", "wc_reply_authors": "814;6;17", "reply_reviewers": "3;0;0", "reply_authors": "5;1;1", "rating_avg": [ 6.333333333333333, 1.699673171197595 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 318.0, 162.83938917432314 ], "wc_reply_reviewers_avg": [ 285.6666666666667, 403.9936743179141 ], "wc_reply_authors_avg": [ 279.0, 378.3287811767255 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 30, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.7205766921228921, "gs_citation": 234, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8194946284623254449&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "openreview": "https://openreview.net/forum?id=rJevYoA9Fm", "pdf": "https://openreview.net/pdf?id=rJevYoA9Fm", "email": ";;", "author_num": 3 }, { "id": "rJeyV2AcKX", "title": "Network Reparameterization for Unseen Class Categorization", "track": "main", "status": "Withdraw", "tldr": "A unified frame for both few-shot learning and zero-shot learning based on network reparameterization", "abstract": "Many problems with large-scale labeled training data have been impressively solved by deep learning. However, Unseen Class Categorization (UCC) with minimal information provided about target classes is the most commonly encountered setting in industry, which remains a challenging research problem in machine learning. Previous approaches to UCC either fail to generate a powerful discriminative feature extractor or fail to learn a flexible classifier that can be easily adapted to unseen classes. In this paper, we propose to address these issues through network reparameterization, \\textit{i.e.}, reparametrizing the learnable weights of a network as a function of other variables, by which we decouple the feature extraction part and the classification part of a deep classification model to suit the special setting of UCC, securing both strong discriminability and excellent adaptability. 
Extensive experiments for UCC on several widely-used benchmark datasets in the settings of zero-shot and few-shot learning demonstrate that, our method with network reparameterization achieves state-of-the-art performance.", "keywords": "Unseen class categorization;network reparameterization;few-shot learning;zero-shot learning", "primary_area": "", "supplementary_material": "", "author": "Kai Li;Martin Renqiang Min;Bing Bai;Yun Fu;Hans Peter Graf", "authorids": "li.kai.gml@gmail.com;renqiang@nec-labs.com;bbai@nec-labs.com;yunfu@ece.neu.edu;hpg@nec-labs.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJeyV2AcKX", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;3;5", "wc_review": "277;400;360", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 345.6666666666667, 51.227162933567016 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16883833518972989875&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rJf0BjAqYX", "title": "Like What You Like: Knowledge Distill via Neuron Selectivity Transfer", "track": "main", "status": "Reject", "tldr": "We treat knowledge distill as a distribution matching problem and adopt Maximum Mean Discrepancy to minimize the distances between student features and teacher features.", "abstract": "Despite deep neural networks have demonstrated extraordinary power in various applications, their superior performances are at expense of high storage and computational costs. Consequently, the acceleration and compression of neural networks have attracted much attention recently. Knowledge Transfer (KT), which aims at training a smaller student network by transferring knowledge from a larger teacher model, is one of the popular solutions. In this paper, we propose a novel knowledge transfer method by treating it as a distribution matching problem. Particularly, we match the distributions of neuron selectivity patterns between teacher and student networks. To achieve this goal, we devise a new KT loss function by minimizing the Maximum Mean Discrepancy (MMD) metric between these distributions. Combined with the original loss function, our method can significantly improve the performance of student networks. We validate the effectiveness of our method across several datasets, and further combine it with other KT methods to explore the best possible results. Last but not least, we fine-tune the model to other tasks such as object detection. 
The results are also encouraging, which confirm the transferability of the learned features.", "keywords": "Knowledge Distill", "primary_area": "", "supplementary_material": "", "author": "Zehao Huang;Naiyan Wang", "authorids": "zehaohuang18@gmail.com;winsty@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhuang2019like,\ntitle={Like What You Like: Knowledge Distill via Neuron Selectivity Transfer},\nauthor={Zehao Huang and Naiyan Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=rJf0BjAqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJf0BjAqYX", "pdf_size": 0, "rating": "4;4;6", "confidence": "4;4;5", "wc_review": "324;298;163", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 261.6666666666667, 70.57068954050416 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 625, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15240415627315762783&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "An Empirical study of Binary Neural Networks' Optimisation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1067", "id": "rJfUCoR5KX", "author_site": "Milad Alizadeh, Javier Fernandez-Marques, Nicholas Lane, Yarin Gal", "tldr": "", "abstract": "Binary neural networks using the Straight-Through-Estimator (STE) have been shown to achieve state-of-the-art results, but their training process is not well-founded. This is due to the discrepancy between the evaluated function in the forward path, and the weight updates in the back-propagation, updates which do not correspond to gradients of the forward path. Efficient convergence and accuracy of binary models often rely on careful fine-tuning and various ad-hoc techniques. In this work, we empirically identify and study the effectiveness of the various ad-hoc techniques commonly used in the literature, providing best-practices for efficient training of binary models. We show that adapting learning rates using second moment methods is crucial for the successful use of the STE, and that other optimisers can easily get stuck in local minima. We also find that many of the commonly employed tricks are only effective towards the end of the training, with these methods making early stages of the training considerably slower. Our analysis disambiguates necessary from unnecessary ad-hoc techniques for training of binary neural networks, paving the way for future development of solid theoretical foundations for these. Our newly-found insights further lead to new procedures which make training of existing binary neural networks notably faster.", "keywords": "binary neural networks;quantized neural networks;straight-through-estimator", "primary_area": "", "supplementary_material": "", "author": "Milad Alizadeh;Javier Fern\u00e1ndez-Marqu\u00e9s;Nicholas D. 
Lane;Yarin Gal", "authorids": "milad.alizadeh@cs.ox.ac.uk;javier.fernandezmarques@cs.ox.ac.uk;nicholas.lane@cs.ox.ac.uk;yarin.gal@cs.ox.ac.uk", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nalizadeh2018a,\ntitle={A Systematic Study of Binary Neural Networks' Optimisation},\nauthor={Milad Alizadeh and Javier Fern\u00e1ndez-Marqu\u00e9s and Nicholas D. Lane and Yarin Gal},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJfUCoR5KX},\n}", "github": "[![github](/images/github_icon.svg) mi-lad/studying-binary-neural-networks](https://github.com/mi-lad/studying-binary-neural-networks)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;6;8", "confidence": "4;3;4", "wc_review": "273;591;181", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "302;372;249", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 1.632993161855452 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 348.3333333333333, 175.65369971117096 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 307.6666666666667, 50.37415563119203 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11225440842042128291&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=rJfUCoR5KX", "pdf": "https://openreview.net/pdf?id=rJfUCoR5KX", "email": ";;;", "author_num": 4 }, { "title": "Approximability of Discriminators Implies Diversity in GANs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/936", "id": "rJfW5oA5KQ", "author_site": "Yu Bai, Tengyu Ma, Andrej Risteski", "tldr": "GANs can in principle learn distributions sample-efficiently, if the discriminator class is compact and has strong distinguishing power against the particular generator class.", "abstract": "While Generative Adversarial Networks (GANs) have empirically produced impressive results on learning complex real-world distributions, recent works have shown that they suffer from lack of diversity or mode collapse. The theoretical work of Arora et al. (2017a) suggests a dilemma about GANs\u2019 statistical properties: powerful discriminators cause overfitting, whereas weak discriminators cannot detect mode collapse.\nBy contrast, we show in this paper that GANs can in principle learn distributions in Wasserstein distance (or KL-divergence in many cases) with polynomial sample complexity, if the discriminator class has strong distinguishing power against the particular generator class (instead of against all possible generators). For various generator classes such as mixture of Gaussians, exponential families, and invertible and injective neural networks generators, we design corresponding discriminators (which are often neural nets of specific architectures) such that the Integral Probability Metric (IPM) induced by the discriminators can provably approximate the Wasserstein distance and/or KL-divergence. 
This implies that if the training is successful, then the learned distribution is close to the true distribution in Wasserstein distance or KL divergence, and thus cannot drop modes. Our preliminary experiments show that on synthetic datasets the test IPM is well correlated with KL divergence or the Wasserstein distance, indicating that the lack of diversity in GANs may be caused by the sub-optimality in optimization instead of statistical inefficiency.", "keywords": "Theory;Generative adversarial networks;Mode collapse;Generalization", "primary_area": "", "supplementary_material": "", "author": "Yu Bai;Tengyu Ma;Andrej Risteski", "authorids": "yub@stanford.edu;tengyuma@stanford.edu;risteski@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbai2018approximability,\ntitle={Approximability of Discriminators Implies Diversity in {GAN}s},\nauthor={Yu Bai and Tengyu Ma and Andrej Risteski},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJfW5oA5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;3;2", "wc_review": "166;657;226", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "283;436;81", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 349.6666666666667, 218.69359590278103 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 266.6666666666667, 145.38760454576433 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 93, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9758154062502373933&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rJfW5oA5KQ", "pdf": "https://openreview.net/pdf?id=rJfW5oA5KQ", "email": ";;", "author_num": 3 }, { "title": "Learning Embeddings into Entropic Wasserstein Spaces", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/813", "id": "rJg4J3CqFm", "author_site": "Charlie Frogner, Farzaneh Mirzazadeh, Justin Solomon", "tldr": "We show that Wasserstein spaces are good targets for embedding data with complex semantic structure.", "abstract": "Despite their prevalence, Euclidean embeddings of data are fundamentally limited in their ability to capture latent semantic structures, which need not conform to Euclidean spatial assumptions. Here we consider an alternative, which embeds data as discrete probability distributions in a Wasserstein space, endowed with an optimal transport metric. Wasserstein spaces are much larger and more flexible than Euclidean spaces, in that they can successfully embed a wider variety of metric structures. We propose to exploit this flexibility by learning an embedding that captures the semantic information in the Wasserstein distance between embedded distributions. We examine empirically the representational capacity of such learned Wasserstein embeddings, showing that they can embed a wide variety of complex metric structures with smaller distortion than an equivalent Euclidean embedding. 
We also investigate an application to word embedding, demonstrating a unique advantage of Wasserstein embeddings: we can directly visualize the high-dimensional embedding, as it is a probability distribution on a low-dimensional space. This obviates the need for dimensionality reduction techniques such as t-SNE for visualization.", "keywords": "Embedding;Wasserstein;Sinkhorn;Optimal Transport", "primary_area": "", "supplementary_material": "", "author": "Charlie Frogner;Farzaneh Mirzazadeh;Justin Solomon", "authorids": "frogner@mit.edu;farzaneh@ibm.com;jsolomon@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nfrogner2018learning,\ntitle={Learning Entropic Wasserstein Embeddings},\nauthor={Charlie Frogner and Farzaneh Mirzazadeh and Justin Solomon},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJg4J3CqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "3;7;7", "confidence": "4;4;3", "wc_review": "167;388;299", "wc_reply_reviewers": "475;80;0", "wc_reply_authors": "1811;543;325", "reply_reviewers": "2;1;0", "reply_authors": "4;2;1", "rating_avg": [ 5.666666666666667, 1.8856180831641267 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 284.6666666666667, 90.79035680560402 ], "wc_reply_reviewers_avg": [ 185.0, 207.64553129472029 ], "wc_reply_authors_avg": [ 893.0, 655.1966625881627 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 44, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3356420680246481859&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=rJg4J3CqFm", "pdf": "https://openreview.net/pdf?id=rJg4J3CqFm", "email": ";;", "author_num": 3 }, { "title": "DeepOBS: A Deep Learning Optimizer Benchmark Suite", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/945", "id": "rJg6ssC5Y7", "author_site": "Frank Stefan Schneider, Lukas Balles, Philipp Hennig", "tldr": "We provide a software package that drastically simplifies, automates, and improves the evaluation of deep learning optimizers.", "abstract": "Because the choice and tuning of the optimizer affects the speed, and ultimately the performance of deep learning, there is significant past and recent research in this area. Yet, perhaps surprisingly, there is no generally agreed-upon protocol for the quantitative and reproducible evaluation of optimization strategies for deep learning. We suggest routines and benchmarks for stochastic optimization, with special focus on the unique aspects of deep learning, such as stochasticity, tunability and generalization. As the primary contribution, we present DeepOBS, a Python package of deep learning optimization benchmarks. The package addresses key challenges in the quantitative assessment of stochastic optimizers, and automates most steps of benchmarking. 
The library includes a wide and extensible set of ready-to-use realistic optimization problems, such as training Residual Networks for image classification on ImageNet or character-level language prediction models, as well as popular classics like MNIST and CIFAR-10. The package also provides realistic baseline results for the most popular optimizers on these test problems, ensuring a fair comparison to the competition when benchmarking new optimizers, and without having to run costly experiments. It comes with output back-ends that directly produce LaTeX code for inclusion in academic publications. It supports TensorFlow and is available open source.", "keywords": "deep learning;optimization", "primary_area": "", "supplementary_material": "", "author": "Frank Schneider;Lukas Balles;Philipp Hennig", "authorids": "frank.schneider@tuebingen.mpg.de;lukas.balles@tuebingen.mpg.de;philipp.hennig@uni-tuebingen.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nschneider2018deepobs,\ntitle={Deep{OBS}: A Deep Learning Optimizer Benchmark Suite},\nauthor={Frank Schneider and Lukas Balles and Philipp Hennig},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJg6ssC5Y7},\n}", "github": "[![github](/images/github_icon.svg) fsschneider/deepobs](https://github.com/fsschneider/deepobs)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;4", "wc_review": "186;769;343", "wc_reply_reviewers": "0;1100;0", "wc_reply_authors": "348;1776;132", "reply_reviewers": "0;3;0", "reply_authors": "1;4;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 432.6666666666667, 246.30920044168514 ], "wc_reply_reviewers_avg": [ 366.6666666666667, 518.5449728701349 ], "wc_reply_authors_avg": [ 752.0, 729.4271725127876 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 81, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10657953635405668036&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rJg6ssC5Y7", "pdf": "https://openreview.net/pdf?id=rJg6ssC5Y7", "email": ";;", "author_num": 3 }, { "title": "InfoBot: Transfer and Exploration via the Information Bottleneck", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1138", "id": "rJg8yhAqKm", "author_site": "Anirudh Goyal, Riashat Islam, DJ Strouse, Zafarali Ahmed, Hugo Larochelle, Matthew Botvinick, Sergey Levine, Yoshua Bengio", "tldr": "Training agents with goal-policy information bottlenecks promotes transfer and yields a powerful exploration bonus", "abstract": "A central challenge in reinforcement learning is discovering effective policies for tasks where rewards are sparsely distributed. We postulate that in the absence of useful reward signals, an effective exploration strategy should seek out {\\it decision states}. These states lie at critical junctions in the state space from where the agent can transition to new, potentially unexplored regions. We propose to learn about decision states from prior experience. 
By training a goal-conditioned model with an information bottleneck, we can identify decision states by examining where the model accesses the goal state through the bottleneck. We find that this simple mechanism effectively identifies decision states, even in partially observed settings. In effect, the model learns the sensory cues that correlate with potential subgoals. In new environments, this model can then identify novel subgoals for further exploration, guiding the agent through a sequence of potential decision states and through new regions of the state space.", "keywords": "Information bottleneck;policy transfer;policy generalization;exploration", "primary_area": "", "supplementary_material": "", "author": "Anirudh Goyal;Riashat Islam;DJ Strouse;Zafarali Ahmed;Hugo Larochelle;Matthew Botvinick;Yoshua Bengio;Sergey Levine", "authorids": "anirudhgoyal9119@gmail.com;riashat.islam@mail.mcgill.ca;danieljstrouse@gmail.com;zafarali.ahmed@mail.mcgill.ca;hugolarochelle@google.com;botvinick@google.com;svlevine@eecs.berkeley.edu;yoshua.bengio@mila.quebec", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\ngoyal2018transfer,\ntitle={Transfer and Exploration via the Information Bottleneck},\nauthor={Anirudh Goyal and Riashat Islam and DJ Strouse and Zafarali Ahmed and Hugo Larochelle and Matthew Botvinick and Sergey Levine and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJg8yhAqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "3;7;7", "confidence": "3;3;3", "wc_review": "143;366;511", "wc_reply_reviewers": "0;30;0", "wc_reply_authors": "1506;1691;2062", "reply_reviewers": "0;1;0", "reply_authors": "6;6;7", "rating_avg": [ 5.666666666666667, 1.8856180831641267 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 340.0, 151.35609226809032 ], "wc_reply_reviewers_avg": [ 10.0, 14.142135623730951 ], "wc_reply_authors_avg": [ 1753.0, 231.1810257496637 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 6.333333333333333, 0.4714045207910317 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 189, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4130805279522394424&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rJg8yhAqKm", "pdf": "https://openreview.net/pdf?id=rJg8yhAqKm", "email": ";;;;;;;", "author_num": 8 }, { "title": "The Neuro-Symbolic Concept Learner: Interpreting Scenes, Words, and Sentences From Natural Supervision", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/843", "id": "rJgMlhRctm", "author_site": "Jiayuan Mao, Chuang Gan, Pushmeet Kohli, Joshua B Tenenbaum, Jiajun Wu", "tldr": "We propose the Neuro-Symbolic Concept Learner (NS-CL), a model that learns visual concepts, words, and semantic parsing of sentences without explicit supervision on any of them.", "abstract": "We propose the Neuro-Symbolic Concept Learner (NS-CL), a model that learns visual concepts, words, and semantic parsing of sentences without explicit supervision on any of them; instead, our model learns by simply looking at images and reading paired questions and answers. 
Our model builds an object-based scene representation and translates sentences into executable, symbolic programs. To bridge the learning of two modules, we use a neuro-symbolic reasoning module that executes these programs on the latent scene representation. Analogical to human concept learning, the perception module learns visual concepts based on the language description of the object being referred to. Meanwhile, the learned visual concepts facilitate learning new words and parsing new sentences. We use curriculum learning to guide the searching over the large compositional space of images and language. Extensive experiments demonstrate the accuracy and efficiency of our model on learning visual concepts, word representations, and semantic parsing of sentences. Further, our method allows easy generalization to new object attributes, compositions, language concepts, scenes and questions, and even new program domains. It also empowers applications including visual question answering and bidirectional image-text retrieval.", "keywords": "Neuro-Symbolic Representations;Concept Learning;Visual Reasoning", "primary_area": "", "supplementary_material": "", "author": "Jiayuan Mao;Chuang Gan;Pushmeet Kohli;Joshua B. Tenenbaum;Jiajun Wu", "authorids": "maojiayuan@gmail.com;ganchuang1990@gmail.com;pushmeet@google.com;jbt@mit.edu;jiajunwu@mit.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nmao2018the,\ntitle={The Neuro-Symbolic Concept Learner: Interpreting Scenes, Words, and Sentences From Natural Supervision},\nauthor={Jiayuan Mao and Chuang Gan and Pushmeet Kohli and Joshua B. 
Tenenbaum and Jiajun Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJgMlhRctm},\n}", "github": "[![github](/images/github_icon.svg) vacancy/NSCL-PyTorch-Release](https://github.com/vacancy/NSCL-PyTorch-Release) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rJgMlhRctm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;9", "confidence": "4;4;5", "wc_review": "528;228;115", "wc_reply_reviewers": "161;0;0", "wc_reply_authors": "1060;139;505", "reply_reviewers": "2;0;0", "reply_authors": "3;1;1", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 290.3333333333333, 174.27245590995983 ], "wc_reply_reviewers_avg": [ 53.666666666666664, 75.8961278473561 ], "wc_reply_authors_avg": [ 568.0, 378.62646500211787 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 929, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8837128214653317831&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rJgMlhRctm", "pdf": "https://openreview.net/pdf?id=rJgMlhRctm", "email": ";;;;", "author_num": 5 }, { "id": "rJgP7hR5YQ", "title": "COMPOSITION AND DECOMPOSITION OF GANS", "track": "main", "status": "Reject", "tldr": "GANs can be composed to build more complex models and decomposed to obtain building blocks", "abstract": "In this work, we propose a composition/decomposition framework for adversarially training generative models on composed data - data where each sample can be thought of as being constructed from a fixed number of components. In our framework, samples are generated by sampling components from component generators and feeding these components to a composition function which combines them into a \u201ccomposed sample\u201d. This compositional training approach improves the modularity, extensibility and interpretability of Generative Adversarial Networks (GANs) - providing a principled way to incrementally construct complex models out of simpler component models, and allowing for explicit \u201cdivision of responsibility\u201d between these components. Using this framework, we define a family of learning tasks and evaluate their feasibility on two datasets in two different data modalities (image and text). Lastly, we derive sufficient conditions such that these compositional generative models are identifiable. 
Our work provides a principled approach to building on pretrained generative models or for exploiting the compositional nature of data distributions to train extensible and interpretable models.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yeu-Chern Harn;Zhenghao Chen;Vladimir Jojic", "authorids": "ycharn@cs.unc.edu;chen.zhenghao@gmail.com;vjojic@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nharn2019composition,\ntitle={{COMPOSITION} {AND} {DECOMPOSITION} {OF} {GANS}},\nauthor={Yeu-Chern Harn and Zhenghao Chen and Vladimir Jojic},\nyear={2019},\nurl={https://openreview.net/forum?id=rJgP7hR5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJgP7hR5YQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;5", "wc_review": "1079;556;329", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "801;763;775", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 654.6666666666666, 314.03432225722645 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 779.6666666666666, 15.860503004493758 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:1h_xud-nwBYJ:scholar.google.com/&scioq=COMPOSITION+AND+DECOMPOSITION+OF+GANS&hl=en&as_sdt=0,5", "gs_version_total": 4 }, { "id": "rJgSV3AqKQ", "title": "Combining adaptive algorithms and hypergradient method: a performance and robustness study", "track": "main", "status": "Reject", "tldr": "We provide a study trying to see how the recent online learning rate adaptation extends the conclusion made by Wilson et al. 2018 about adaptive gradient methods, along with comparison and sensitivity analysis.", "abstract": "Wilson et al. (2017) showed that, when the stepsize schedule is properly designed, stochastic gradient generalizes better than ADAM (Kingma & Ba, 2014). In light of recent work on hypergradient methods (Baydin et al., 2018), we revisit these claims to see if such methods close the gap between the most popular optimizers. As a byproduct, we analyze the true benefit of these hypergradient methods compared to more classical schedules, such as the fixed decay of Wilson et al. (2017). In particular, we observe they are of marginal help since their performance varies significantly when tuning their hyperparameters. 
Finally, as robustness is a critical quality of an optimizer, we provide a sensitivity analysis of these gradient based optimizers to assess how challenging their tuning is.", "keywords": "optimization;adaptive methods;learning rate decay", "primary_area": "", "supplementary_material": "", "author": "Akram Erraqabi;Nicolas Le Roux", "authorids": "akram.er-raqabi@umontreal.ca;nicolas@le-roux.name", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nerraqabi2019combining,\ntitle={Combining adaptive algorithms and hypergradient method: a performance and robustness study},\nauthor={Akram Erraqabi and Nicolas Le Roux},\nyear={2019},\nurl={https://openreview.net/forum?id=rJgSV3AqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJgSV3AqKQ", "pdf_size": 0, "rating": "3;3;4", "confidence": "2;4;4", "wc_review": "387;221;127", "wc_reply_reviewers": "18;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "1;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 245.0, 107.49263540664852 ], "wc_reply_reviewers_avg": [ 6.0, 8.48528137423857 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GWFU0GWfx6wJ:scholar.google.com/&scioq=Combining+adaptive+algorithms+and+hypergradient+method:+a+performance+and+robustness+study&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "The Comparative Power of ReLU Networks and Polynomial Kernels in the Presence of Sparse Latent Structure", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/804", "id": "rJgTTjA9tX", "author_site": "Frederic Koehler, Andrej Risteski", "tldr": "Beyond-worst-case analysis of the representational power of ReLU nets & polynomial kernels -- in particular in the presence of sparse latent structure.", "abstract": "There has been a large amount of interest, both in the past and particularly recently, into the relative advantage of different families of universal function approximators, for instance neural networks, polynomials, rational functions, etc. However, current research has focused almost exclusively on understanding this problem in a worst case setting: e.g. characterizing the best L1 or L_{infty} approximation in a box (or sometimes, even under an adversarially constructed data distribution.) In this setting many classical tools from approximation theory can be effectively used.\n\nHowever, in typical applications we expect data to be high dimensional, but structured -- so, it would only be important to approximate the desired function well on the relevant part of its domain, e.g. a small manifold on which real input data actually lies. Moreover, even within this domain the desired quality of approximation may not be uniform; for instance in classification problems, the approximation needs to be more accurate near the decision boundary. 
These issues, to the best of our knowledge, have remain unexplored until now.\n\t\nWith this in mind, we analyze the performance of neural networks and polynomial kernels in a natural regression setting where the data enjoys sparse latent structure, and the labels depend in a simple way on the latent variables. We give an almost-tight theoretical analysis of the performance of both neural networks and polynomials for this problem, as well as verify our theory with simulations. Our results both involve new (complex-analytic) techniques, which may be of independent interest, and show substantial qualitative differences with what is known in the worst-case setting.", "keywords": "theory;representational power;universal approximators;polynomial kernels;latent sparsity;beyond worst case;separation result", "primary_area": "", "supplementary_material": "", "author": "Frederic Koehler;Andrej Risteski", "authorids": "fkoehler@mit.edu;risteski@mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nkoehler2018the,\ntitle={The Comparative Power of Re{LU} Networks and Polynomial Kernels in the Presence of Sparse Latent Structure},\nauthor={Frederic Koehler and Andrej Risteski},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJgTTjA9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;3;3", "wc_review": "156;231;190", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 192.33333333333334, 30.663043264200347 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4978405382971332072&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=rJgTTjA9tX", "pdf": "https://openreview.net/pdf?id=rJgTTjA9tX", "email": ";", "author_num": 2 }, { "id": "rJgTciR9tm", "title": "Learning Information Propagation in the Dynamical Systems via Information Bottleneck Hierarchy", "track": "main", "status": "Reject", "tldr": "Compact perception of dynamical process", "abstract": "Extracting relevant information, causally inferring and predicting the future states with high accuracy is a crucial task for modeling complex systems. The endeavor to address these tasks is made even more challenging when we have to deal with high-dimensional heterogeneous data streams. Such data streams often have higher-order inter-dependencies across spatial and temporal dimensions. We propose to perform a soft-clustering of the data and learn its dynamics to produce a compact dynamical model while still ensuring the original objectives of causal inference and accurate predictions. To efficiently and rigorously process the dynamics of soft-clustering, we advocate for an information theory inspired approach that incorporates stochastic calculus and seeks to determine a trade-off between the predictive accuracy and compactness of the mathematical representation. 
We cast the model construction as a maximization of the compression of the state variables such that the predictive ability and causal interdependence (relatedness) constraints between the original data streams and the compact model are closely bounded. We provide theoretical guarantees concerning the convergence of the proposed learning algorithm. To further test the proposed framework, we consider a high-dimensional Gaussian case study and describe an iterative scheme for updating the new model parameters. Using numerical experiments, we demonstrate the benefits on compression and prediction accuracy for a class of dynamical systems. Finally, we apply the proposed algorithm to the real-world dataset of multimodal sentiment intensity and show improvements in prediction with reduced dimensions.", "keywords": "compact representation;perception;dynamical systems;information bottleneck", "primary_area": "", "supplementary_material": "", "author": "Gaurav Gupta;Mohamed Ridha Znaidi;Paul Bogdan", "authorids": "ggaurav@usc.edu;znaidi@usc.edu;pbogdan@usc.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ngupta2019learning,\ntitle={Learning Information Propagation in the Dynamical Systems via Information Bottleneck Hierarchy},\nauthor={Gaurav Gupta and Mohamed Ridha Znaidi and Paul Bogdan},\nyear={2019},\nurl={https://openreview.net/forum?id=rJgTciR9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJgTciR9tm", "pdf_size": 0, "rating": "4;5;5", "confidence": "2;4;3", "wc_review": "597;464;254", "wc_reply_reviewers": "299;0;0", "wc_reply_authors": "1241;658;252", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 438.3333333333333, 141.20040919046787 ], "wc_reply_reviewers_avg": [ 99.66666666666667, 140.94995171651846 ], "wc_reply_authors_avg": [ 717.0, 405.907214356516 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:yv_OphJvCNgJ:scholar.google.com/&scioq=Learning+Information+Propagation+in+the+Dynamical+Systems+via+Information+Bottleneck+Hierarchy&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Learning Implicitly Recurrent CNNs Through Parameter Sharing", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/854", "id": "rJgYxn09Fm", "author_site": "Pedro Savarese, Michael Maire", "tldr": "We propose a method that enables CNN folding to create recurrent connections", "abstract": "We introduce a parameter sharing scheme, in which different layers of a convolutional neural network (CNN) are defined by a learned linear combination of parameter tensors from a global bank of templates. Restricting the number of templates yields a flexible hybridization of traditional CNNs and recurrent networks. 
Compared to traditional CNNs, we demonstrate substantial parameter savings on standard image classification tasks, while maintaining accuracy.\nOur simple parameter sharing scheme, though defined via soft weights, in practice often yields trained networks with near strict recurrent structure; with negligible side effects, they convert into networks with actual loops. Training these networks thus implicitly involves discovery of suitable recurrent architectures. Though considering only the aspect of recurrent links, our trained networks achieve accuracy competitive with those built using state-of-the-art neural architecture search (NAS) procedures.\nOur hybridization of recurrent and convolutional networks may also represent a beneficial architectural bias. Specifically, on synthetic tasks which are algorithmic in nature, our hybrid networks both train faster and extrapolate better to test examples outside the span of the training set.", "keywords": "deep learning;architecture search;computer vision", "primary_area": "", "supplementary_material": "", "author": "Pedro Savarese;Michael Maire", "authorids": "savarese@ttic.edu;mmaire@uchicago.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nsavarese2018learning,\ntitle={Learning Implicitly Recurrent {CNN}s Through Parameter Sharing},\nauthor={Pedro Savarese and Michael Maire},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJgYxn09Fm},\n}", "github": "[![github](/images/github_icon.svg) lolemacs/soft-sharing](https://github.com/lolemacs/soft-sharing)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;3;4", "wc_review": "147;228;1448", "wc_reply_reviewers": "0;0;29", "wc_reply_authors": "512;317;2583", "reply_reviewers": "0;0;1", "reply_authors": "1;1;4", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 607.6666666666666, 595.1248235081069 ], "wc_reply_reviewers_avg": [ 9.666666666666666, 13.67073110293992 ], "wc_reply_authors_avg": [ 1137.3333333333333, 1025.335825744695 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15123734257747528548&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rJgYxn09Fm", "pdf": "https://openreview.net/pdf?id=rJgYxn09Fm", "email": ";", "author_num": 2 }, { "id": "rJg_NjCqtX", "title": "CHEMICAL NAMES STANDARDIZATION USING NEURAL SEQUENCE TO SEQUENCE MODEL", "track": "main", "status": "Reject", "tldr": "We designed an end-to-end framework using sequence to sequence model to do the chemical names standardization.", "abstract": "Chemical information extraction is to convert chemical knowledge in text into true chemical database, which is a text processing task heavily relying on chemical compound name identification and standardization. Once a systematic name for a chemical compound is given, it will naturally and much simply convert the name into the eventually required molecular formula. 
However, for many chemical substances, they have been shown in many other names besides their systematic names which poses a great challenge for this task. In this paper, we propose a framework to do the auto standardization from the non-systematic names to the corresponding systematic names by using the spelling error correction, byte pair encoding tokenization and neural sequence to sequence model. Our framework is trained end to end and is fully data-driven. Our standardization accuracy on the test dataset achieves 54.04% which has a great improvement compared to previous state-of-the-art result.", "keywords": "Chemical Names Standardization;Byte Pair Encoding;Sequence to Sequence Model", "primary_area": "", "supplementary_material": "", "author": "Junlang Zhan;Hai Zhao", "authorids": "longmr.zhan@sjtu.edu.cn;zhaohai@cs.sjtu.edu.cn", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhan2019chemical,\ntitle={{CHEMICAL} {NAMES} {STANDARDIZATION} {USING} {NEURAL} {SEQUENCE} {TO} {SEQUENCE} {MODEL}},\nauthor={Junlang Zhan and Hai Zhao},\nyear={2019},\nurl={https://openreview.net/forum?id=rJg_NjCqtX},\n}", "github": "[![github](/images/github_icon.svg) zhanjunlang/Neural_Chemical_Name_Standardization](https://github.com/zhanjunlang/Neural_Chemical_Name_Standardization)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJg_NjCqtX", "pdf_size": 0, "rating": "3;4;7", "confidence": "5;4;3", "wc_review": "372;143;234", "wc_reply_reviewers": "53;35;0", "wc_reply_authors": "302;530;105", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "rating_avg": [ 4.666666666666667, 1.699673171197595 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 249.66666666666666, 94.14291735913483 ], "wc_reply_reviewers_avg": [ 29.333333333333332, 22.00504992546534 ], "wc_reply_authors_avg": [ 312.3333333333333, 173.65930886524788 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.9607689228305228, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:e1nY3Th2jAQJ:scholar.google.com/&scioq=CHEMICAL+NAMES+STANDARDIZATION+USING+NEURAL+SEQUENCE+TO+SEQUENCE+MODEL&hl=en&as_sdt=0,5", "gs_version_total": 4 }, { "title": "Learning Particle Dynamics for Manipulating Rigid Bodies, Deformable Objects, and Fluids", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/852", "id": "rJgbSn09Ym", "author_site": "Yunzhu Li, Jiajun Wu, Russ Tedrake, Joshua B Tenenbaum, Antonio Torralba", "tldr": "Learning particle dynamics with dynamic interaction graphs for simulating and control rigid bodies, deformable objects, and fluids. ", "abstract": "Real-life control tasks involve matters of various substances---rigid or soft bodies, liquid, gas---each with distinct physical behaviors. This poses challenges to traditional rigid-body physics engines. Particle-based simulators have been developed to model the dynamics of these complex scenes; however, relying on approximation techniques, their simulation often deviates from real-world physics, especially in the long term. In this paper, we propose to learn a particle-based simulator for complex control tasks. 
Combining learning with particle-based systems brings in two major benefits: first, the learned simulator, just like other particle-based systems, acts widely on objects of different materials; second, the particle-based representation poses strong inductive bias for learning: particles of the same type have the same dynamics within. This enables the model to quickly adapt to new environments of unknown dynamics within a few observations. We demonstrate robots achieving complex manipulation tasks using the learned simulator, such as manipulating fluids and deformable foam, with experiments both in simulation and in the real world. Our study helps lay the foundation for robot learning of dynamic scenes with particle-based representations.", "keywords": "Dynamics modeling;Control;Particle-Based Representation", "primary_area": "", "supplementary_material": "", "author": "Yunzhu Li;Jiajun Wu;Russ Tedrake;Joshua B. Tenenbaum;Antonio Torralba", "authorids": "liyunzhu@mit.edu;jiajunwu@mit.edu;russt@mit.edu;jbt@mit.edu;torralba@mit.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nli2018learning,\ntitle={Learning Particle Dynamics for Manipulating Rigid Bodies, Deformable Objects, and Fluids},\nauthor={Yunzhu Li and Jiajun Wu and Russ Tedrake and Joshua B. Tenenbaum and Antonio Torralba},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJgbSn09Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;3;3", "wc_review": "477;228;668", "wc_reply_reviewers": "189;0;119", "wc_reply_authors": "692;323;620", "reply_reviewers": "1;0;1", "reply_authors": "2;1;2", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 457.6666666666667, 180.14870400742703 ], "wc_reply_reviewers_avg": [ 102.66666666666667, 78.01851632073561 ], "wc_reply_authors_avg": [ 545.0, 159.70597985047397 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 435, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1764275287835987384&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=rJgbSn09Ym", "pdf": "https://openreview.net/pdf?id=rJgbSn09Ym", "email": ";;;;", "author_num": 5 }, { "id": "rJgdHs05FQ", "title": "Nonlinear Channels Aggregation Networks for Deep Action Recognition", "track": "main", "status": "Withdraw", "tldr": "An architecture enables CNN trained on the video sequences converging rapidly ", "abstract": "We introduce the concept of channel aggregation in ConvNet architecture, a novel compact representation of CNN features useful for explicitly modeling the nonlinear channels encoding especially when the new unit is embedded inside of deep architectures for action recognition. The channel aggregation is based on multiple-channels features of ConvNet and aims to be at the spot finding the optical convergence path at fast speed. 
We name our proposed convolutional architecture \u201cnonlinear channels aggregation networks (NCAN)\u201d and its new layer \u201cnonlinear channels aggregation layer (NCAL)\u201d. We theoretically motivate channels aggregation functions and empirically study their effect on convergence speed and classification accuracy. Another contribution in this work is an efficient and effective implementation of the NCAL, speeding it up orders of magnitude. We evaluate its performance on standard benchmarks UCF101 and HMDB51, and experimental results demonstrate that this formulation not only obtains a fast convergence but stronger generalization capability without sacrificing performance.", "keywords": "action recognition;convolutional neural network;network training", "primary_area": "", "supplementary_material": "", "author": "Zhigang Zhu;Hongbing Ji;Wenbo Zhang;Cheng Ouyang", "authorids": "zgzhu_xidian@163.com;hbji@xidian.edu.cn;zwbsoul@163.com;ouoyc@aliyun.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJgdHs05FQ", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;3", "wc_review": "369;806;183", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 452.6666666666667, 261.1287464493755 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WDJXT_GYaSwJ:scholar.google.com/&scioq=Nonlinear+Channels+Aggregation+Networks+for+Deep+Action+Recognition&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJgfjjC9Ym", "title": "Backprop with Approximate Activations for Memory-efficient Network Training", "track": "main", "status": "Reject", "tldr": "An algorithm to reduce the amount of memory required for training deep networks, based on an approximation strategy.", "abstract": "With innovations in architecture design, deeper and wider neural network models deliver improved performance on a diverse variety of tasks. But the increased memory footprint of these models presents a challenge during training, when all intermediate layer activations need to be stored for back-propagation. Limited GPU memory forces practitioners to make sub-optimal choices: either train inefficiently with smaller batches of examples; or limit the architecture to have lower depth and width, and fewer layers at higher spatial resolutions. This work introduces an approximation strategy that significantly reduces a network's memory footprint during training, but has negligible effect on training performance and computational expense. During the forward pass, we replace activations with lower-precision approximations immediately after they have been used by subsequent layers, thus freeing up memory. The approximate activations are then used during the backward pass. 
This approach limits the accumulation of errors across the forward and backward pass---because the forward computation across the network still happens at full precision, and the approximation has a limited effect when computing gradients to a layer's input. Experiments, on CIFAR and ImageNet, show that using our approach with 8- and even 4-bit fixed-point approximations of 32-bit floating-point activations has only a minor effect on training and validation performance, while affording significant savings in memory usage.", "keywords": "Back-propagation;Memory Efficient Training;Approximate Gradients;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Ayan Chakrabarti;Benjamin Moseley", "authorids": "ayan@wustl.edu;moseleyb@andrew.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nchakrabarti2019backprop,\ntitle={Backprop with Approximate Activations for Memory-efficient Network Training},\nauthor={Ayan Chakrabarti and Benjamin Moseley},\nyear={2019},\nurl={https://openreview.net/forum?id=rJgfjjC9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJgfjjC9Ym", "pdf_size": 0, "rating": "5;5;7", "confidence": "5;3;4", "wc_review": "221;302;324", "wc_reply_reviewers": "85;0;0", "wc_reply_authors": "780;454;571", "reply_reviewers": "2;0;0", "reply_authors": "4;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 282.3333333333333, 44.28945196720722 ], "wc_reply_reviewers_avg": [ 28.333333333333332, 40.069384267237695 ], "wc_reply_authors_avg": [ 601.6666666666666, 134.84394272227763 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5757416287639825852&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "rJgvf3RcFQ", "title": "On Inductive Biases in Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Many deep reinforcement learning algorithms contain inductive biases that sculpt the agent's objective and its interface to the environment. These inductive biases can take many forms, including domain knowledge and pretuned hyper-parameters. In general, there is a trade-off between generality and performance when we use such biases. Stronger biases can lead to faster learning, but weaker biases can potentially lead to more general algorithms that work on a wider class of problems.\nThis trade-off is relevant because these inductive biases are not free; substantial effort may be required to obtain relevant domain knowledge or to tune hyper-parameters effectively. In this paper, we re-examine several domain-specific components that modify the agent's objective and environmental interface. We investigated whether the performance deteriorates when all these fixed components are replaced with adaptive solutions from the literature. In our experiments, performance sometimes decreased with the adaptive components, as one might expect when comparing to components crafted for the domain, but sometimes the adaptive components performed better. 
We then investigated the main benefit of having fewer domain-specific components, by comparing the learning performance of the two systems on a different set of continuous control problems, without additional tuning of either system. As hypothesized, the system with adaptive components performed better on many of the tasks.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Matteo Hessel;Hado van Hasselt;Joseph Modayil;David Silver", "authorids": "mtthss@google.com;hado@google.com;modayil@google.com;davidsilver@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nhessel2019on,\ntitle={On Inductive Biases in Deep Reinforcement Learning},\nauthor={Matteo Hessel and Hado van Hasselt and Joseph Modayil and David Silver},\nyear={2019},\nurl={https://openreview.net/forum?id=rJgvf3RcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJgvf3RcFQ", "pdf_size": 0, "rating": "3;3;7", "confidence": "4;4;2", "wc_review": "161;352;73", "wc_reply_reviewers": "0;90;0", "wc_reply_authors": "195;72;28", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 195.33333333333334, 116.45981662740539 ], "wc_reply_reviewers_avg": [ 30.0, 42.42640687119285 ], "wc_reply_authors_avg": [ 98.33333333333333, 70.67452786463372 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12310475139188121141&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rJgz8sA5F7", "title": "HC-Net: Memory-based Incremental Dual-Network System for Continual learning", "track": "main", "status": "Reject", "tldr": "In this paper, we propose a network which efficiently increases its complexity without degrading the performance of previous tasks inspired by the brain system of human being", "abstract": "Training a neural network for a classification task typically assumes that the data to train are given from the beginning.\nHowever, in the real world, additional data accumulate gradually and the model requires additional training without accessing the old training data. 
This usually leads to the catastrophic forgetting problem which is inevitable for the traditional training methodology of neural networks.\nIn this paper, we propose a memory-based continual learning method that is able to learn additional tasks while retaining the performance of previously learned tasks.\nComposed of two complementary networks, the Hippocampus-Net (H-Net) and the Cortex-Net (C-Net), our model estimates the index of the corresponding task for an input sample and utilizes a particular portion of itself with the estimated index.\nThe C-Net guarantees no degradation in the performance of the previously learned tasks and the H-Net shows high confidence in finding the origin of an input sample.", "keywords": "continual learning;lifelong learning;catastrophic forgetting", "primary_area": "", "supplementary_material": "", "author": "Jangho Kim;Jeesoo Kim;Nojun Kwak", "authorids": "kjh91@snu.ac.kr;kimjiss0305@snu.ac.kr;nojunk@snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkim2019hcnet,\ntitle={{HC}-Net: Memory-based Incremental Dual-Network System for Continual learning},\nauthor={Jangho Kim and Jeesoo Kim and Nojun Kwak},\nyear={2019},\nurl={https://openreview.net/forum?id=rJgz8sA5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJgz8sA5F7", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;4;3", "wc_review": "455;361;193", "wc_reply_reviewers": "12;22;0", "wc_reply_authors": "215;421;215", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 336.3333333333333, 108.3738385814994 ], "wc_reply_reviewers_avg": [ 11.333333333333334, 8.993825042154695 ], "wc_reply_authors_avg": [ 283.6666666666667, 97.10933128295252 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9718004432089239965&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJl-HsR9KX", "title": "Discriminative Active Learning", "track": "main", "status": "Reject", "tldr": "A new active learning algorithm for the batch mode setting using neural networks", "abstract": "We propose a new batch mode active learning algorithm designed for neural networks and large query batch sizes. The method, Discriminative Active Learning (DAL), poses active learning as a binary classification task, attempting to choose examples to label in such a way as to make the labeled set and the unlabeled pool indistinguishable. Experimenting on image classification tasks, we empirically show our method to be on par with state of the art methods in medium and large query batch sizes, while being simple to implement and also extend to other domains besides classification tasks. 
Our experiments also show that none of the state of the art methods of today are clearly better than uncertainty sampling, negating some of the reported results in the recent literature.", "keywords": "Active Learning;Neural Networks", "primary_area": "", "supplementary_material": "", "author": "Daniel Gissin;Shai Shalev-Shwartz", "authorids": "daniel.gissin@mail.huji.ac.il;shais@cs.huji.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngissin2019discriminative,\ntitle={Discriminative Active Learning},\nauthor={Daniel Gissin and Shai Shalev-Shwartz},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl-HsR9KX},\n}", "github": "[![github](/images/github_icon.svg) dsgissin/DiscriminativeActiveLearning](https://github.com/dsgissin/DiscriminativeActiveLearning) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rJl-HsR9KX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJl-HsR9KX", "pdf_size": 0, "rating": "4;6;8", "confidence": "4;4;4", "wc_review": "618;551;446", "wc_reply_reviewers": "24;186;0", "wc_reply_authors": "1084;806;255", "reply_reviewers": "1;2;0", "reply_authors": "2;2;1", "rating_avg": [ 6.0, 1.632993161855452 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 538.3333333333334, 70.78763231588474 ], "wc_reply_reviewers_avg": [ 70.0, 82.60750571225353 ], "wc_reply_authors_avg": [ 715.0, 344.50060474064003 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 247, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13126030920953179607&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/972", "id": "rJl-b3RcF7", "author_site": "Jonathan Frankle, Michael Carbin", "tldr": "Feedforward neural networks that can have weights pruned after training could have had the same weights pruned before training", "abstract": "Neural network pruning techniques can reduce the parameter counts of trained networks by over 90%, decreasing storage requirements and improving computational performance of inference without compromising accuracy. However, contemporary experience is that the sparse architectures produced by pruning are difficult to train from the start, which would similarly improve training performance.\n\nWe find that a standard pruning technique naturally uncovers subnetworks whose initializations made them capable of training effectively. Based on these results, we articulate the \"lottery ticket hypothesis:\" dense, randomly-initialized, feed-forward networks contain subnetworks (\"winning tickets\") that - when trained in isolation - reach test accuracy comparable to the original network in a similar number of iterations. The winning tickets we find have won the initialization lottery: their connections have initial weights that make training particularly effective.\n\nWe present an algorithm to identify winning tickets and a series of experiments that support the lottery ticket hypothesis and the importance of these fortuitous initializations. 
We consistently find winning tickets that are less than 10-20% of the size of several fully-connected and convolutional feed-forward architectures for MNIST and CIFAR10. Above this size, the winning tickets that we find learn faster than the original network and reach higher test accuracy.", "keywords": "Neural networks;sparsity;pruning;compression;performance;architecture search", "primary_area": "", "supplementary_material": "", "author": "Jonathan Frankle;Michael Carbin", "authorids": "jfrankle@mit.edu;mcarbin@csail.mit.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nfrankle2018the,\ntitle={The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks},\nauthor={Jonathan Frankle and Michael Carbin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl-b3RcF7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 24 community implementations](https://paperswithcode.com/paper/?openreview=rJl-b3RcF7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;9;9", "confidence": "4;4;4", "wc_review": "847;954;884", "wc_reply_reviewers": "0;166;30", "wc_reply_authors": "1055;1048;632", "reply_reviewers": "0;1;1", "reply_authors": "2;2;1", "rating_avg": [ 7.666666666666667, 1.8856180831641267 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 895.0, 44.36965930302674 ], "wc_reply_reviewers_avg": [ 65.33333333333333, 72.22803395419876 ], "wc_reply_authors_avg": [ 911.6666666666666, 197.77484392330814 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 4377, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14267585926782353027&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=rJl-b3RcF7", "pdf": "https://openreview.net/pdf?id=rJl-b3RcF7", "email": ";", "author_num": 2 }, { "title": "Regularized Learning for Domain Adaptation under Label Shifts", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/867", "id": "rJl0r3R9KX", "author_site": "Kamyar Azizzadenesheli, Anqi Liu, Fanny Yang, Anima Anandkumar", "tldr": "A practical and provably guaranteed approach for training efficiently classifiers in the presence of label shifts between Source and Target data sets", "abstract": "We propose Regularized Learning under Label shifts (RLLS), a principled and a practical domain-adaptation algorithm to correct for shifts in the label distribution between a source and a target domain. We first estimate importance weights using labeled source data and unlabeled target data, and then train a classifier on the weighted source samples. We derive a generalization bound for the classifier on the target domain which is independent of the (ambient) data dimensions, and instead only depends on the complexity of the function class. To the best of our knowledge, this is the first generalization bound for the label-shift problem where the labels in the target domain are not available. Based on this bound, we propose a regularized estimator for the small-sample regime which accounts for the uncertainty in the estimated weights. 
Experiments on the CIFAR-10 and MNIST datasets show that RLLS improves classification accuracy, especially in the low sample and large-shift regimes, compared to previous methods.", "keywords": "Deep Learning;Domain Adaptation;Label Shift;Importance Weights;Generalization", "primary_area": "", "supplementary_material": "", "author": "Kamyar Azizzadenesheli;Anqi Liu;Fanny Yang;Animashree Anandkumar", "authorids": "kazizzad@uci.edu;anqiliu@caltech.edu;fan.yang@stat.math.ethz.ch;anima@caltech.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nazizzadenesheli2018regularized,\ntitle={Regularized Learning for Domain Adaptation under Label Shifts},\nauthor={Kamyar Azizzadenesheli and Anqi Liu and Fanny Yang and Animashree Anandkumar},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl0r3R9KX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rJl0r3R9KX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;3", "wc_review": "749;404;179", "wc_reply_reviewers": "34;0;0", "wc_reply_authors": "1044;835;301", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 444.0, 234.41416339462086 ], "wc_reply_reviewers_avg": [ 11.333333333333334, 16.027753706895076 ], "wc_reply_authors_avg": [ 726.6666666666666, 312.851757582121 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 263, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9783659999001427739&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=rJl0r3R9KX", "pdf": "https://openreview.net/pdf?id=rJl0r3R9KX", "email": ";;;", "author_num": 4 }, { "id": "rJl2E3AcF7", "title": "Doubly Sparse: Sparse Mixture of Sparse Experts for Efficient Softmax Inference", "track": "main", "status": "Reject", "tldr": "We present doubly sparse softmax, the sparse mixture of sparse of sparse experts, to improve the efficiency for softmax inference through exploiting the two-level overlapping hierarchy. ", "abstract": "Computations for the softmax function in neural network models are expensive when the number of output classes is large. This can become a significant issue in both training and inference for such models. In this paper, we present Doubly Sparse Softmax (DS-Softmax), Sparse Mixture of Sparse of Sparse Experts, to improve the efficiency for softmax inference. During training, our method learns a two-level class hierarchy by dividing entire output class space into several partially overlapping experts. Each expert is responsible for a learned subset of the output class space and each output class only belongs to a small number of those experts. During inference, our method quickly locates the most probable expert to compute small-scale softmax. Our method is learning-based and requires no knowledge of the output class partition space a priori. 
We empirically evaluate our method on several real-world tasks and demonstrate that we can achieve significant computation reductions without loss of performance.", "keywords": "hierarchical softmax;model compression", "primary_area": "", "supplementary_material": "", "author": "Shun Liao;Ting Chen;Tian Lin;Chong Wang;Dengyong Zhou", "authorids": "sliao3@cs.toronto.edu;tingchen@cs.ucla.edu;tianlin@google.com;dennyzhou@google.com;chongw@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nliao2019doubly,\ntitle={Doubly Sparse: Sparse Mixture of Sparse Experts for Efficient Softmax Inference},\nauthor={Shun Liao and Ting Chen and Tian Lin and Chong Wang and Dengyong Zhou},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl2E3AcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJl2E3AcF7", "pdf_size": 0, "rating": "4;6;7", "confidence": "3;3;3", "wc_review": "136;333;204", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "299;350;332", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 224.33333333333334, 81.69999320005411 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 327.0, 21.118712081942874 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16884193074330706895&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "rJl3S2A9t7", "title": "Policy Optimization via Stochastic Recursive Gradient Algorithm", "track": "main", "status": "Reject", "tldr": "This paper proposes the StochAstic Recursive Gradient Policy Optimization (SARAPO) algorithm based on the novel SARAH method, and exemplifies its advantages over existing policy gradient methods from both theory and experiments.", "abstract": "In this paper, we propose the StochAstic Recursive grAdient Policy Optimization (SARAPO) algorithm which is a novel variance reduction method on Trust Region Policy Optimization (TRPO). The algorithm incorporates the StochAstic Recursive grAdient algoritHm(SARAH) into the TRPO framework. Compared with the existing Stochastic Variance Reduced Policy Optimization (SVRPO), our algorithm is more stable in the variance. Furthermore, by theoretical analysis the ordinary differential equation and the stochastic differential equation (ODE/SDE) of SARAH, we analyze its convergence property and stability. Our experiments demonstrate its performance on a variety of benchmark tasks. 
We show that our algorithm gets better improvement in each iteration and matches or even outperforms SVRPO and TRPO.\n", "keywords": "reinforcement learning;policy gradient;variance reduction;stochastic recursive gradient algorithm", "primary_area": "", "supplementary_material": "", "author": "Huizhuo Yuan;Chris Junchi Li;Yuhao Tang;Yuren Zhou", "authorids": "yuanhz@pku.edu.cn;junchi.li.duke@gmail.com;yuhaotang97@gmail.com;yuren.zhou@duke.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyuan2019policy,\ntitle={Policy Optimization via Stochastic Recursive Gradient Algorithm},\nauthor={Huizhuo Yuan and Chris Junchi Li and Yuhao Tang and Yuren Zhou},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl3S2A9t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=rJl3S2A9t7", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;2", "wc_review": "175;401;81", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 219.0, 134.29321154349788 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1166002597573117873&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 0 }, { "id": "rJl4BsR5KX", "title": "k-Nearest Neighbors by Means of Sequence to Sequence Deep Neural Networks and Memory Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "k-Nearest Neighbors is one of the most fundamental but effective classification models. In this paper, we propose two families of models built on a sequence to sequence model and a memory network model to mimic the k-Nearest Neighbors model, which generate a sequence of labels, a sequence of out-of-sample feature vectors and a final label for classification, and thus they could also function as oversamplers. We also propose `out-of-core' versions of our models which assume that only a small portion of data can be loaded into memory. Computational experiments show that our models outperform k-Nearest Neighbors, a feed-forward neural network and a memory network, due to the fact that our models must produce additional output and not just the label. 
As an oversampler on imbalanced datasets, the sequence to sequence kNN model often outperforms Synthetic Minority Over-sampling Technique and Adaptive Synthetic Sampling.\n", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yiming Xu;Diego Klabjan", "authorids": "yimingxu2020@u.northwestern.edu;d-klabjan@northwestern.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nxu2019knearest,\ntitle={k-Nearest Neighbors by Means of Sequence to Sequence Deep Neural Networks and Memory Networks},\nauthor={Yiming Xu and Diego Klabjan},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl4BsR5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJl4BsR5KX", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;4", "wc_review": "410;470;768", "wc_reply_reviewers": "0;92;108", "wc_reply_authors": "493;801;670", "reply_reviewers": "0;1;1", "reply_authors": "1;2;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 549.3333333333334, 156.54889190139787 ], "wc_reply_reviewers_avg": [ 66.66666666666667, 47.59084879353266 ], "wc_reply_authors_avg": [ 654.6666666666666, 126.20706090477752 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12383209615758926554&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "rJl6M2C5Y7", "title": "Online Hyperparameter Adaptation via Amortized Proximal Optimization", "track": "main", "status": "Reject", "tldr": "We introduce amortized proximal optimization (APO), a method to adapt a variety of optimization hyperparameters online during training, including learning rates, damping coefficients, and gradient variance exponents.", "abstract": "Effective performance of neural networks depends critically on effective tuning of optimization hyperparameters, especially learning rates (and schedules thereof). We present Amortized Proximal Optimization (APO), which takes the perspective that each optimization step should approximately minimize a proximal objective (similar to the ones used to motivate natural gradient and trust region policy optimization). Optimization hyperparameters are adapted to best minimize the proximal objective after one weight update. We show that an idealized version of APO (where an oracle minimizes the proximal objective exactly) achieves global convergence to stationary point and locally second-order convergence to global optimum for neural networks. APO incurs minimal computational overhead. We experiment with using APO to adapt a variety of optimization hyperparameters online during training, including (possibly layer-specific) learning rates, damping coefficients, and gradient variance exponents. For a variety of network architectures and optimization algorithms (including SGD, RMSprop, and K-FAC), we show that with minimal tuning, APO performs competitively with carefully tuned optimizers.", "keywords": "hyperparameters;optimization;learning rate adaptation", "primary_area": "", "supplementary_material": "", "author": "Paul Vicol;Jeffery Z. 
HaoChen;Roger Grosse", "authorids": "pvicol@cs.toronto.edu;zhc15@mails.tsinghua.edu.cn;rgrosse@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nvicol2019online,\ntitle={Online Hyperparameter Adaptation via Amortized Proximal Optimization},\nauthor={Paul Vicol and Jeffery Z. HaoChen and Roger Grosse},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl6M2C5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJl6M2C5Y7", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;3", "wc_review": "219;293;358", "wc_reply_reviewers": "257;0;0", "wc_reply_authors": "1249;653;580", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 290.0, 56.786148545808835 ], "wc_reply_reviewers_avg": [ 85.66666666666667, 121.15096184329514 ], "wc_reply_authors_avg": [ 827.3333333333334, 299.6490539874197 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:bKiAbvpcGn4J:scholar.google.com/&scioq=Online+Hyperparameter+Adaptation+via+Amortized+Proximal+Optimization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJl8BhRqF7", "title": "Improving machine classification using human uncertainty measurements", "track": "main", "status": "Reject", "tldr": "improving classifiers using human uncertainty measurements", "abstract": "As deep CNN classifier performance using ground-truth labels has begun to asymptote at near-perfect levels, a key aim for the field is to extend training paradigms to capture further useful structure in natural image data and improve model robustness and generalization. In this paper, we present a novel natural image benchmark for making this extension, which we call CIFAR10H. This new dataset comprises a human-derived, full distribution over labels for each image of the CIFAR10 test set, offering the ability to assess the generalization of state-of-the-art CIFAR10 models, as well as investigate the effects of including this information in model training. We show that classification models trained on CIFAR10 do not generalize as well to our dataset as it does to traditional extensions, and that models fine-tuned using our label information are able to generalize better to related datasets, complement popular data augmentation schemes, and provide robustness to adversarial attacks. We explain these improvements in terms of better empirical approximations to the expected loss function over natural images and their categories in the visual world.", "keywords": "image classification;human experiments;risk minimization", "primary_area": "", "supplementary_material": "", "author": "Ruairidh M. Battleday;Joshua C. Peterson;Thomas L. 
Griffiths", "authorids": "ruairidh.battleday@gmail.com;peterson.c.joshua@gmail.com;tomg@princeton.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbattleday2019improving,\ntitle={Improving machine classification using human uncertainty measurements},\nauthor={Ruairidh M. Battleday and Joshua C. Peterson and Thomas L. Griffiths},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl8BhRqF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJl8BhRqF7", "pdf_size": 0, "rating": "3;3;6", "confidence": "2;5;4", "wc_review": "420;171;620", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 403.6666666666667, 183.66696914675637 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.18898223650461363, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12746275423676314108&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rJl8FoRcY7", "title": "Deep Generative Models for learning Coherent Latent Representations from Multi-Modal Data", "track": "main", "status": "Reject", "tldr": "Deriving a general formulation of a multi-modal VAE from the joint marginal log-likelihood.", "abstract": "The application of multi-modal generative models by means of a Variational Auto Encoder (VAE) is an upcoming research topic for sensor fusion and bi-directional modality exchange.\nThis contribution gives insights into the learned joint latent representation and shows that expressiveness and coherence are decisive properties for multi-modal datasets.\nFurthermore, we propose a multi-modal VAE derived from the full joint marginal log-likelihood that is able to learn the most meaningful representation for ambiguous observations.\nSince the properties of multi-modal sensor setups are essential for our approach but hardly available, we also propose a technique to generate correlated datasets from uni-modal ones.\n", "keywords": "Multi-Modal Deep Generative Models;Sensor Fusion;Data Generation;VAE", "primary_area": "", "supplementary_material": "", "author": "Timo Korthals;Marc Hesse;J\u00fcrgen Leitner", "authorids": "korthals.timo@gmail.com;mhesse@cit-ec.uni-bielefeld.de;juxi.leitner@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nkorthals2019deep,\ntitle={Deep Generative Models for learning Coherent Latent Representations from Multi-Modal Data},\nauthor={Timo Korthals and Marc Hesse and J\u00fcrgen Leitner},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl8FoRcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJl8FoRcY7", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;3;2", "wc_review": "269;205;125", "wc_reply_reviewers": "154;0;0", "wc_reply_authors": "1038;813;489", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.0, 0.0 ], 
"confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 199.66666666666666, 58.90859209166539 ], "wc_reply_reviewers_avg": [ 51.333333333333336, 72.59629620181887 ], "wc_reply_authors_avg": [ 780.0, 225.33974349856706 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:RNElQaeU_j4J:scholar.google.com/&scioq=Deep+Generative+Models+for+learning+Coherent+Latent+Representations+from+Multi-Modal+Data&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJl8viCqKQ", "title": "Low Latency Privacy Preserving Inference", "track": "main", "status": "Reject", "tldr": "This work presents methods, combining neural-networks and encryptions, to make predictions while preserving the privacy of the data owner with low latency", "abstract": "When applying machine learning to sensitive data one has to balance between accuracy, information leakage, and computational-complexity. Recent studies have shown that Homomorphic Encryption (HE) can be used for protecting against information leakage while applying neural networks. However, this comes with the cost of limiting the kind of neural networks that can be used (and hence the accuracy) and with latency of the order of several minutes even for relatively simple networks. In this study we improve on previous results both in the kind of networks that can be applied and in terms of the latency. Most of the improvement is achieved by novel ways to represent the data to make better use of the capabilities of the encryption scheme.", "keywords": "privacy;classification;homomorphic encryption;neural networks", "primary_area": "", "supplementary_material": "", "author": "Alon Brutzkus;Oren Elisha;Ran Gilad-Bachrach", "authorids": "brutzkus@gmail.com;oren.elisha@microsoft.com;rani.gb@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbrutzkus2019low,\ntitle={Low Latency Privacy Preserving Inference},\nauthor={Alon Brutzkus and Oren Elisha and Ran Gilad-Bachrach},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl8viCqKQ},\n}", "github": "[![github](/images/github_icon.svg) microsoft/CryptoNets](https://github.com/microsoft/CryptoNets)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=rJl8viCqKQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;2", "wc_review": "472;436;58", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "736;455;48", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 322.0, 187.2538384119268 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 413.0, 282.4405542174613 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 321, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=86142108232916247&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "title": "Von Mises-Fisher Loss for Training Sequence to Sequence Models with Continuous Outputs", "status": "Poster", "track": "main", "site": 
"https://iclr.cc/virtual/2019/poster/1102", "id": "rJlDnoA5Y7", "author_site": "Sachin Kumar, Yulia Tsvetkov", "tldr": "Language generation using seq2seq models which produce word embeddings instead of a softmax based distribution over the vocabulary at each step enabling much faster training while maintaining generation quality", "abstract": "The Softmax function is used in the final layer of nearly all existing sequence-to-sequence models for language generation. However, it is usually the slowest layer to compute which limits the vocabulary size to a subset of most frequent types; and it has a large memory footprint. We propose a general technique for replacing the softmax layer with a continuous embedding layer. Our primary innovations are a novel probabilistic loss, and a training and inference procedure in which we generate a probability distribution over pre-trained word embeddings, instead of a multinomial distribution over the vocabulary obtained via softmax. We evaluate this new class of sequence-to-sequence models with continuous outputs on the task of neural machine translation. We show that our models obtain upto 2.5x speed-up in training time while performing on par with the state-of-the-art models in terms of translation quality. These models are capable of handling very large vocabularies without compromising on translation quality. They also produce more meaningful errors than in the softmax-based models, as these errors typically lie in a subspace of the vector space of the reference translations.", "keywords": "Language Generation;Regression;Word Embeddings;Machine Translation", "primary_area": "", "supplementary_material": "", "author": "Sachin Kumar;Yulia Tsvetkov", "authorids": "sachink@cs.cmu.edu;ytsvetko@cs.cmu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nkumar2018von,\ntitle={Von Mises-Fisher Loss for Training Sequence to Sequence Models with Continuous Outputs},\nauthor={Sachin Kumar and Yulia Tsvetkov},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlDnoA5Y7},\n}", "github": "[![github](/images/github_icon.svg) Sachin19/seq2seq-con](https://github.com/Sachin19/seq2seq-con)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;4;4", "wc_review": "362;264;514", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "312;341;349", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 380.0, 102.8526454043194 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 334.0, 15.895492023421818 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 86, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1822338940984352644&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rJlDnoA5Y7", "pdf": "https://openreview.net/pdf?id=rJlDnoA5Y7", "email": ";", "author_num": 2 }, { "title": "Relational Forward Models for Multi-Agent Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/971", 
"id": "rJlEojAqFm", "author_site": "Andrea Tacchetti, Francis Song, Pedro Mediano, Vinicius Zambaldi, J\u00e1nos Kram\u00e1r, Neil C Rabinowitz, Thore Graepel, Matthew Botvinick, Peter Battaglia", "tldr": "Relational Forward Models for multi-agent learning make accurate predictions of agents' future behavior, they produce intepretable representations and can be used inside agents.", "abstract": "The behavioral dynamics of multi-agent systems have a rich and orderly structure, which can be leveraged to understand these systems, and to improve how artificial agents learn to operate in them. Here we introduce Relational Forward Models (RFM) for multi-agent learning, networks that can learn to make accurate predictions of agents' future behavior in multi-agent environments. Because these models operate on the discrete entities and relations present in the environment, they produce interpretable intermediate representations which offer insights into what drives agents' behavior, and what events mediate the intensity and valence of social interactions. Furthermore, we show that embedding RFM modules inside agents results in faster learning systems compared to non-augmented baselines. \nAs more and more of the autonomous systems we develop and interact with become multi-agent in nature, developing richer analysis tools for characterizing how and why agents make decisions is increasingly necessary. Moreover, developing artificial agents that quickly and safely learn to coordinate with one another, and with humans in shared environments, is crucial.", "keywords": "multi-agent reinforcement learning;relational reasoning;forward models", "primary_area": "", "supplementary_material": "", "author": "Andrea Tacchetti;H. Francis Song;Pedro A. M. Mediano;Vinicius Zambaldi;J\u00e1nos Kram\u00e1r;Neil C. Rabinowitz;Thore Graepel;Matthew Botvinick;Peter W. Battaglia", "authorids": "atacchet@google.com;songf@google.com;pmediano@imperial.ac.uk;vzambaldi@google.com;janosk@google.com;ncr@google.com;thore@google.com;botvinick@google.com;peterbattaglia@google.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\ntacchetti2018relational,\ntitle={Relational Forward Models for Multi-Agent Learning},\nauthor={Andrea Tacchetti and H. Francis Song and Pedro A. M. Mediano and Vinicius Zambaldi and J\u00e1nos Kram\u00e1r and Neil C. Rabinowitz and Thore Graepel and Matthew Botvinick and Peter W. 
Battaglia},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlEojAqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3;AnonReviewer4", "pdf_size": 0, "rating": "6;6;7;7", "confidence": "4;3;3;3", "wc_review": "449;204;606;1239", "wc_reply_reviewers": "0;0;0;1519", "wc_reply_authors": "735;153;854;4546", "reply_reviewers": "0;0;0;7", "reply_authors": "2;1;2;9", "rating_avg": [ 6.5, 0.5 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "wc_review_avg": [ 624.5, 382.6137085887018 ], "wc_reply_reviewers_avg": [ 379.75, 657.7462941742812 ], "wc_reply_authors_avg": [ 1572.0, 1737.4068320344547 ], "reply_reviewers_avg": [ 1.75, 3.031088913245535 ], "reply_authors_avg": [ 3.5, 3.2015621187164243 ], "replies_avg": [ 27, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": -0.5773502691896257, "gs_citation": 95, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1450759969074634127&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rJlEojAqFm", "pdf": "https://openreview.net/pdf?id=rJlEojAqFm", "email": ";;;;;;;;", "author_num": 9 }, { "id": "rJlGdsC9Ym", "title": "Learning of Sophisticated Curriculums by viewing them as Graphs over Tasks", "track": "main", "status": "Withdraw", "tldr": "We present a new algorithm for learning by curriculum based on the notion of mastering rate that outperforms previous algorithms.", "abstract": "Curriculum learning consists in learning a difficult task by first training on an easy version of it, then on more and more difficult versions and finally on the difficult task. To make this learning efficient, given a curriculum and the current learning state of an agent, we need to find what are the good next tasks to train the agent on.\nTeacher-Student algorithms assume that the good next tasks are the ones on which the agent is making the fastest progress or digress. We first simplify and improve them. However, two problematic situations where the agent is mainly trained on tasks it can't learn yet or it already learnt may occur.\nTherefore, we introduce a new algorithm using min max ordered curriculums that assumes that the good next tasks are the ones that are learnable but not learnt yet. 
It outperforms Teacher-Student algorithms on small curriculums and significantly outperforms them on sophisticated ones with numerous tasks.", "keywords": "learning;curriculum learning;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Lucas Willems;Yoshua Bengio", "authorids": "lcswillems@gmail.com;yoshua.bengio@umontreal.ca", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJlGdsC9Ym", "pdf_size": 0, "rating": "2;3;4", "confidence": "2;1;4", "wc_review": "63;72;412", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "328;56;192", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 2.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 182.33333333333334, 162.44041642672826 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 192.0, 111.04353500617074 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.6546536707079772, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:w3uxTxfk6XEJ:scholar.google.com/&scioq=Learning+of+Sophisticated+Curriculums+by+viewing+them+as+Graphs+over+Tasks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJlHIo09KQ", "title": "Gradient-based Training of Slow Feature Analysis by Differentiable Approximate Whitening", "track": "main", "status": "Reject", "tldr": "We propose a way to train Slow Feature Analysis with stochastic gradient descent eliminating the need for greedy layer-wise training.", "abstract": "We propose Power Slow Feature Analysis, a gradient-based method to extract temporally slow features from a high-dimensional input stream that varies on a faster time-scale, as a variant of Slow Feature Analysis (SFA). While displaying performance comparable to hierarchical extensions to the SFA algorithm, such as Hierarchical Slow Feature Analysis, for a small number of output-features, our algorithm allows fully differentiable end-to-end training of arbitrary differentiable approximators (e.g., deep neural networks). 
We provide experimental evidence that PowerSFA is able to extract meaningful and informative low-dimensional features in the case of (a) synthetic low-dimensional data, (b) visual data, and also for (c) a general dataset for which symmetric non-temporal relations between points can be defined.", "keywords": "Slow Feature Analysis;Deep Learning;Spectral Embedding;Temporal Coherence", "primary_area": "", "supplementary_material": "", "author": "Merlin Sch\u00fcler;Hlynur Dav\u00ed\u00f0 Hlynsson;Laurenz Wiskott", "authorids": "merlin.schueler@ini.rub.de;hlynur.hlynsson@ini.rub.de;laurenz.wiskott@ini.rub.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsch\u00fcler2019gradientbased,\ntitle={Gradient-based Training of Slow Feature Analysis by Differentiable Approximate Whitening},\nauthor={Merlin Sch\u00fcler and Hlynur Dav\u00ed\u00f0 Hlynsson and Laurenz Wiskott},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlHIo09KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJlHIo09KQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "2;4;4", "wc_review": "292;376;118", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;571;242", "reply_reviewers": "0;0;0", "reply_authors": "0;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 262.0, 107.44300814850634 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 271.0, 234.00997129752113 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4116780982906632862&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "rJlJ-2CqtX", "title": "Success at any cost: value constrained model-free continuous control", "track": "main", "status": "Reject", "tldr": "We apply constrained optimization to continuous control tasks subject to a penalty to ensure a lower bound on the return, and learn the resulting conditional Lagrangian multipliers simultaneously with the policy.", "abstract": "Naively applying Reinforcement Learning algorithms to continuous control problems -- such as locomotion and robot control -- to maximize task reward often results in policies which rely on high-amplitude, high-frequency control signals, known colloquially as bang-bang control. While such policies can implement the optimal solution, particularly in simulated systems, they are often not desirable for real world systems since bang-bang control can lead to increased wear and tear and energy consumption and tends to excite undesired second-order dynamics. To counteract this issue, multi-objective optimization can be used to simultaneously optimize both the reward and some auxiliary cost that discourages undesired (e.g. high-amplitude) control. In principle, such an approach can yield the sought after, smooth, control policies. It can, however, be hard to find the correct trade-off between cost and return that results in the desired behavior. In this paper we propose a new constraint-based approach which defines a lower bound on the return while minimizing one or more costs (such as control effort). 
We employ Lagrangian relaxation to learn both (a) the parameters of a control policy that satisfies the desired constraints and (b) the Lagrangian multipliers for the optimization. Moreover, we demonstrate policy optimization which satisfies constraints either in expectation or in a per-step fashion, and we learn a single conditional policy that is able to dynamically change the trade-off between return and cost. We demonstrate the efficiency of our approach using a number of continuous control benchmark tasks as well as a realistic, energy-optimized quadruped locomotion task.", "keywords": "reinforcement learning;continuous control;robotics;constrained optimization;multi-objective optimization", "primary_area": "", "supplementary_material": "", "author": "Steven Bohez;Abbas Abdolmaleki;Michael Neunert;Jonas Buchli;Nicolas Heess;Raia Hadsell", "authorids": "sbohez@google.com;aabdolmaleki@google.com;neunertm@google.com;buchli@google.com;heess@google.com;raia@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nbohez2019success,\ntitle={Success at any cost: value constrained model-free continuous control},\nauthor={Steven Bohez and Abbas Abdolmaleki and Michael Neunert and Jonas Buchli and Nicolas Heess and Raia Hadsell},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlJ-2CqtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=rJlJ-2CqtX", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "wc_review": "134;437;492", "wc_reply_reviewers": "128;125;74", "wc_reply_authors": "237;459;1095", "reply_reviewers": "1;1;1", "reply_authors": "1;1;3", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 354.3333333333333, 157.4088801674021 ], "wc_reply_reviewers_avg": [ 109.0, 24.779023386727733 ], "wc_reply_authors_avg": [ 597.0, 363.6151812012254 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5908850594766930709&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJlMBjAcYX", "title": "Optimizing for Generalization in Machine Learning with Cross-Validation Gradients", "track": "main", "status": "Reject", "tldr": "", "abstract": "Cross-validation is the workhorse of modern applied statistics and machine learning, as it provides a principled framework for selecting the model that maximizes generalization performance. In this paper, we show that the cross-validation risk is differentiable with respect to the hyperparameters and training data for many common machine learning algorithms, including logistic regression, elastic-net regression, and support vector machines. Leveraging this property of differentiability, we propose a cross-validation gradient method (CVGM) for hyperparameter optimization. 
Our method enables efficient optimization in high-dimensional hyperparameter spaces of the cross-validation risk, the best surrogate of the true generalization ability of our learning algorithm.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Barratt;Shane;Sharma;Rishi", "authorids": ";;;", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbarratt2019optimizing,\ntitle={Optimizing for Generalization in Machine Learning with Cross-Validation Gradients},\nauthor={Barratt and Shane and Sharma and Rishi},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlMBjAcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rJlMBjAcYX", "pdf_size": 0, "rating": "2;4;5", "confidence": "4;4;5", "wc_review": "452;246;241", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 313.0, 98.309036546325 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8640038540042720095&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "rJlRKjActQ", "title": "Manifold Mixup: Learning Better Representations by Interpolating Hidden States", "track": "main", "status": "Reject", "tldr": "A method for learning better representations, that acts as a regularizer and despite its no significant additional computation cost , achieves improvements over strong baselines on Supervised and Semi-supervised Learning tasks.", "abstract": "Deep networks often perform well on the data distribution on which they are trained, yet give incorrect (and often very confident) answers when evaluated on points from off of the training distribution. This is exemplified by the adversarial examples phenomenon but can also be seen in terms of model generalization and domain shift. Ideally, a model would assign lower confidence to points unlike those from the training distribution. We propose a regularizer which addresses this issue by training with interpolated hidden states and encouraging the classifier to be less confident at these points. Because the hidden states are learned, this has an important effect of encouraging the hidden states for a class to be concentrated in such a way so that interpolations within the same class or between two different classes do not intersect with the real data points from other classes. This has a major advantage in that it avoids the underfitting which can result from interpolating in the input space. We prove that the exact condition for this problem of underfitting to be avoided by Manifold Mixup is that the dimensionality of the hidden states exceeds the number of classes, which is often the case in practice. Additionally, this concentration can be seen as making the features in earlier layers more discriminative. 
We show that despite requiring no significant additional computation, Manifold Mixup achieves large improvements over strong baselines in supervised learning, robustness to single-step adversarial attacks, semi-supervised learning, and Negative Log-Likelihood on held out samples.", "keywords": "Regularizer;Supervised Learning;Semi-supervised Learning;Better representation learning;Deep Neural Networks.", "primary_area": "", "supplementary_material": "", "author": "Vikas Verma;Alex Lamb;Christopher Beckham;Amir Najafi;Aaron Courville;Ioannis Mitliagkas;Yoshua Bengio", "authorids": "vikasverma.iitm@gmail.com;lambalex@iro.umontreal.ca;christopher.j.beckham@gmail.com;najafy@ce.sharif.edu;aaron.courville@gmail.com;imitliagkas@gmail.com;yoshua.umontreal@gmail.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nverma2019manifold,\ntitle={Manifold Mixup: Learning Better Representations by Interpolating Hidden States},\nauthor={Vikas Verma and Alex Lamb and Christopher Beckham and Amir Najafi and Aaron Courville and Ioannis Mitliagkas and Yoshua Bengio},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlRKjActQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJlRKjActQ", "pdf_size": 0, "rating": "4;6;8", "confidence": "4;4;2", "wc_review": "562;204;358", "wc_reply_reviewers": "697;0;0", "wc_reply_authors": "4773;419;689", "reply_reviewers": "2;0;0", "reply_authors": "12;3;2", "rating_avg": [ 6.0, 1.632993161855452 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 374.6666666666667, 146.62726743534287 ], "wc_reply_reviewers_avg": [ 232.33333333333334, 328.5689509913491 ], "wc_reply_authors_avg": [ 1960.3333333333333, 1991.9078515054077 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 5.666666666666667, 4.4969125210773475 ], "replies_avg": [ 37, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9316395507848195617&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Imposing Category Trees Onto Word-Embeddings Using A Geometric Construction", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/811", "id": "rJlWOj0qF7", "author_site": "Tiansi Dong, Christian Bauckhage, Hailong Jin, Juanzi Li, Olaf Cremers, Daniel Speicher, Armin Cremers, Joerg Zimmermann", "tldr": "we show a geometric method to perfectly encode categroy tree information into pre-trained word-embeddings.", "abstract": "We present a novel method to precisely impose tree-structured category information onto word-embeddings, resulting in ball embeddings in higher dimensional spaces (N-balls for short). Inclusion relations among N-balls implicitly encode subordinate relations among categories. The similarity measurement in terms of the cosine function is enriched by category information. Using a geometric construction method instead of back-propagation, we create large N-ball embeddings that satisfy two conditions: (1) category trees are precisely imposed onto word embeddings at zero energy cost; (2) pre-trained word embeddings are well preserved. A new benchmark data set is created for validating the category of unknown words. 
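The "Manifold Mixup" record above interpolates hidden states of a randomly chosen layer together with the corresponding labels. Below is a minimal numpy sketch of that interpolation step under assumed choices (a tiny two-layer network, a Beta(alpha, alpha) mixing coefficient, and a uniformly sampled mixing layer); it is not the authors' code.

```python
import numpy as np

rng = np.random.default_rng(0)

def relu(x):
    return np.maximum(x, 0.0)

def manifold_mixup_loss(x, y_onehot, W1, W2, alpha=2.0):
    """Forward pass that mixes the states of a shuffled batch copy at a random layer."""
    lam = rng.beta(alpha, alpha)
    perm = rng.permutation(len(x))
    mix_layer = rng.integers(0, 2)            # 0: mix inputs, 1: mix first hidden layer

    if mix_layer == 0:
        x = lam * x + (1.0 - lam) * x[perm]
    h = relu(x @ W1)
    if mix_layer == 1:
        h = lam * h + (1.0 - lam) * h[perm]

    logits = h @ W2
    y_mix = lam * y_onehot + (1.0 - lam) * y_onehot[perm]

    # Softmax cross-entropy against the mixed labels.
    logits = logits - logits.max(axis=1, keepdims=True)
    log_p = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    return -(y_mix * log_p).sum(axis=1).mean()

# Toy usage on random data.
x = rng.normal(size=(8, 16))
y = np.eye(3)[rng.integers(0, 3, size=8)]
loss = manifold_mixup_loss(x, y, rng.normal(size=(16, 32)), rng.normal(size=(32, 3)))
```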
Experiments show that N-ball embeddings, carrying category information, significantly outperform word embeddings in the test of nearest neighborhoods, and demonstrate surprisingly good performance in validating categories of unknown words. Source codes and data-sets are free for public access \\url{https://github.com/gnodisnait/nball4tree.git} and \\url{https://github.com/gnodisnait/bp94nball.git}. ", "keywords": "category tree;word-embeddings;geometry", "primary_area": "", "supplementary_material": "", "author": "Tiansi Dong;Chrisitan Bauckhage;Hailong Jin;Juanzi Li;Olaf Cremers;Daniel Speicher;Armin B. Cremers;Joerg Zimmermann", "authorids": "tian1shi2@gmail.com;christian.bauckhage@iais.fraunhofer.de;jinhl15@mails.tsinghua.edu.cn;lijuanzi2008@gmail.com;cremerso@iai.uni-bonn.de;dsp@bit.uni-bonn.de;abc@iai.uni-bonn.de;jz@bit.uni-bonn.de", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\ndong2018encoding,\ntitle={Encoding Category Trees Into Word-Embeddings Using Geometric Approach},\nauthor={Tiansi Dong and Olaf Cremers and Hailong Jin and Juanzi Li and Chrisitan Bauckhage and Armin B. Cremers and Daniel Speicher and Joerg Zimmermann},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlWOj0qF7},\n}", "github": "[![github](/images/github_icon.svg) gnodisnait/nball4tree](https://github.com/gnodisnait/nball4tree) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rJlWOj0qF7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;4", "wc_review": "113;438;199", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;152;0", "reply_reviewers": "0;0;0", "reply_authors": "0;3;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 250.0, 137.49424230369308 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 50.666666666666664, 71.65348716023682 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 1.4142135623730951 ], "replies_avg": [ 38, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13347660437088281744&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rJlWOj0qF7", "pdf": "https://openreview.net/pdf?id=rJlWOj0qF7", "email": ";;;;;;;", "author_num": 8 }, { "id": "rJl_NhR9K7", "title": "ISA-VAE: Independent Subspace Analysis with Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "We present structured priors for unsupervised learning of disentangled representations in VAEs that significantly mitigate the trade-off between disentanglement and reconstruction loss.", "abstract": "Recent work has shown increased interest in using the Variational Autoencoder (VAE) framework to discover interpretable representations of data in an unsupervised way. These methods have focussed largely on modifying the variational cost function to achieve this goal. 
However, we show that methods like beta-VAE simplify the tendency of variational inference to underfit causing pathological over-pruning and over-orthogonalization of learned components. In this paper we take a complementary approach: to modify the probabilistic model to encourage structured latent variable representations to be discovered. Specifically, the standard VAE probabilistic model is unidentifiable: the likelihood of the parameters is invariant under rotations of the latent space. This means there is no pressure to identify each true factor of variation with a latent variable.\nWe therefore employ a rich prior distribution, akin to the ICA model, that breaks the rotational symmetry.\nExtensive quantitative and qualitative experiments demonstrate that the proposed prior mitigates the trade-off introduced by modified cost functions like beta-VAE and TCVAE between reconstruction loss and disentanglement. The proposed prior allows to improve these approaches with respect to both disentanglement and reconstruction quality significantly over the state of the art.", "keywords": "representation learning;disentanglement;interpretability;variational autoencoders", "primary_area": "", "supplementary_material": "", "author": "Jan St\u00fchmer;Richard Turner;Sebastian Nowozin", "authorids": "t-jastuh@microsoft.com;ret26@cam.ac.uk;senowozi@microsoft.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nst\u00fchmer2019isavae,\ntitle={{ISA}-{VAE}: Independent Subspace Analysis with Variational Autoencoders},\nauthor={Jan St\u00fchmer and Richard Turner and Sebastian Nowozin},\nyear={2019},\nurl={https://openreview.net/forum?id=rJl_NhR9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=rJl_NhR9K7", "pdf_size": 0, "rating": "4;4;7", "confidence": "5;3;4", "wc_review": "520;241;199", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "836;535;144", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 320.0, 142.45701105947717 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 505.0, 283.3031356456661 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4386100647852071391&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJlcV2Actm", "title": "MahiNet: A Neural Network for Many-Class Few-Shot Learning with Class Hierarchy", "track": "main", "status": "Reject", "tldr": "A memory-augmented neural network that addresses many-class few-shot problem by leveraging class hierarchy in both supervised learning and meta-learning.", "abstract": "We study many-class few-shot (MCFS) problem in both supervised learning and meta-learning scenarios. Compared to the well-studied many-class many-shot and few-class few-shot problems, MCFS problem commonly occurs in practical applications but is rarely studied. MCFS brings new challenges because it needs to distinguish between many classes, but only a few samples per class are available for training. In this paper, we propose ``memory-augmented hierarchical-classification network (MahiNet)'' for MCFS learning. 
It addresses the ``many-class'' problem by exploring the class hierarchy, e.g., the coarse-class label that covers a subset of fine classes, which helps to narrow down the candidates for the fine class and is cheaper to obtain. MahiNet uses a convolutional neural network (CNN) to extract features, and integrates a memory-augmented attention module with a multi-layer perceptron (MLP) to produce the probabilities over coarse and fine classes. While the MLP extends the linear classifier, the attention module extends a KNN classifier, both together targeting the ''`few-shot'' problem. We design different training strategies of MahiNet for supervised learning and meta-learning. Moreover, we propose two novel benchmark datasets ''mcfsImageNet'' (as a subset of ImageNet) and ''mcfsOmniglot'' (re-splitted Omniglot) specifically for MCFS problem. In experiments, we show that MahiNet outperforms several state-of-the-art models on MCFS classification tasks in both supervised learning and meta-learning scenarios.", "keywords": "deep learning;many-class few-shot;class hierarchy;meta learning", "primary_area": "", "supplementary_material": "", "author": "Lu Liu;Tianyi Zhou;Guodong Long;Jing Jiang;Chengqi Zhang", "authorids": "lu.liu.cs.uts@gmail.com;tianyizh@uw.edu;guodong.long@uts.edu.au;jing.jiang@uts.edu.au;chengqi.zhang@uts.edu.au", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nliu2019mahinet,\ntitle={MahiNet: A Neural Network for Many-Class Few-Shot Learning with Class Hierarchy},\nauthor={Lu Liu and Tianyi Zhou and Guodong Long and Jing Jiang and Chengqi Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlcV2Actm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJlcV2Actm", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;3", "wc_review": "364;414;175", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "858;951;981", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 317.6666666666667, 102.9249996626454 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 930.0, 52.364109846344185 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:7k7DF-KbmrAJ:scholar.google.com/&scioq=MahiNet:+A+Neural+Network+for+Many-Class+Few-Shot+Learning+with+Class+Hierarchy&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Two-Timescale Networks for Nonlinear Value Function Approximation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1021", "id": "rJleN20qK7", "author_site": "Wesley Chung, Somjit Nath, Ajin Joseph, Martha White", "tldr": "We propose an architecture for learning value functions which allows the use of any linear policy evaluation algorithm in tandem with nonlinear feature learning.", "abstract": "A key component for many reinforcement learning agents is to learn a value function, either for policy evaluation or control. Many of the algorithms for learning values, however, are designed for linear function approximation---with a fixed basis or fixed representation. 
Though there have been a few sound extensions to nonlinear function approximation, such as nonlinear gradient temporal difference learning, these methods have largely not been adopted, eschewed in favour of simpler but not sound methods like temporal difference learning and Q-learning. In this work, we provide a two-timescale network (TTN) architecture that enables linear methods to be used to learn values, with a nonlinear representation learned at a slower timescale. The approach facilitates the use of algorithms developed for the linear setting, such as data-efficient least-squares methods, eligibility traces and the myriad of recently developed linear policy evaluation algorithms, to provide nonlinear value estimates. We prove convergence for TTNs, with particular care given to ensure convergence of the fast linear component under potentially dependent features provided by the learned representation. We empirically demonstrate the benefits of TTNs, compared to other nonlinear value function approximation algorithms, both for policy evaluation and control. ", "keywords": "Reinforcement learning;policy evaluation;nonlinear function approximation", "primary_area": "", "supplementary_material": "", "author": "Wesley Chung;Somjit Nath;Ajin Joseph;Martha White", "authorids": "wchung@ualberta.ca;somjit@ualberta.ca;ajoseph@ualberta.ca;whitem@ualberta.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchung2018twotimescale,\ntitle={Two-Timescale Networks for Nonlinear Value Function Approximation},\nauthor={Wesley Chung and Somjit Nath and Ajin Joseph and Martha White},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJleN20qK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "6;6;6;7", "confidence": "4;4;4;4", "wc_review": "435;517;707;320", "wc_reply_reviewers": "43;88;0;23", "wc_reply_authors": "505;307;276;310", "reply_reviewers": "1;1;0;1", "reply_authors": "1;1;1;1", "rating_avg": [ 6.25, 0.4330127018922193 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 494.75, 141.11409390985722 ], "wc_reply_reviewers_avg": [ 38.5, 32.37668914512415 ], "wc_reply_authors_avg": [ 349.5, 90.75929704443507 ], "reply_reviewers_avg": [ 0.75, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3127516683890133284&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rJleN20qK7", "pdf": "https://openreview.net/pdf?id=rJleN20qK7", "email": ";;;", "author_num": 4 }, { "id": "rJlfzhA9Y7", "title": "Distributed Deep Policy Gradient for Competitive Adversarial Environment", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "This work considers the problem of cooperative learners in partially observable, stochastic environment, receiving feedback in the form of joint reward. The paper presents a flexible multi-agent competitive environment for online training and direct policy performance comparison. 
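The "Two-Timescale Networks" record above pairs a fast linear value estimator with a slowly adapted nonlinear representation. The sketch below is a heavily simplified, hypothetical version: linear TD(0) on the fast timescale and a small gradient step on a placeholder feature-learning loss on the slow timescale; the paper's actual feature objective and convergence conditions are not reproduced here.

```python
import numpy as np

rng = np.random.default_rng(0)

def features(theta, s):
    return np.tanh(theta @ s)                 # nonlinear representation phi(s; theta)

def ttn_update(theta, w, s, r, s_next, gamma=0.99, lr_fast=0.1, lr_slow=1e-3):
    """One two-timescale update: fast linear TD(0) head, slow feature parameters."""
    phi, phi_next = features(theta, s), features(theta, s_next)

    # Fast timescale: linear TD(0) treating the current features as fixed.
    delta = r + gamma * w @ phi_next - w @ phi
    w = w + lr_fast * delta * phi

    # Slow timescale: tiny gradient step on a placeholder representation loss
    # 0.5 * ||phi(s) - stop_grad(phi(s'))||^2, differentiated by hand for tanh.
    err = phi - phi_next                      # phi_next treated as a fixed target
    grad_theta = (err * (1.0 - phi ** 2))[:, None] * s[None, :]
    theta = theta - lr_slow * grad_theta
    return theta, w

theta, w = rng.normal(size=(8, 4)), np.zeros(8)
s, s_next = rng.normal(size=4), rng.normal(size=4)
theta, w = ttn_update(theta, w, s, r=1.0, s_next=s_next)
```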
This forms a formal problem of a multi-agent Reinforcement Learning (RL) under partial observability, where the goal is to maximize the score performance measured in a direct confrontation. To address the complexity of the problem we propose a distributed deep stochastic policy gradient with individual observations, experience replay, policy transfer, and self-play.", "keywords": "multi-agent;partially observable;reinforcement learning;deepRL;self play;competitive environment", "primary_area": "", "supplementary_material": "", "author": "Denis Osipychev;Girish Chowdhary", "authorids": "deniso2@illinois.edu;girishc@illinois.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rJlfzhA9Y7", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;3;4", "wc_review": "114;439;179", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 244.0, 140.41604846550365 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:UfM0B2RJAFkJ:scholar.google.com/&scioq=Distributed+Deep+Policy+Gradient+for+Competitive+Adversarial+Environment&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rJlg1n05YX", "title": "Penetrating the Fog: the Path to Efficient CNN Models", "track": "main", "status": "Reject", "tldr": "We are the first in the field to show how to craft an effective sparse kernel design from three aspects: composition, performance and efficiency.", "abstract": "With the increasing demand to deploy convolutional neural networks (CNNs) on mobile platforms, the sparse kernel approach was proposed, which could save more parameters than the standard convolution while maintaining accuracy. However, despite the great potential, no prior research has pointed out how to craft an sparse kernel design with such potential (i.e., effective design), and all prior works just adopt simple combinations of existing sparse kernels such as group convolution. Meanwhile due to the large design space it is also impossible to try all combinations of existing sparse kernels. In this paper, we are the first in the field to consider how to craft an effective sparse kernel design by eliminating the large design space. Specifically, we present a sparse kernel scheme to illustrate how to reduce the space from three aspects. First, in terms of composition we remove designs composed of repeated layers. Second, to remove designs with large accuracy degradation, we find an unified property named~\\emph{information field} behind various sparse kernel designs, which could directly indicate the final accuracy. Last, we remove designs in two cases where a better parameter efficiency could be achieved. Additionally, we provide detailed efficiency analysis on the final 4 designs in our scheme. 
Experimental results validate the idea of our scheme by showing that our scheme is able to find designs which are more efficient in using parameters and computation with similar or higher accuracy.", "keywords": "Efficient CNN models;Computer Vision", "primary_area": "", "supplementary_material": "", "author": "Kun Wan;Boyuan Feng;Shu Yang;Yufei Ding", "authorids": "kun@cs.ucsb.edu;boyuan@cs.ucsb.edu;shuyang1995@ucsb.edu;yufeiding@cs.ucsb.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwan2019penetrating,\ntitle={Penetrating the Fog: the Path to Efficient {CNN} Models},\nauthor={Kun Wan and Boyuan Feng and Shu Yang and Yufei Ding},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlg1n05YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJlg1n05YX", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;3;3", "wc_review": "775;241;90", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 368.6666666666667, 293.8597549096432 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uG3ZNROun78J:scholar.google.com/&scioq=Penetrating+the+Fog:+the+Path+to+Efficient+CNN+Models&hl=en&as_sdt=0,5", "gs_version_total": 3 }, { "title": "Diversity-Sensitive Conditional Generative Adversarial Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1080", "id": "rJliMh09F7", "author_site": "Dingdong Yang, Seunghoon Hong, Yunseok Jang, Tianchen Zhao, Honglak Lee", "tldr": "We propose a simple and general approach that avoids a mode collapse problem in various conditional GANs.", "abstract": "We propose a simple yet highly effective method that addresses the mode-collapse problem in the Conditional Generative Adversarial Network (cGAN). Although conditional distributions are multi-modal (i.e., having many modes) in practice, most cGAN approaches tend to learn an overly simplified distribution where an input is always mapped to a single output regardless of variations in latent code. To address such issue, we propose to explicitly regularize the generator to produce diverse outputs depending on latent codes. The proposed regularization is simple, general, and can be easily integrated into most conditional GAN objectives. Additionally, explicit regularization on generator allows our method to control a balance between visual quality and diversity. We demonstrate the effectiveness of our method on three conditional generation tasks: image-to-image translation, image inpainting, and future video prediction. 
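The "Diversity-Sensitive Conditional Generative Adversarial Networks" record above regularizes the generator so that its output varies with the latent code. One commonly used form of such a term penalizes small output change relative to latent change between two sampled codes; the numpy sketch below illustrates that idea with a toy linear "generator" and a capped ratio, which are assumptions rather than the paper's exact objective.

```python
import numpy as np

rng = np.random.default_rng(0)

def diversity_regularizer(gen, x, z1, z2, tau=10.0):
    """Loss term to add to the generator objective.

    It is small (more negative) when ||G(x,z1) - G(x,z2)|| is large relative to
    ||z1 - z2||, capped at tau so the generator is not pushed to extremes.
    """
    out_gap = np.abs(gen(x, z1) - gen(x, z2)).mean()
    z_gap = np.abs(z1 - z2).mean() + 1e-8
    return -min(out_gap / z_gap, tau)

# Toy "conditional generator": a fixed random linear map of the input and the code.
Wx, Wz = rng.normal(size=(5, 3)), rng.normal(size=(5, 2))
gen = lambda x, z: Wx @ x + Wz @ z

x = rng.normal(size=3)
reg = diversity_regularizer(gen, x, rng.normal(size=2), rng.normal(size=2))
```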
We show that simple addition of our regularization to existing models leads to surprisingly diverse generations, substantially outperforming the previous approaches for multi-modal conditional generation specifically designed in each individual task.", "keywords": "Conditional Generative Adversarial Network;mode-collapse;multi-modal generation;image-to-image translation;image in-painting;video prediction", "primary_area": "", "supplementary_material": "", "author": "Dingdong Yang;Seunghoon Hong;Yunseok Jang;Tianchen Zhao;Honglak Lee", "authorids": "didoyang@umich.edu;hongseu@umich.edu;yunseokj@umich.edu;ericolon@umich.edu;honglak@eecs.umich.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nyang2018diversitysensitive,\ntitle={Diversity-Sensitive Conditional Generative Adversarial Networks},\nauthor={Dingdong Yang and Seunghoon Hong and Yunseok Jang and Tiangchen Zhao and Honglak Lee},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJliMh09F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;5;3", "wc_review": "318;124;284", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1443;202;467", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 242.0, 84.58526270377521 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 704.0, 533.6334572219649 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 253, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=154904573992759731&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rJliMh09F7", "pdf": "https://openreview.net/pdf?id=rJliMh09F7", "email": ";;;;", "author_num": 5 }, { "title": "Query-Efficient Hard-label Black-box Attack: An Optimization-based Approach", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/850", "id": "rJlk6iRqKX", "author_site": "Minhao Cheng, Thong M Le, Pin-Yu Chen, Huan Zhang, Jinfeng Yi, Cho-Jui Hsieh", "tldr": "", "abstract": "We study the problem of attacking machine learning models in the hard-label black-box setting, where no model information is revealed except that the attacker can make queries to probe the corresponding hard-label decisions. This is a very challenging problem since the direct extension of state-of-the-art white-box attacks (e.g., C&W or PGD) to the hard-label black-box setting will require minimizing a non-continuous step function, which is combinatorial and cannot be solved by a gradient-based optimizer. The only two current approaches are based on random walk on the boundary (Brendel et al., 2017) and random trials to evaluate the loss function (Ilyas et al., 2018), which require lots of queries and lacks convergence guarantees. \nWe propose a novel way to formulate the hard-label black-box attack as a real-valued optimization problem which is usually continuous and can be solved by any zeroth order optimization algorithm. 
For example, using the Randomized Gradient-Free method (Nesterov & Spokoiny, 2017), we are able to bound the number of iterations needed for our algorithm to achieve stationary points under mild assumptions. We demonstrate that our proposed method outperforms the previous stochastic approaches to attacking convolutional neural networks on MNIST, CIFAR, and ImageNet datasets. More interestingly, we show that the proposed algorithm can also be used to attack other discrete and non-continuous machine learning models, such as Gradient Boosting Decision Trees (GBDT).", "keywords": "Adversarial example;Hard-label;Black-box attack;Query-efficient", "primary_area": "", "supplementary_material": "", "author": "Minhao Cheng;Thong Le;Pin-Yu Chen;Huan Zhang;JinFeng Yi;Cho-Jui Hsieh", "authorids": "mhcheng@ucla.edu;thmle@ucdavis.edu;pin-yu.chen@ibm.com;huan@huan-zhang.com;yijinfeng@jd.com;chohsieh@cs.ucla.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\ncheng2018queryefficient,\ntitle={Query-Efficient Hard-label Black-box Attack: An Optimization-based Approach},\nauthor={Minhao Cheng and Thong Le and Pin-Yu Chen and Huan Zhang and JinFeng Yi and Cho-Jui Hsieh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlk6iRqKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;3", "wc_review": "252;575;124", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "324;171;108", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 317.0, 189.77003627197487 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 201.0, 90.69729874698585 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 520, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5116459169179417425&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "openreview": "https://openreview.net/forum?id=rJlk6iRqKX", "pdf": "https://openreview.net/pdf?id=rJlk6iRqKX", "email": ";;;;;", "author_num": 6 }, { "title": "Rethinking the Value of Network Pruning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/877", "id": "rJlnB3C5Ym", "author_site": "Zhuang Liu, Mingjie Sun, Tinghui Zhou, Gao Huang, Trevor Darrell", "tldr": "In structured network pruning, fine-tuning a pruned model only gives comparable performance with training it from scratch.", "abstract": "Network pruning is widely used for reducing the heavy inference cost of deep models in low-resource settings. A typical pruning algorithm is a three-stage pipeline, i.e., training (a large model), pruning and fine-tuning. During pruning, according to a certain criterion, redundant weights are pruned and important weights are kept to best preserve the accuracy. In this work, we make several surprising observations which contradict common beliefs. For all state-of-the-art structured pruning algorithms we examined, fine-tuning a pruned model only gives comparable or worse performance than training that model with randomly initialized weights. 
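The "Query-Efficient Hard-label Black-box Attack" record above recasts the attack as minimizing a real-valued, query-only objective with a zeroth-order method such as the cited Randomized Gradient-Free approach. The sketch below shows a generic random-direction finite-difference gradient estimate on a stand-in quadratic objective; in the attack setting the objective would instead be evaluated through hard-label model queries.

```python
import numpy as np

rng = np.random.default_rng(0)

def rgf_gradient_estimate(g, theta, mu=1e-3, n_dirs=10):
    """Average finite differences of g along random unit directions."""
    est = np.zeros_like(theta)
    for _ in range(n_dirs):
        u = rng.normal(size=theta.shape)
        u /= np.linalg.norm(u)
        est += (g(theta + mu * u) - g(theta)) / mu * u
    return est / n_dirs

# Stand-in smooth objective; a real attack would query the model to evaluate g.
g = lambda t: float(np.sum((t - 1.0) ** 2))

theta = np.zeros(6)
for _ in range(200):
    theta -= 0.05 * rgf_gradient_estimate(g, theta)
# theta approaches the minimizer (all ones) using only function evaluations.
```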
For pruning algorithms which assume a predefined target network architecture, one can get rid of the full pipeline and directly train the target network from scratch. Our observations are consistent for multiple network architectures, datasets, and tasks, which imply that: 1) training a large, over-parameterized model is often not necessary to obtain an efficient final model, 2) learned ``important'' weights of the large model are typically not useful for the small pruned model, 3) the pruned architecture itself, rather than a set of inherited ``important'' weights, is more crucial to the efficiency in the final model, which suggests that in some cases pruning can be useful as an architecture search paradigm. Our results suggest the need for more careful baseline evaluations in future research on structured pruning methods. We also compare with the \"Lottery Ticket Hypothesis\" (Frankle & Carbin 2019), and find that with optimal learning rate, the \"winning ticket\" initialization as used in Frankle & Carbin (2019) does not bring improvement over random initialization.", "keywords": "network pruning;network compression;architecture search;train from scratch", "primary_area": "", "supplementary_material": "", "author": "Zhuang Liu;Mingjie Sun;Tinghui Zhou;Gao Huang;Trevor Darrell", "authorids": "zhuangl@berkeley.edu;sunmj15@gmail.com;tinghuiz@eecs.berkeley.edu;gaohuang.thu@gmail.com;trevor@eecs.berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nliu2018rethinking,\ntitle={Rethinking the Value of Network Pruning},\nauthor={Zhuang Liu and Mingjie Sun and Tinghui Zhou and Gao Huang and Trevor Darrell},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlnB3C5Ym},\n}", "github": "[![github](/images/github_icon.svg) Eric-mingjie/rethinking-network-pruning](https://github.com/Eric-mingjie/rethinking-network-pruning) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rJlnB3C5Ym)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "5;4;5", "wc_review": "540;498;1610", "wc_reply_reviewers": "25;102;517", "wc_reply_authors": "1693;1134;3556", "reply_reviewers": "1;1;1", "reply_authors": "3;3;7", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 882.6666666666666, 514.5880769012132 ], "wc_reply_reviewers_avg": [ 214.66666666666666, 216.08074622438923 ], "wc_reply_authors_avg": [ 2127.6666666666665, 1035.4459050197756 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 4.333333333333333, 1.8856180831641267 ], "replies_avg": [ 65, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 1960, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3601827758437367761&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rJlnB3C5Ym", "pdf": "https://openreview.net/pdf?id=rJlnB3C5Ym", "email": ";;;;", "author_num": 5 }, { "id": "rJlpUiAcYX", "title": "Holographic and other Point Set Distances for Machine Learning", "track": "main", "status": "Reject", "tldr": "Permutation-invariant loss function for point set prediction.", "abstract": 
"We introduce an analytic distance function for moderately sized point sets of known cardinality that is shown to have very desirable properties, both as a loss function as well as a regularizer for machine learning applications. We compare our novel construction to other point set distance functions and show proof of concept experiments for training neural networks end-to-end on point set prediction tasks such as object detection.", "keywords": "point set;set;permutation-invariant;loss function", "primary_area": "", "supplementary_material": "", "author": "Lukas Balles;Thomas Fischbacher", "authorids": "lukas.balles@tuebingen.mpg.de;tfish@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nballes2019holographic,\ntitle={Holographic and other Point Set Distances for Machine Learning},\nauthor={Lukas Balles and Thomas Fischbacher},\nyear={2019},\nurl={https://openreview.net/forum?id=rJlpUiAcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJlpUiAcYX", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;3;3", "wc_review": "275;166;206", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 1.699673171197595 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 215.66666666666666, 45.02098276236192 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.6933752452815364, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4748319039997752116&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rJxA-h05KQ", "title": "Inhibited Softmax for Uncertainty Estimation in Neural Networks", "track": "main", "status": "Withdraw", "tldr": "Uncertainty estimation in a single forward pass without additional learnable parameters.", "abstract": "We present a new method for uncertainty estimation and out-of-distribution detection in neural networks with softmax output. We extend softmax layer with an additional constant input. The corresponding additional output is able to represent the uncertainty of the network. The proposed method requires neither additional parameters nor multiple forward passes nor input preprocessing nor out-of-distribution datasets. 
We show that our method performs comparably to more computationally expensive methods and outperforms baselines on our experiments from image recognition and sentiment analysis domains.", "keywords": "uncertainty estimation;out-of-distribution detection;inhibited softmax", "primary_area": "", "supplementary_material": "", "author": "Marcin Mo\u017cejko;Mateusz Susik;Rafa\u0142 Karczewski", "authorids": "marcin@sigmoidal.io;msusik@sigmoidal.io;rafal@sigmoidal.io", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJxA-h05KQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;4", "wc_review": "1033;649;212", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 631.3333333333334, 335.4045649990802 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2208599364450746780&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "rJxF73R9tX", "title": "Knows When it Doesn\u2019t Know: Deep Abstaining Classifiers", "track": "main", "status": "Reject", "tldr": "A deep abstaining neural network trained with a novel loss function that learns representations for when to abstain enabling robust learning in the presence of different types of noise.", "abstract": "We introduce the deep abstaining classifier -- a deep neural network trained with a novel loss function that provides an abstention option during training. This allows the DNN to abstain on confusing or difficult-to-learn examples while improving performance on the non-abstained samples. We show that such deep abstaining classifiers can: (i) learn representations for structured noise -- where noisy training labels or confusing examples are correlated with underlying features -- and then learn to abstain based on such features; (ii) enable robust learning in the presence of arbitrary or unstructured noise by identifying noisy samples; and (iii) be used as an effective out-of-category detector that learns to reliably abstain when presented with samples from unknown classes. 
We provide analytical results on loss function behavior that enable automatic tuning of accuracy and coverage, and demonstrate the utility of the deep abstaining classifier using multiple image benchmarks, Results indicate significant improvement in learning in the presence of label noise.", "keywords": "deep learning;robust learning;abstention;representation learning;abstaining classifier;open-set detection", "primary_area": "", "supplementary_material": "", "author": "Sunil Thulasidasan;Tanmoy Bhattacharya;Jeffrey Bilmes;Gopinath Chennupati;Jamal Mohd-Yusof", "authorids": "sunil@lanl.gov;tanmoy@lanl.gov;bilmes@uw.edu;gchennupati@lanl.gov;jamal@lanl.gov", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nthulasidasan2019knows,\ntitle={Knows When it Doesn\u2019t Know: Deep Abstaining Classifiers},\nauthor={Sunil Thulasidasan and Tanmoy Bhattacharya and Jeffrey Bilmes and Gopinath Chennupati and Jamal Mohd-Yusof},\nyear={2019},\nurl={https://openreview.net/forum?id=rJxF73R9tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJxF73R9tX", "pdf_size": 0, "rating": "5;5;6", "confidence": "5;3;4", "wc_review": "455;254;273", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "330;310;205", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 327.3333333333333, 90.60659774848384 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 281.6666666666667, 54.82294734466176 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16300246619313751187&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "title": "Hyperbolic Attention Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/805", "id": "rJxHsjRqFQ", "author_site": "Caglar Gulcehre, Misha Denil, Mateusz Malinowski, Ali Razavi, Razvan Pascanu, Karl Moritz Hermann, Victor Bapst, Victor Bapst, Adam Santoro, Nando de Freitas", "tldr": "We propose to incorporate inductive biases and operations coming from hyperbolic geometry to improve the attention mechanism of the neural networks.", "abstract": "Recent approaches have successfully demonstrated the benefits of learning the parameters of shallow networks in hyperbolic space. We extend this line of work by imposing hyperbolic geometry on the embeddings used to compute the ubiquitous attention mechanisms for different neural networks architectures. By only changing the geometry of embedding of object representations, we can use the embedding space more efficiently without increasing the number of parameters of the model. Mainly as the number of objects grows exponentially for any semantic distance from the query, hyperbolic geometry --as opposed to Euclidean geometry-- can encode those objects without having any interference. 
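The "Hyperbolic Attention Networks" record above changes only the geometry of the embeddings used by attention. As a hedged illustration, the sketch below derives attention weights from negative Poincare-ball distances between clipped query and key vectors; the clipping-based projection and the temperature are assumptions, and the paper also works with the hyperboloid model.

```python
import numpy as np

def to_poincare_ball(x, eps=1e-5):
    """Scale vectors so they lie strictly inside the unit ball."""
    norm = np.linalg.norm(x, axis=-1, keepdims=True)
    scale = np.minimum(1.0, (1.0 - eps) / np.maximum(norm, eps))
    return x * scale

def poincare_distance(u, v, eps=1e-9):
    diff = np.sum((u - v) ** 2, axis=-1)
    uu = 1.0 - np.sum(u ** 2, axis=-1)
    vv = 1.0 - np.sum(v ** 2, axis=-1)
    return np.arccosh(1.0 + 2.0 * diff / np.maximum(uu * vv, eps))

def hyperbolic_attention_weights(queries, keys, beta=1.0):
    """Softmax over negative hyperbolic distances instead of dot products."""
    q = to_poincare_ball(queries)[:, None, :]   # (n_q, 1, d)
    k = to_poincare_ball(keys)[None, :, :]      # (1, n_k, d)
    scores = -beta * poincare_distance(q, k)    # (n_q, n_k)
    scores -= scores.max(axis=-1, keepdims=True)
    w = np.exp(scores)
    return w / w.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(0)
weights = hyperbolic_attention_weights(rng.normal(size=(2, 4)), rng.normal(size=(5, 4)))
```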
Our method shows improvements in generalization on neural machine translation on WMT'14 (English to German), learning on graphs (both on synthetic and real-world graph tasks) and visual question answering (CLEVR) tasks while keeping the neural representations compact.", "keywords": "Hyperbolic Geometry;Attention Methods;Reasoning on Graphs;Relation Learning;Scale Free Graphs;Transformers;Power Law", "primary_area": "", "supplementary_material": "", "author": "Caglar Gulcehre;Misha Denil;Mateusz Malinowski;Ali Razavi;Razvan Pascanu;Karl Moritz Hermann;Peter Battaglia;Victor Bapst;David Raposo;Adam Santoro;Nando de Freitas", "authorids": "ca9lar@gmail.com;;mateuszm@google.com;alirazavi@google.com;;;;;;adamsantoro@google.com;nandodefreitas@google.com", "gender": ";;;;;;;;;;", "homepage": ";;;;;;;;;;", "dblp": ";;;;;;;;;;", "google_scholar": ";;;;;;;;;;", "orcid": ";;;;;;;;;;", "linkedin": ";;;;;;;;;;", "or_profile": ";;;;;;;;;;", "aff": ";;;;;;;;;;", "aff_domain": ";;;;;;;;;;", "position": ";;;;;;;;;;", "bibtex": "@inproceedings{\ngulcehre2018hyperbolic,\ntitle={Hyperbolic Attention Networks},\nauthor={Caglar Gulcehre and Misha Denil and Mateusz Malinowski and Ali Razavi and Razvan Pascanu and Karl Moritz Hermann and Peter Battaglia and Victor Bapst and David Raposo and Adam Santoro and Nando de Freitas},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJxHsjRqFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;5", "wc_review": "562;119;264", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "577;135;277", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 315.0, 184.41438844804563 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 329.6666666666667, 184.24862429759293 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 11, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 302, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1595119013035173525&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rJxHsjRqFQ", "pdf": "https://openreview.net/pdf?id=rJxHsjRqFQ", "email": ";;;;;;;;;;", "author_num": 11 }, { "id": "rJxMM2C5K7", "title": "Nested Dithered Quantization for Communication Reduction in Distributed Training", "track": "main", "status": "Reject", "tldr": "The paper proposes and analyzes two quantization schemes for communicating Stochastic Gradients in distributed learning which would reduce communication costs compare to the state of the art while maintaining the same accuracy. ", "abstract": "In distributed training, the communication cost due to the transmission of gradients\nor the parameters of the deep model is a major bottleneck in scaling up the number\nof processing nodes. To address this issue, we propose dithered quantization for\nthe transmission of the stochastic gradients and show that training with Dithered\nQuantized Stochastic Gradients (DQSG) is similar to the training with unquantized\nSGs perturbed by an independent bounded uniform noise, in contrast to the other\nquantization methods where the perturbation depends on the gradients and hence,\ncomplicating the convergence analysis. 
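The "Nested Dithered Quantization" record above builds on dithered quantization of stochastic gradients, whose quantization error behaves like independent bounded uniform noise. The sketch below shows plain subtractive dithered quantization with a shared seed standing in for the common randomness between worker and server; the nested scheme itself and any clipping or step-size selection are omitted.

```python
import numpy as np

def dithered_quantize(grad, step, seed):
    """Worker side: add shared uniform dither, then round onto the lattice."""
    dither = np.random.default_rng(seed).uniform(-step / 2, step / 2, size=grad.shape)
    return np.round((grad + dither) / step)          # integer codes to transmit

def dithered_dequantize(codes, step, seed):
    """Server side: rebuild the same dither from the seed and subtract it."""
    dither = np.random.default_rng(seed).uniform(-step / 2, step / 2, size=codes.shape)
    return codes * step - dither

rng = np.random.default_rng(1)
g = rng.normal(size=1000)
step, seed = 0.1, 42
g_hat = dithered_dequantize(dithered_quantize(g, step, seed), step, seed)
err = g_hat - g
# With subtractive dither the reconstruction error is uniform on [-step/2, step/2]
# and statistically independent of the gradient values.
```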
We study the convergence of training\nalgorithms using DQSG and the trade off between the number of quantization\nlevels and the training time. Next, we observe that there is a correlation among the\nSGs computed by workers that can be utilized to further reduce the communication\noverhead without any performance loss. Hence, we develop a simple yet effective\nquantization scheme, nested dithered quantized SG (NDQSG), that can reduce the\ncommunication significantly without requiring the workers communicating extra\ninformation to each other. We prove that although NDQSG requires significantly\nless bits, it can achieve the same quantization variance bound as DQSG. Our\nsimulation results confirm the effectiveness of training using DQSG and NDQSG\nin reducing the communication bits or the convergence time compared to the\nexisting methods without sacrificing the accuracy of the trained model.", "keywords": "machine learning;distributed training;dithered quantization;nested quantization;distributed compression", "primary_area": "", "supplementary_material": "", "author": "Afshin Abdi;Faramarz Fekri", "authorids": "abdi@ece.gatech.edu;fekri@ece.gatech.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nabdi2019nested,\ntitle={Nested Dithered Quantization for Communication Reduction in Distributed Training},\nauthor={Afshin Abdi and Faramarz Fekri},\nyear={2019},\nurl={https://openreview.net/forum?id=rJxMM2C5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJxMM2C5K7", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;4;4", "wc_review": "54;161;291", "wc_reply_reviewers": "0;48;0", "wc_reply_authors": "224;531;111", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 168.66666666666666, 96.90659878918922 ], "wc_reply_reviewers_avg": [ 16.0, 22.627416997969522 ], "wc_reply_authors_avg": [ 288.6666666666667, 177.45672398894578 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6191228804846061822&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "rJxNAjC5F7", "title": "Learning Hash Codes via Hamming Distance Targets", "track": "main", "status": "Reject", "tldr": "We present a new loss function for training any differentiable model to hash that can vastly improve recall and lookup speed.", "abstract": "We present a powerful new loss function and training scheme for learning binary hash codes with any differentiable model and similarity function.\nOur loss function improves over prior methods by using log likelihood loss on top of an accurate approximation for the probability that two inputs fall within a Hamming distance target.\nOur novel training scheme obtains a good estimate of the true gradient by better sampling inputs and evaluating loss terms between all pairs of inputs in each minibatch.\nTo fully leverage the resulting hashes, we use multi-indexing.\nWe demonstrate that these techniques provide large improvements to a similarity search tasks.\nWe 
report the best results to date on competitive information retrieval tasks for Imagenet and SIFT 1M, improving recall from 73% to 85% and reducing query cost by a factor of 2-8, respectively.", "keywords": "information retrieval;learning to hash;cbir", "primary_area": "", "supplementary_material": "", "author": "Martin Loncaric;Ryan Weber;Bowei Liu", "authorids": "martin@thehive.ai;ryan@thehive.ai;liubowei@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nloncaric2019learning,\ntitle={Learning Hash Codes via Hamming Distance Targets},\nauthor={Martin Loncaric and Ryan Weber and Bowei Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=rJxNAjC5F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJxNAjC5F7", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;3;3", "wc_review": "124;490;607", "wc_reply_reviewers": "0;61;20", "wc_reply_authors": "244;615;324", "reply_reviewers": "0;1;1", "reply_authors": "1;2;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 407.0, 205.73283646515935 ], "wc_reply_reviewers_avg": [ 27.0, 25.39028685672272 ], "wc_reply_authors_avg": [ 394.3333333333333, 159.41629639266984 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11102376738109874566&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rJxXDsCqYX", "title": "Sentence Encoding with Tree-Constrained Relation Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "The meaning of a sentence is a function of the relations that hold between its words. We instantiate this relational view of semantics in a series of neural models based on variants of relation networks (RNs) which represent a set of objects (for us, words forming a sentence) in terms of representations of pairs of objects. We propose two extensions to the basic RN model for natural language. First, building on the intuition that not all word pairs are equally informative about the meaning of a sentence, we use constraints based on both supervised and unsupervised dependency syntax to control which relations influence the representation. Second, since higher-order relations are poorly captured by a sum of pairwise relations, we use a recurrent extension of RNs to propagate information so as to form representations of higher order relations. 
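The "Sentence Encoding with Tree-Constrained Relation Networks" record above represents a sentence through pairwise relations between word vectors, optionally restricted by syntax. The numpy sketch below shows the basic relation-network form, a sum over word pairs of a small MLP, with an optional pair mask standing in for the dependency constraint; dimensions and the MLP are placeholders.

```python
import numpy as np

rng = np.random.default_rng(0)

def relation_network_encoding(words, W1, W2, pair_mask=None):
    """Sum of g([w_i; w_j]) over word pairs; pair_mask can keep only syntactically linked pairs."""
    n, _ = words.shape
    rep = np.zeros(W2.shape[1])
    for i in range(n):
        for j in range(n):
            if i == j or (pair_mask is not None and not pair_mask[i, j]):
                continue
            pair = np.concatenate([words[i], words[j]])
            rep += np.maximum(pair @ W1, 0.0) @ W2    # a tiny MLP g on the pair
    return rep

words = rng.normal(size=(5, 8))                       # 5 word vectors of dimension 8
W1, W2 = rng.normal(size=(16, 32)), rng.normal(size=(32, 20))
sentence_vec = relation_network_encoding(words, W1, W2)
```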
Experiments on sentence classification, sentence pair classification, and machine translation reveal that, while basic RNs are only modestly effective for sentence representation, recurrent RNs with latent syntax are a reliably powerful representational device.", "keywords": "sentence encoder;relation networks;tree;machine translation", "primary_area": "", "supplementary_material": "", "author": "Lei Yu;Cyprien de Masson d'Autume;Chris Dyer;Phil Blunsom;Lingpeng Kong;Wang Ling", "authorids": "leiyu@google.com;cyprien@google.com;cdyer@google.com;pblunsom@google.com;lingpenk@google.com;lingwang@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nyu2019sentence,\ntitle={Sentence Encoding with Tree-Constrained Relation Networks},\nauthor={Lei Yu and Cyprien de Masson d'Autume and Chris Dyer and Phil Blunsom and Lingpeng Kong and Wang Ling},\nyear={2019},\nurl={https://openreview.net/forum?id=rJxXDsCqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rJxXDsCqYX", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;4;4", "wc_review": "782;172;293", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "438;17;91", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 415.6666666666667, 263.70480128776995 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 182.0, 183.5229322637001 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3518630635716378021&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "rJxY_oCqKQ", "title": "A Forensic Representation to Detect Non-Trivial Image Duplicates, and How it Applies to Semantic Segmentation", "track": "main", "status": "Withdraw", "tldr": "A forensic metric to determine if a given image is a copy (with possible manipulation) of another image from a given dataset.", "abstract": "Manipulation and re-use of images in scientific publications is a recurring problem, at present lacking a scalable solution. Existing tools for detecting image duplication are mostly manual or semi-automated, despite the fact that generating data for a learning-based approach is straightforward, as we here illustrate. This paper addresses the problem of determining if, given two images, one is a manipulated version of the other by means of certain geometric and statistical manipulations, e.g. copy, rotation, translation, scale, perspective transform, histogram adjustment, partial erasing, and compression artifacts. We propose a solution based on a 3-branch Siamese Convolutional Neural Network. The ConvNet model is trained to map images into a 128-dimensional space, where the Euclidean distance between duplicate (respectively, unique) images is no greater (respectively, greater) than 1. Our results suggest that such an approach can serve as tool to improve surveillance of the published and in-peer-review literature for image manipulation. 
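The forensic-duplicate record above ("A Forensic Representation to Detect Non-Trivial Image Duplicates...") trains a 3-branch Siamese network so that duplicates fall within Euclidean distance 1 and unrelated images beyond it. The sketch below writes down a triplet-style hinge loss consistent with that stated threshold; the embedding network and training details are omitted, so this is an illustration rather than the paper's loss.

```python
import numpy as np

def duplicate_triplet_loss(anchor, duplicate, unrelated, margin=1.0):
    """Hinge loss pushing d(anchor, duplicate) <= margin and d(anchor, unrelated) >= margin."""
    d_pos = np.linalg.norm(anchor - duplicate, axis=-1)
    d_neg = np.linalg.norm(anchor - unrelated, axis=-1)
    return np.mean(np.maximum(0.0, d_pos - margin) + np.maximum(0.0, margin - d_neg))

rng = np.random.default_rng(0)
# Stand-ins for 128-dimensional embeddings produced by the three Siamese branches.
a, p, n = (rng.normal(size=(4, 128)) for _ in range(3))
loss = duplicate_triplet_loss(a, p, n)
```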
We also show that as a byproduct the network learns useful representations for semantic segmentation, with performance comparable to that of domain-specific models.", "keywords": "metric learning;image similarity;image forensics;siamese network;semantic segmentation", "primary_area": "", "supplementary_material": "", "author": "M. Cicconet;H. Elliott;D.L. Richmond;D. Wainstock;M. Walsh", "authorids": "cicconet@gmail.com;elliott.hunter@gmail.com;daverichmond@gmail.com;daniel_wainstock@hms.harvard.edu;mary_walsh@hms.harvard.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJxY_oCqKQ", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;5;4", "wc_review": "258;223;390", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 290.3333333333333, 71.90889297499226 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:WKirpE_AII8J:scholar.google.com/&scioq=A+Forensic+Representation+to+Detect+Non-Trivial+Image+Duplicates,+and+How+it+Applies+to+Semantic+Segmentation&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rJx_b3RqY7", "title": "AIM: Adversarial Inference by Matching Priors and Conditionals", "track": "main", "status": "Reject", "tldr": "", "abstract": "Effective inference for a generative adversarial model remains an important and challenging problem. We propose a novel approach, Adversarial Inference by Matching priors and conditionals (AIM), which explicitly matches prior and conditional distributions in both data and code spaces, and puts a direct constraint on the dependency structure of the generative model. We derive an equivalent form of the prior and conditional matching objective that can be optimized efficiently without any parametric assumption on the data. We validate the effectiveness of AIM on the MNIST, CIFAR-10, and CelebA datasets by conducting quantitative and qualitative evaluations. 
Results demonstrate that AIM significantly improves both reconstruction and generation as compared to other adversarial inference models.", "keywords": "Generative adversarial network;inference;generative model", "primary_area": "", "supplementary_material": "", "author": "Hanbo Li;Yaqing Wang;Changyou Chen;Jing Gao", "authorids": "alexanderhanboli@gmail.com;yaqingwa@buffalo.edu;cchangyou@gmail.com;jing@buffalo.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2019aim,\ntitle={{AIM}: Adversarial Inference by Matching Priors and Conditionals},\nauthor={Hanbo Li and Yaqing Wang and Changyou Chen and Jing Gao},\nyear={2019},\nurl={https://openreview.net/forum?id=rJx_b3RqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJx_b3RqY7", "pdf_size": 0, "rating": "4;6;7", "confidence": "5;4;4", "wc_review": "462;388;264", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "798;1133;85", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 371.3333333333333, 81.68775237995527 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 672.0, 437.0225013276395 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:b1OmYZeuwqUJ:scholar.google.com/&scioq=AIM:+Adversarial+Inference+by+Matching+Priors+and+Conditionals&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJxcHnRqYQ", "title": "Local Binary Pattern Networks for Character Recognition", "track": "main", "status": "Reject", "tldr": "", "abstract": "Memory and computation efficient deep learning architectures are crucial to the continued proliferation of machine learning capabilities to new platforms and systems. Binarization of operations in convolutional neural networks has shown promising results in reducing the model size and computing efficiency. \nIn this paper, we tackle the character recognition problem using a strategy different from the existing literature by proposing local binary pattern networks or LBPNet that can learn and perform bit-wise operations in an end-to-end fashion. LBPNet uses local binary comparisons and random projection in place of conventional convolution (or approximation of convolution) operations, providing important means to improve memory and speed efficiency that is particularly suited for small footprint devices and hardware accelerators. These operations can be implemented efficiently on different platforms including direct hardware implementation. LBPNet demonstrates its particular advantage on the character classification task where the content is composed of strokes. We applied LBPNet to benchmark datasets like MNIST, SVHN, DHCD, ICDAR, and Chars74K and observed encouraging results.", "keywords": "deep learning;local binary pattern;supervised learning;hardware-friendly", "primary_area": "", "supplementary_material": "", "author": "Jeng-Hau Lin;Yunfan Yang;Rajesh K. 
Gupta;Zhuowen Tu", "authorids": "jel252@ucsd.edu;yuy130@ucsd.edu;rgupta@ucsd.edu;ztu@ucsd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nlin2019local,\ntitle={Local Binary Pattern Networks for Character Recognition},\nauthor={Jeng-Hau Lin and Yunfan Yang and Rajesh K. Gupta and Zhuowen Tu},\nyear={2019},\nurl={https://openreview.net/forum?id=rJxcHnRqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rJxcHnRqYQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;4", "wc_review": "241;136;547", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "321;321;776", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 308.0, 174.35022225394493 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 472.6666666666667, 214.48905695991942 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13238442530340736947&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "FFJORD: Free-Form Continuous Dynamics for Scalable Reversible Generative Models", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/778", "id": "rJxgknCcK7", "author_site": "Will Grathwohl, Tian Qi Chen, Jesse Bettencourt, Ilya Sutskever, David Duvenaud", "tldr": "We use continuous time dynamics to define a generative model with exact likelihoods and efficient sampling that is parameterized by unrestricted neural networks.", "abstract": "A promising class of generative models maps points from a simple distribution to a complex distribution through an invertible neural network. Likelihood-based training of these models requires restricting their architectures to allow cheap computation of Jacobian determinants. Alternatively, the Jacobian trace can be used if the transformation is specified by an ordinary differential equation. In this paper, we use Hutchinson\u2019s trace estimator to give a scalable unbiased estimate of the log-density. The result is a continuous-time invertible generative model with unbiased density estimation and one-pass sampling, while allowing unrestricted neural network architectures. We demonstrate our approach on high-dimensional density estimation, image generation, and variational inference, achieving the state-of-the-art among exact likelihood methods with efficient sampling.", "keywords": "generative models;density estimation;approximate inference;ordinary differential equations", "primary_area": "", "supplementary_material": "", "author": "Will Grathwohl;Ricky T. Q. 
Chen;Jesse Bettencourt;Ilya Sutskever;David Duvenaud", "authorids": "wgrathwohl@cs.toronto.edu;rtqichen@cs.toronto.edu;jessebett@cs.toronto.edu;ilyasu@openai.com;duvenaud@cs.toronto.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ngrathwohl2018scalable,\ntitle={Scalable Reversible Generative Models with Free-form Continuous Dynamics},\nauthor={Will Grathwohl and Ricky T. Q. Chen and Jesse Bettencourt and David Duvenaud},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJxgknCcK7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 7 community implementations](https://paperswithcode.com/paper/?openreview=rJxgknCcK7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "7;7;7", "confidence": "3;4;4", "wc_review": "314;1330;498", "wc_reply_reviewers": "0;0;28", "wc_reply_authors": "406;692;345", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 714.0, 442.00754141379383 ], "wc_reply_reviewers_avg": [ 9.333333333333334, 13.199326582148887 ], "wc_reply_authors_avg": [ 481.0, 151.26356688464895 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1014, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12849237214531885593&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rJxgknCcK7", "pdf": "https://openreview.net/pdf?id=rJxgknCcK7", "email": ";;;;", "author_num": 5 }, { "id": "rJxpuoCqtQ", "title": "Likelihood-based Permutation Invariant Loss Function for Probability Distributions", "track": "main", "status": "Reject", "tldr": "The proposed method, Set Cross Entropy, measures the information-theoretic similarity of sets in a permutation-invariant manner.", "abstract": "We propose a permutation-invariant loss function designed for the neural networks reconstructing a set of elements without considering the order within its vector representation. Unlike popular approaches for encoding and decoding a set, our work does not rely on a carefully engineered network topology nor by any additional sequential algorithm. The proposed method, Set Cross Entropy, has a natural information-theoretic interpretation and is related to the metrics defined for sets. 
We evaluate the proposed approach in two object reconstruction tasks and a rule learning task.", "keywords": "Set reconstruction;maximum likelihood;permutation invariance", "primary_area": "", "supplementary_material": "", "author": "Masataro Asai", "authorids": "masataro.asai@ibm.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nasai2019likelihoodbased,\ntitle={Likelihood-based Permutation Invariant Loss Function for Probability Distributions},\nauthor={Masataro Asai},\nyear={2019},\nurl={https://openreview.net/forum?id=rJxpuoCqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJxpuoCqtQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "wc_review": "271;413;98", "wc_reply_reviewers": "38;222;0", "wc_reply_authors": "605;1156;96", "reply_reviewers": "1;1;0", "reply_authors": "2;3;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 260.6666666666667, 128.8056244458637 ], "wc_reply_reviewers_avg": [ 86.66666666666667, 96.94442852594584 ], "wc_reply_authors_avg": [ 619.0, 432.8564042112195 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:l0uI8UDOG00J:scholar.google.com/&scioq=Likelihood-based+Permutation+Invariant+Loss+Function+for+Probability+Distributions&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rJxug2R9Km", "title": "Meta-Learning for Contextual Bandit Exploration", "track": "main", "status": "Reject", "tldr": "We present a meta-learning algorithm, ME\u0302LE\u0301E, for learning a good exploration function in the interactive contextual bandit setting.", "abstract": "We describe M\u00caL\u00c9E, a meta-learning algorithm for learning a good exploration policy in the interactive contextual bandit setting. Here, an algorithm must take actions based on contexts, and learn based only on a reward signal from the action taken, thereby generating an exploration/exploitation trade-off. M\u00caL\u00c9E addresses this trade-off by learning a good exploration strategy based on offline synthetic tasks, on which it can simulate the contextual bandit setting. Based on these simulations, M\u00caL\u00c9E uses an imitation learning strategy to learn a good exploration policy that can then be applied to true contextual bandit tasks at test time. We compare M\u00caL\u00c9E to seven strong baseline contextual bandit algorithms on a set of three hundred real-world datasets, on which it outperforms alternatives in most settings, especially when differences in rewards are large. 
Finally, we demonstrate the importance of having a rich feature representation for learning how to explore.\n", "keywords": "meta-learning;bandits;exploration;imitation learning", "primary_area": "", "supplementary_material": "", "author": "Amr Sharaf;Hal Daum\u00e9 III", "authorids": "amr@cs.umd.edu;hal@umiacs.umd.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nsharaf2019metalearning,\ntitle={Meta-Learning for Contextual Bandit Exploration},\nauthor={Amr Sharaf and Hal Daum\u00e9 III},\nyear={2019},\nurl={https://openreview.net/forum?id=rJxug2R9Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rJxug2R9Km", "pdf_size": 0, "rating": "3;6;7", "confidence": "4;4;4", "wc_review": "444;394;279", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "612;605;252", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.699673171197595 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 372.3333333333333, 69.08127258687568 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 489.6666666666667, 168.08000740388158 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14271514342461278058&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Learning from Positive and Unlabeled Data with a Selection Bias", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1024", "id": "rJzLciCqKm", "author_site": "Masahiro Kato, Takeshi Teshima, Junya Honda", "tldr": "", "abstract": "We consider the problem of learning a binary classifier only from positive data and unlabeled data (PU learning). Recent methods of PU learning commonly assume that the labeled positive data are identically distributed as the unlabeled positive data. However, this assumption is unrealistic in many instances of PU learning because it fails to capture the existence of a selection bias in the labeling process. When the data has a selection bias, it is difficult to learn the Bayes optimal classifier by conventional methods of PU learning. In this paper, we propose a method to partially identify the classifier. The proposed algorithm learns a scoring function that preserves the order induced by the class posterior under mild assumptions, which can be used as a classifier by setting an appropriate threshold. 
Through experiments, we show that the method outperforms previous methods for PU learning on various real-world datasets.", "keywords": "PU learning;deep learning;machine learning;anomaly detection;sampling bias", "primary_area": "", "supplementary_material": "", "author": "Masahiro Kato;Takeshi Teshima;Junya Honda", "authorids": "mkato@ms.k.u-tokyo.ac.jp;teshima@ms.k.u-tokyo.ac.jp;honda@edu.k.u-tokyo.ac.jp", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkato2018learning,\ntitle={Learning from Positive and Unlabeled Data with a Selection Bias},\nauthor={Masahiro Kato and Takeshi Teshima and Junya Honda},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rJzLciCqKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;2", "wc_review": "554;944;201", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "835;829;204", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 566.3333333333334, 303.45382222378123 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 622.6666666666666, 296.0521725792413 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 143, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18364553294968896955&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=rJzLciCqKm", "pdf": "https://openreview.net/pdf?id=rJzLciCqKm", "email": ";;", "author_num": 3 }, { "id": "rJzoujRct7", "title": "A Solution to China Competitive Poker Using Deep Learning", "track": "main", "status": "Reject", "tldr": "This paper introduces a method to play China competitive poker using deep neural network, gets the state of the art performance.", "abstract": "Recently, deep neural networks have achieved superhuman performance in various games such as Go, chess and Shogi. Compared to Go, China Competitive Poker, also known as Dou dizhu, is a type of imperfect information game, including hidden information, randomness, multi-agent cooperation and competition. It has become widespread and is now a national game in China. We introduce an approach to play China Competitive Poker using Convolutional Neural Network (CNN) to predict actions. This network is trained by supervised learning from human game records. 
Without any search, the network already beats the best AI program by a large margin, and also beats the best human amateur players in duplicate mode.", "keywords": "artificial intelligence;China competitive poker;Dou dizhu;CNN;imperfect information game", "primary_area": "", "supplementary_material": "", "author": "Zhenxing Liu;Maoyu Hu;Zhangfei Zhang", "authorids": "liuzx@smzy.cc;humaoyu@smzy.cc;zzf@smzy.cc", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nliu2019a,\ntitle={A Solution to China Competitive Poker Using Deep Learning},\nauthor={Zhenxing Liu and Maoyu Hu and Zhangfei Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=rJzoujRct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rJzoujRct7", "pdf_size": 0, "rating": "2;3", "confidence": "3;4", "wc_review": "275;441", "wc_reply_reviewers": "301;0", "wc_reply_authors": "765;173", "reply_reviewers": "2;0", "reply_authors": "2;1", "rating_avg": [ 2.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "wc_review_avg": [ 358.0, 83.0 ], "wc_reply_reviewers_avg": [ 150.5, 150.5 ], "wc_reply_authors_avg": [ 469.0, 296.0 ], "reply_reviewers_avg": [ 1.0, 1.0 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 39, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17612525344905702426&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Adv-BNN: Improved Adversarial Defense through Robust Bayesian Neural Network", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/663", "id": "rk4Qso0cKm", "author_site": "Xuanqing Liu, Yao Li, Chongruo Wu, Cho-Jui Hsieh", "tldr": "We design an adversarial training method to Bayesian neural networks, showing a much stronger defense to white-box adversarial attacks", "abstract": "We present a new algorithm to train a robust neural network against adversarial attacks. \nOur algorithm is motivated by the following two ideas. First, although recent work has demonstrated that fusing randomness can improve the robustness of neural networks (Liu 2017), we noticed that adding noise blindly to all the layers is not the optimal way to incorporate randomness. \nInstead, we model randomness under the framework of Bayesian Neural Network (BNN) to formally learn the posterior distribution of models in a scalable way. Second, we formulate the mini-max problem in BNN to learn the best model distribution under adversarial attacks, leading to an adversarial-trained Bayesian neural net. Experiment results demonstrate that the proposed algorithm achieves state-of-the-art performance under strong attacks. 
On CIFAR-10 with VGG network, our model leads to 14% accuracy improvement compared with adversarial training (Madry 2017) and random self-ensemble (Liu, 2017) under PGD attack with 0.035 distortion, and the gap becomes even larger on a subset of ImageNet.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Xuanqing Liu;Yao Li;Chongruo Wu;Cho-Jui Hsieh", "authorids": "xqliu@cs.ucla.edu;yaoli@ucdavis.edu;crwu@ucdavis.edu;chohsieh@cs.ucla.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nliu2018advbnn,\ntitle={Adv-{BNN}: Improved Adversarial Defense through Robust Bayesian Neural Network},\nauthor={Xuanqing Liu and Yao Li and Chongruo Wu and Cho-Jui Hsieh},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rk4Qso0cKm},\n}", "github": "[![github](/images/github_icon.svg) xuanqing94/BayesianDefense](https://github.com/xuanqing94/BayesianDefense)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;3;3", "wc_review": "622;393;945", "wc_reply_reviewers": "334;0;0", "wc_reply_authors": "974;316;428", "reply_reviewers": "2;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 653.3333333333334, 226.43959214079348 ], "wc_reply_reviewers_avg": [ 111.33333333333333, 157.4491099442046 ], "wc_reply_authors_avg": [ 572.6666666666666, 287.4454537627772 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 226, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16111397550296660225&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rk4Qso0cKm", "pdf": "https://openreview.net/pdf?id=rk4Qso0cKm", "email": ";;;", "author_num": 4 }, { "id": "rk4Wf30qKQ", "title": "Security Analysis of Deep Neural Networks Operating in the Presence of Cache Side-Channel Attacks", "track": "main", "status": "Reject", "tldr": "We conduct the first in-depth security analysis of DNN fingerprinting attacks that exploit cache side-channels, which represents a step toward understanding the DNN\u2019s vulnerability to side-channel attacks.", "abstract": "Recent work has introduced attacks that extract the architecture information of deep neural networks (DNN), as this knowledge enhances an adversary\u2019s capability to conduct attacks on black-box networks. This paper presents the first in-depth security analysis of DNN fingerprinting attacks that exploit cache side-channels. First, we define the threat model for these attacks: our adversary does not need the ability to query the victim model; instead, she runs a co-located process on the host machine victim \u2019s deep learning (DL) system is running and passively monitors the accesses of the target functions in the shared framework. Second, we introduce DeepRecon, an attack that reconstructs the architecture of the victim network by using the internal information extracted via Flush+Reload, a cache side-channel technique. 
Once the attacker observes function invocations that map directly to architecture attributes of the victim network, the attacker can reconstruct the victim\u2019s entire network architecture. In our evaluation, we demonstrate that an attacker can accurately reconstruct two complex networks (VGG19 and ResNet50) having only observed one forward propagation. Based on the extracted architecture attributes, we also demonstrate that an attacker can build a meta-model that accurately fingerprints the architecture and family of the pre-trained model in a transfer learning setting. From this meta-model, we evaluate the importance of the observed attributes in the fingerprinting process. Third, we propose and evaluate new framework-level defense techniques that obfuscate our attacker\u2019s observations. Our empirical security analysis represents a step toward understanding the DNNs\u2019 vulnerability to cache side-channel attacks.", "keywords": "DNN Security Analysis;Fingerprinting Attacks;Cache Side-Channel", "primary_area": "", "supplementary_material": "", "author": "Sanghyun Hong;Michael Davinroy;Yigitcan Kaya;Stuart Nevans Locke;Ian Rackow;Kevin Kulda;Dana Dachman-Soled;Tudor Dumitra\u0219", "authorids": "shhong@cs.umd.edu;mdavinr1@swarthmore.edu;yigitcan@cs.umd.edu;stnevans@mail.rit.edu;ian.rackow@gmail.com;kevin_kulda1@baylor.edu;danadach@ece.umd.edu;tdumitra@umiacs.umd.edu", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@misc{\nhong2019security,\ntitle={Security Analysis of Deep Neural Networks Operating in the Presence of Cache Side-Channel Attacks},\nauthor={Sanghyun Hong and Michael Davinroy and Yigitcan Kaya and Stuart Nevans Locke and Ian Rackow and Kevin Kulda and Dana Dachman-Soled and Tudor Dumitra\u0219},\nyear={2019},\nurl={https://openreview.net/forum?id=rk4Wf30qKQ},\n}", "github": "[![github](/images/github_icon.svg) Sanghyun-Hong/DeepRecon](https://github.com/Sanghyun-Hong/DeepRecon)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rk4Wf30qKQ", "pdf_size": 0, "rating": "4;4;6", "confidence": "2;4;4", "wc_review": "485;625;481", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1134;1195;729", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 530.3333333333334, 66.95935749061184 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1019.3333333333334, 206.8015688743412 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7228216031896602369&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "rkGG6s0qKQ", "title": "The GAN Landscape: Losses, Architectures, Regularization, and Normalization", "track": "main", "status": "Reject", "tldr": "A sober view on the current state of GANs from a practical perspective", "abstract": "Generative adversarial networks (GANs) are a class of deep generative models which aim to learn a target distribution in an unsupervised fashion. 
While they were successfully applied to many problems, training a GAN is a notoriously challenging task and requires a significant amount of hyperparameter tuning, neural architecture engineering, and a non-trivial amount of ``tricks\". The success in many practical applications coupled with the lack of a measure to quantify the failure modes of GANs resulted in a plethora of proposed losses, regularization and normalization schemes, and neural architectures. In this work we take a sober view of the current state of GANs from a practical perspective. We reproduce the current state of the art and go beyond fairly exploring the GAN landscape. We discuss common pitfalls and reproducibility issues, open-source our code on Github, and provide pre-trained models on TensorFlow Hub.", "keywords": "GANs;empirical evaluation;large-scale;reproducibility", "primary_area": "", "supplementary_material": "", "author": "Karol Kurach;Mario Lucic;Xiaohua Zhai;Marcin Michalski;Sylvain Gelly", "authorids": "kkurach@gmail.com;lucic@google.com;xzhai@google.com;michalski@google.com;sylvain.gelly@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nkurach2019the,\ntitle={The {GAN} Landscape: Losses, Architectures, Regularization, and Normalization},\nauthor={Karol Kurach and Mario Lucic and Xiaohua Zhai and Marcin Michalski and Sylvain Gelly},\nyear={2019},\nurl={https://openreview.net/forum?id=rkGG6s0qKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkGG6s0qKQ", "pdf_size": 0, "rating": "4;4;7", "confidence": "2;3;4", "wc_review": "601;372;761", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "490;670;765", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 578.0, 159.63917647829015 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 641.6666666666666, 114.04190263037336 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844387, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10216506861485607701&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "rkGcYi09Km", "title": "NUTS: Network for Unsupervised Telegraphic Summarization", "track": "main", "status": "Reject", "tldr": "In this paper, we propose an unsupervised deep learning network (NUTS) to generate telegraphic summaries.", "abstract": "Extractive summarization methods operate by ranking and selecting the sentences which best encapsulate the theme of a given document. They do not fare well in domains like fictional narratives where there is no central theme and core information is not encapsulated by a small set of sentences. For the purpose of reducing the size of the document while conveying the idea expressed by each sentence, we need more sentence specific methods. Telegraphic summarization, which selects short segments across several sentences, is better suited for such domains. Telegraphic summarization captures the plot better by retaining shorter versions of each sentence while not really concerning itself with grammatically linking these segments. 
In this paper, we propose an unsupervised deep learning network (NUTS) to generate telegraphic summaries.\nWe use multiple encoder-decoder networks and learn to drop portions of the text that are inferable from the chosen segments. The model is agnostic to both sentence length and style. We demonstrate that the summaries produced by our model show significant quantitative and qualitative improvement over those produced by existing methods and baselines.", "keywords": "nlp;summarization;unsupervised learning;deep learning", "primary_area": "", "supplementary_material": "", "author": "Chanakya Malireddy;Tirth Maniar;Sajal Maheshwari;Manish Shrivastava", "authorids": "chanakya.malireddy@gmail.com;tirthmaniar1998@gmail.com;sajalmaheshwari624@gmail.com;m.shrivastava@iiit.ac.in", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmalireddy2019nuts,\ntitle={{NUTS}: Network for Unsupervised Telegraphic Summarization},\nauthor={Chanakya Malireddy and Tirth Maniar and Sajal Maheshwari and Manish Shrivastava},\nyear={2019},\nurl={https://openreview.net/forum?id=rkGcYi09Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkGcYi09Km", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "wc_review": "235;497;400", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "251;573;699", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 377.3333333333333, 108.15523822522678 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 507.6666666666667, 188.6395740264722 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:TexcMkOZd9UJ:scholar.google.com/&scioq=NUTS:+Network+for+Unsupervised+Telegraphic+Summarization&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rkGqLoR5tX", "title": "ODIN: Outlier Detection In Neural Networks", "track": "main", "status": "Withdraw", "tldr": "An add-on method for deep learning to detect outliers during prediction-time", "abstract": "Adoption of deep learning in safety-critical systems raise the need for understanding what deep neural networks do not understand. Several methodologies to estimate model uncertainty have been proposed, but these methodologies constrain either how the neural network is trained or constructed. We present Outlier Detection In Neural networks (ODIN), an assumption-free method for detecting outlier observations during prediction, based on principles widely used in manufacturing process monitoring. By using a linear approximation of the hidden layer manifold, we add prediction-time outlier detection to models after training without altering architecture or training. 
We demonstrate that ODIN efficiently detect outliers during prediction on Fashion-MNIST, ImageNet-synsets and speech command recognition.", "keywords": "Outlier Detection;Model Uncertainty;Safety", "primary_area": "", "supplementary_material": "", "author": "Rickard Sj\u00f6gren;Johan Trygg", "authorids": "rickard.sjoegren@sartorius-stedim.com;johan.trygg@sartorius-stedim.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkGqLoR5tX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "wc_review": "299;262;93", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "683;212;66", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 218.0, 89.66976450658643 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 320.3333333333333, 263.27974138209385 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PwS6IRmhoqwJ:scholar.google.com/&scioq=ODIN:+Outlier+Detection+In+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rkMD73A5FX", "title": "Can I trust you more? Model-Agnostic Hierarchical Explanations", "track": "main", "status": "Reject", "tldr": "A new framework for context-dependent and context-free explanations of predictions", "abstract": "Interactions such as double negation in sentences and scene interactions in images are common forms of complex dependencies captured by state-of-the-art machine learning models. We propose Mah\u00e9, a novel approach to provide Model-Agnostic Hierarchical Explanations of how powerful machine learning models, such as deep neural networks, capture these interactions as either dependent on or free of the context of data instances. Specifically, Mah\u00e9 provides context-dependent explanations by a novel local interpretation algorithm that effectively captures any-order interactions, and obtains context-free explanations through generalizing context-dependent interactions to explain global behaviors. Experimental results show that Mah\u00e9 obtains improved local interaction interpretations over state-of-the-art methods and successfully provides explanations of interactions that are context-free.", "keywords": "interpretability;interactions;context-dependent;context-free", "primary_area": "", "supplementary_material": "", "author": "Michael Tsang;Youbang Sun;Dongxu Ren;Beibei Xin;Yan Liu", "authorids": "tsangm@usc.edu;syb98@mail.ustc.edu.cn;rdx15@mails.tsinghua.edu.cn;bxin@usc.edu;yanliu.cs@usc.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntsang2019can,\ntitle={Can I trust you more? 
Model-Agnostic Hierarchical Explanations},\nauthor={Michael Tsang and Youbang Sun and Dongxu Ren and Beibei Xin and Yan Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=rkMD73A5FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkMD73A5FX", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "wc_review": "1078;292;216", "wc_reply_reviewers": "550;0;0", "wc_reply_authors": "1841;714;717", "reply_reviewers": "3;0;0", "reply_authors": "4;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 528.6666666666666, 389.6745080477067 ], "wc_reply_reviewers_avg": [ 183.33333333333334, 259.27248643506744 ], "wc_reply_authors_avg": [ 1090.6666666666667, 530.5672017337253 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16202376299751337489&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Optimal Completion Distillation for Sequence Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/738", "id": "rkMW1hRqKX", "author_site": "Sara Sabour, William Chan, Mohammad Norouzi", "tldr": "Optimal Completion Distillation (OCD) is a training procedure for optimizing sequence to sequence models based on edit distance which achieves state-of-the-art on end-to-end Speech Recognition tasks.", "abstract": "We present Optimal Completion Distillation (OCD), a training procedure for optimizing sequence to sequence models based on edit distance. OCD is efficient, has no hyper-parameters of its own, and does not require pre-training or joint optimization with conditional log-likelihood. Given a partial sequence generated by the model, we first identify the set of optimal suffixes that minimize the total edit distance, using an efficient dynamic programming algorithm. Then, for each position of the generated sequence, we use a target distribution which puts equal probability on the first token of all the optimal suffixes. 
OCD achieves the state-of-the-art performance on end-to-end speech recognition, on both Wall Street Journal and Librispeech datasets, achieving $9.3\\%$ WER and $4.5\\%$ WER, respectively.", "keywords": "Sequence Learning;Edit Distance;Speech Recognition;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Sara Sabour;William Chan;Mohammad Norouzi", "authorids": "sasabour@google.com;williamchan@google.com;mnorouzi@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsabour2018optimal,\ntitle={Optimal Completion Distillation for Sequence Learning},\nauthor={Sara Sabour and William Chan and Mohammad Norouzi},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkMW1hRqKX},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rkMW1hRqKX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;4;4", "wc_review": "454;584;273", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "197;647;567", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 437.0, 127.53300226477327 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 470.3333333333333, 196.01587237318876 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1067727282240510663&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rkMW1hRqKX", "pdf": "https://openreview.net/pdf?id=rkMW1hRqKX", "email": ";;", "author_num": 3 }, { "id": "rkMhusC5Y7", "title": "Learning to Coordinate Multiple Reinforcement Learning Agents for Diverse Query Reformulation", "track": "main", "status": "Reject", "tldr": "Multiple diverse query reformulation agents trained with reinforcement learning to improve search engines.", "abstract": "We propose a method to efficiently learn diverse strategies in reinforcement learning for query reformulation in the tasks of document retrieval and question answering. In the proposed framework an agent consists of multiple specialized sub-agents and a meta-agent that learns to aggregate the answers from sub-agents to produce a final answer. Sub-agents are trained on disjoint partitions of the training data, while the meta-agent is trained on the full training set. Our method makes learning faster, because it is highly parallelizable, and has better generalization performance than strong baselines, such as an ensemble of agents trained on the full data. We show that the improved performance is due to the increased diversity of reformulation strategies. 
", "keywords": "Reinforcement Learning;Multi-agent;Information Retrieval;Question-Answering;Query Reformulation;Query Expansion", "primary_area": "", "supplementary_material": "", "author": "Rodrigo Nogueira;Jannis Bulian;Massimiliano Ciaramita", "authorids": "rodrigonogueira@nyu.edu;jbulian@google.com;massi@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnogueira2019learning,\ntitle={Learning to Coordinate Multiple Reinforcement Learning Agents for Diverse Query Reformulation},\nauthor={Rodrigo Nogueira and Jannis Bulian and Massimiliano Ciaramita},\nyear={2019},\nurl={https://openreview.net/forum?id=rkMhusC5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkMhusC5Y7", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;4;4", "wc_review": "486;253;487", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "231;596;330", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 408.6666666666667, 110.07371267574389 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 385.6666666666667, 154.12188538801217 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13868622136493361853&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rkMk9j0qYm", "title": "Explainable Adversarial Learning: Implicit Generative Modeling of Random Noise during Training for Adversarial Robustness", "track": "main", "status": "Withdraw", "tldr": "Noise modeling at the input during discriminative training improves adversarial robustness. Propose PCA based evaluation metric for adversarial robustness", "abstract": "We introduce Explainable Adversarial Learning, ExL, an approach for training neural networks that are intrinsically robust to adversarial attacks. We find that the implicit generative modeling of random noise with the same loss function used during posterior maximization, improves a model's understanding of the data manifold furthering adversarial robustness. We prove our approach's efficacy and provide a simplistic visualization tool for understanding adversarial data, using Principal Component Analysis. Our analysis reveals that adversarial robustness, in general, manifests in models with higher variance along the high-ranked principal components. We show that models learnt with our approach perform remarkably well against a wide-range of attacks. 
Furthermore, combining ExL with state-of-the-art adversarial training extends the robustness of a model, even beyond what it is adversarially trained for, in both white-box and black-box attack scenarios.", "keywords": "Adversarial Robustness;PCA variance;PCA subspace;Generative Noise Modeling;Adversarial attack;Adversarial Robustness Metric", "primary_area": "", "supplementary_material": "", "author": "Priyadarshini Panda;Kaushik Roy", "authorids": "pandap@purdue.edu;kaushik@purdue.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkMk9j0qYm", "pdf_size": 0, "rating": "3;5;5", "confidence": "4;4;4", "wc_review": "526;238;510", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1126;1125;1620", "reply_reviewers": "0;0;0", "reply_authors": "2;2;3", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 424.6666666666667, 132.15479139588123 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1290.3333333333333, 233.10989301662474 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=549308163145025478&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rkMlSnAqYX", "title": "Mitigating Bias in Natural Language Inference Using Adversarial Learning", "track": "main", "status": "Withdraw", "tldr": "Adversarial learning methods encourage NLI models to ignore dataset-specific biases and help models transfer across datasets.", "abstract": "Recognizing the relationship between two texts is an important aspect of natural language understanding (NLU), and a variety of neural network models have been proposed for solving NLU tasks. Unfortunately, recent work showed that the datasets these models are trained on often contain biases that allow models to achieve non-trivial performance without possibly learning the relationship between the two texts. We propose a framework for building robust models by using adversarial learning to encourage models to learn latent, bias-free representations. We test our approach in a Natural Language Inference (NLI) scenario, and show that our adversarially-trained models learn robust representations that ignore known dataset-specific biases. Our experiments demonstrate that our models are more robust to new NLI datasets. ", "keywords": "natural language inference;adversarial learning;bias;artifacts", "primary_area": "", "supplementary_material": "", "author": "Yonatan Belinkov;Adam Poliak;Stuart M. 
Shieber;Benjamin Van Durme", "authorids": "belinkov@seas.harvard.edu;azpoliak@cs.jhu.edu;shieber@seas.harvard.edu;vandurme@cs.jhu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkMlSnAqYX", "pdf_size": 0, "rating": "4;4;8", "confidence": "4;5;4", "wc_review": "658;922;223", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "461;1115;219", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.333333333333333, 1.8856180831641267 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 601.0, 288.1978487081401 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 598.3333333333334, 378.46121186838803 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3278412062829523628&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rkMnHjC5YQ", "title": "Improved Learning of One-hidden-layer Convolutional Neural Networks with Overlaps", "track": "main", "status": "Reject", "tldr": "We propose an algorithm for provably recovering parameters (convolutional and output weights) of a convolutional network with overlapping patches.", "abstract": "We propose a new algorithm to learn a one-hidden-layer convolutional neural network where both the convolutional weights and the outputs weights are parameters to be learned. Our algorithm works for a general class of (potentially overlapping) patches, including commonly used structures for computer vision tasks. Our algorithm draws ideas from (1) isotonic regression for learning neural networks and (2) landscape analysis of non-convex matrix factorization problems. We believe these findings may inspire further development in designing provable algorithms for learning neural networks and other complex models. While our focus is theoretical, we also present experiments that illustrate our theoretical findings.", "keywords": "deep learning;parameter recovery;convolutional neural networks;non-convex optimization", "primary_area": "", "supplementary_material": "", "author": "Simon S. Du;Surbhi Goel", "authorids": "ssdu@cs.cmu.edu;surbhi@cs.utexas.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndu2019improved,\ntitle={Improved Learning of One-hidden-layer Convolutional Neural Networks with Overlaps},\nauthor={Simon S. 
Du and Surbhi Goel},\nyear={2019},\nurl={https://openreview.net/forum?id=rkMnHjC5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=rkMnHjC5YQ", "pdf_size": 0, "rating": "5;6;6", "confidence": "1;4;3", "wc_review": "92;277;211", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "129;188;155", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 193.33333333333334, 76.5520882943256 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 157.33333333333334, 24.143091949642425 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6207101644801642908&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rkVOXhAqY7", "title": "The Conditional Entropy Bottleneck", "track": "main", "status": "Reject", "tldr": "The Conditional Entropy Bottleneck is an information-theoretic objective function for learning optimal representations.", "abstract": "We present a new family of objective functions, which we term the Conditional Entropy Bottleneck (CEB). These objectives are motivated by the Minimum Necessary Information (MNI) criterion. We demonstrate the application of CEB to classification tasks. We show that CEB gives: well-calibrated predictions; strong detection of challenging out-of-distribution examples and powerful whitebox adversarial examples; and substantial robustness to those adversaries. Finally, we report that CEB fails to learn from information-free datasets, providing a possible resolution to the problem of generalization observed in Zhang et al. 
(2016).", "keywords": "representation learning;information theory;uncertainty;out-of-distribution detection;adversarial example robustness;generalization;objective function", "primary_area": "", "supplementary_material": "", "author": "Ian Fischer", "authorids": "iansf@google.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nfischer2019the,\ntitle={The Conditional Entropy Bottleneck},\nauthor={Ian Fischer},\nyear={2019},\nurl={https://openreview.net/forum?id=rkVOXhAqY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=rkVOXhAqY7", "pdf_size": 0, "rating": "2;6;6", "confidence": "4;3;3", "wc_review": "387;428;853", "wc_reply_reviewers": "0;587;308", "wc_reply_authors": "1827;2100;692", "reply_reviewers": "0;2;1", "reply_authors": "3;5;2", "rating_avg": [ 4.666666666666667, 1.8856180831641267 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 556.0, 210.67668752538015 ], "wc_reply_reviewers_avg": [ 298.3333333333333, 239.73921016156052 ], "wc_reply_authors_avg": [ 1539.6666666666667, 609.6645708001153 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 3.3333333333333335, 1.247219128924647 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 126, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6592592781559357988&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "id": "rke41hC5Km", "title": "Generating Realistic Stock Market Order Streams", "track": "main", "status": "Reject", "tldr": "We propose an approach to generate realistic and high-fidelity stock market data based on generative adversarial networks.", "abstract": "We propose an approach to generate realistic and high-fidelity stock market data based on generative adversarial networks.\nWe model the order stream as a stochastic process with finite history dependence, and employ a conditional Wasserstein GAN to capture history dependence of orders in a stock market. \nWe test our approach with actual market and synthetic data on a number of different statistics, and find the generated data to be close to real data. ", "keywords": "application in finance;stock markets;generative models", "primary_area": "", "supplementary_material": "", "author": "Junyi Li;Xintong Wang;Yaoyang Lin;Arunesh Sinha;Michael P. Wellman", "authorids": "junyili@umich.edu;xintongw@umich.edu;yaoyang@umich.edu;arunesh@umich.edu;wellman@umich.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019generating,\ntitle={Generating Realistic Stock Market Order Streams},\nauthor={Junyi Li and Xintong Wang and Yaoyang Lin and Arunesh Sinha and Michael P. 
Wellman},\nyear={2019},\nurl={https://openreview.net/forum?id=rke41hC5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rke41hC5Km", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;4", "wc_review": "273;211;299", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "116;183;172", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 261.0, 36.914315199752345 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 157.0, 29.337120967584166 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5512423903485842855&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 20 }, { "title": "Caveats for information bottleneck in deterministic scenarios", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/668", "id": "rke4HiAcY7", "author_site": "Artemy Kolchinsky, Brendan D Tracey, Steven Van Kuyk", "tldr": "Information bottleneck behaves in surprising ways whenever the output is a deterministic function of the input.", "abstract": "Information bottleneck (IB) is a method for extracting information from one random variable X that is relevant for predicting another random variable Y. To do so, IB identifies an intermediate \"bottleneck\" variable T that has low mutual information I(X;T) and high mutual information I(Y;T). The \"IB curve\" characterizes the set of bottleneck variables that achieve maximal I(Y;T) for a given I(X;T), and is typically explored by maximizing the \"IB Lagrangian\", I(Y;T) - \u03b2I(X;T). In some cases, Y is a deterministic function of X, including many classification problems in supervised learning where the output class Y is a deterministic function of the input X. We demonstrate three caveats when using IB in any situation where Y is a deterministic function of X: (1) the IB curve cannot be recovered by maximizing the IB Lagrangian for different values of \u03b2; (2) there are \"uninteresting\" trivial solutions at all points of the IB curve; and (3) for multi-layer classifiers that achieve low prediction error, different layers cannot exhibit a strict trade-off between compression and prediction, contrary to a recent proposal. We also show that when Y is a small perturbation away from being a deterministic function of X, these three caveats arise in an approximate way. To address problem (1), we propose a functional that, unlike the IB Lagrangian, can recover the IB curve in all cases. We demonstrate the three caveats on the MNIST dataset.", "keywords": "information bottleneck;supervised learning;deep learning;information theory", "primary_area": "", "supplementary_material": "", "author": "Artemy Kolchinsky;Brendan D. Tracey;Steven Van Kuyk", "authorids": "artemyk@gmail.com;tracey.brendan@gmail.com;steven.jvk@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkolchinsky2018caveats,\ntitle={Caveats for information bottleneck in deterministic scenarios},\nauthor={Artemy Kolchinsky and Brendan D. 
Tracey and Steven Van Kuyk},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rke4HiAcY7},\n}", "github": "[![github](/images/github_icon.svg) artemyk/ibcurve](https://github.com/artemyk/ibcurve)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "2;6;8", "confidence": "4;4;4", "wc_review": "408;544;657", "wc_reply_reviewers": "0;163;99", "wc_reply_authors": "1258;1719;1058", "reply_reviewers": "0;1;1", "reply_authors": "2;4;2", "rating_avg": [ 5.333333333333333, 2.494438257849294 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 536.3333333333334, 101.79827547109474 ], "wc_reply_reviewers_avg": [ 87.33333333333333, 67.05387551978052 ], "wc_reply_authors_avg": [ 1345.0, 276.77548060958486 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8561375002982335569&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rke4HiAcY7", "pdf": "https://openreview.net/pdf?id=rke4HiAcY7", "email": ";;", "author_num": 3 }, { "id": "rke8ZhCcFQ", "title": "ATTACK GRAPH CONVOLUTIONAL NETWORKS BY ADDING FAKE NODES", "track": "main", "status": "Reject", "tldr": "non-targeted and targeted attack on GCN by adding fake nodes", "abstract": "Graph convolutional networks (GCNs) have been widely used for classifying graph nodes in the semi-supervised setting.\nPrevious works have shown that GCNs are vulnerable to perturbations of the adjacency and feature matrices of existing nodes. However, it is unrealistic to change the connections of existing nodes in many applications, such as existing users in social networks. In this paper, we investigate methods attacking GCNs by adding fake nodes. A greedy algorithm is proposed to generate adjacency and feature matrices of fake nodes, aiming to minimize the classification accuracy on the existing ones. In addition, we introduce a discriminator to classify fake nodes from real nodes, and propose a Greedy-GAN algorithm to simultaneously update the discriminator and the attacker, to make fake nodes indistinguishable from the real ones. 
Our non-targeted attack decreases the accuracy of GCN down to 0.10, and our targeted attack reaches a success rate of 0.99 for attacking the whole datasets, and 0.94 on average for attacking a single node.", "keywords": "Graph Convolutional Network;adversarial attack;node classification", "primary_area": "", "supplementary_material": "", "author": "Xiaoyun Wang;Joe Eaton;Cho-Jui Hsieh;Felix Wu", "authorids": "xiywang@ucdavis.edu;featon@nvidia.com;chohsieh@ucdavis.edu;sfwu@ucdavis.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nwang2019attack,\ntitle={{ATTACK} {GRAPH} {CONVOLUTIONAL} {NETWORKS} {BY} {ADDING} {FAKE} {NODES}},\nauthor={Xiaoyun Wang and Joe Eaton and Cho-Jui Hsieh and Felix Wu},\nyear={2019},\nurl={https://openreview.net/forum?id=rke8ZhCcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rke8ZhCcFQ", "pdf_size": 0, "rating": "3;3;4", "confidence": "2;4;3", "wc_review": "300;469;114", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 294.3333333333333, 144.9835239681469 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7403794455479836050&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "rkeMHjR9Ym", "title": "Stochastic Gradient Descent Learns State Equations with Nonlinear Activations", "track": "main", "status": "Reject", "tldr": "We study the state equation of a recurrent neural network. We show that SGD can efficiently learn the unknown dynamics from few input/output observations under proper assumptions.", "abstract": "We study discrete time dynamical systems governed by the state equation $h_{t+1}=\u03d5(Ah_t+Bu_t)$. Here A,B are weight matrices, \u03d5 is an activation function, and $u_t$ is the input data. This relation is the backbone of recurrent neural networks (e.g. LSTMs) which have broad applications in sequential learning tasks. We utilize stochastic gradient descent to learn the weight matrices from a finite input/state trajectory $(u_t,h_t)_{t=0}^N$. We prove that SGD estimate linearly converges to the ground truth weights while using near-optimal sample size. Our results apply to increasing activations whose derivatives are bounded away from zero. The analysis is based on i) an SGD convergence result with nonlinear activations and ii) careful statistical characterization of the state vector. 
Numerical experiments verify the fast convergence of SGD on ReLU and leaky ReLU, consistent with our theory.", "keywords": "recurrent neural network;state equation;gradient descent;sample complexity", "primary_area": "", "supplementary_material": "", "author": "Samet Oymak", "authorids": "sametoymak@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\noymak2019stochastic,\ntitle={Stochastic Gradient Descent Learns State Equations with Nonlinear Activations},\nauthor={Samet Oymak},\nyear={2019},\nurl={https://openreview.net/forum?id=rkeMHjR9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkeMHjR9Ym", "pdf_size": 0, "rating": "5;7;7", "confidence": "5;3;3", "wc_review": "585;470;538", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "632;577;376", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 531.0, 47.20875624994442 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 528.3333333333334, 110.0313086757078 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7528291881711452634&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "title": "Deep Learning 3D Shapes Using Alt-az Anisotropic 2-Sphere Convolution", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/946", "id": "rkeSiiA5Fm", "author_site": "Min Liu, Fupin Yao, Chiho Choi, Ayan Sinha, Karthik Ramani", "tldr": "A method for applying deep learning to 3D surfaces using their spherical descriptors and alt-az anisotropic convolution on 2-sphere.", "abstract": "The ground-breaking performance obtained by deep convolutional neural networks (CNNs) for image processing tasks is inspiring research efforts attempting to extend it for 3D geometric tasks. One of the main challenges in applying CNNs to 3D shape analysis is how to define a natural convolution operator on non-Euclidean surfaces. In this paper, we present a method for applying deep learning to 3D surfaces using their spherical descriptors and alt-az anisotropic convolution on the 2-sphere. A cascaded set of geodesic disk filters rotates on the 2-sphere to collect spherical patterns and extract geometric features for various 3D shape analysis tasks. 
We demonstrate theoretically and experimentally that our proposed method can bridge the gap between 2D images and 3D shapes with the desired rotation equivariance/invariance, and its effectiveness is evaluated in applications of non-rigid/rigid shape classification and shape retrieval.", "keywords": "Spherical Convolution;Geometric deep learning;3D shape analysis", "primary_area": "", "supplementary_material": "", "author": "Min Liu;Fupin Yao;Chiho Choi;Ayan Sinha;Karthik Ramani", "authorids": "liu66@purdue.edu;yao153@purdue.edu;chihochoi@purdue.edu;asinha@magicleap.com;ramani@purdue.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nliu2018deep,\ntitle={Deep Learning 3D Shapes Using Alt-az Anisotropic 2-Sphere Convolution},\nauthor={Min Liu and Fupin Yao and Chiho Choi and Sinha Ayan and Karthik Ramani},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkeSiiA5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;5;5", "wc_review": "486;1301;112", "wc_reply_reviewers": "0;293;0", "wc_reply_authors": "2022;1402;703", "reply_reviewers": "0;1;0", "reply_authors": "3;4;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 633.0, 496.4117914258954 ], "wc_reply_reviewers_avg": [ 97.66666666666667, 138.12152459177227 ], "wc_reply_authors_avg": [ 1375.6666666666667, 538.8013445500011 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 1.247219128924647 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5655071054318144878&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rkeSiiA5Fm", "pdf": "https://openreview.net/pdf?id=rkeSiiA5Fm", "email": ";;;;", "author_num": 5 }, { "id": "rkeT8iR9Y7", "title": "Directional Analysis of Stochastic Gradient Descent via von Mises-Fisher Distributions in Deep Learning", "track": "main", "status": "Reject", "tldr": "One of the theoretical issues in deep learning", "abstract": "Although stochastic gradient descent (SGD) is a driving force behind the recent success of deep learning, our understanding of its dynamics in a high-dimensional parameter space is limited. In recent years, some researchers have used the stochasticity of minibatch gradients, or the signal-to-noise ratio, to better characterize the learning dynamics of SGD. Inspired by these works, we analyze SGD from a geometrical perspective by inspecting the stochasticity of the norms and directions of minibatch gradients. We propose a model of the directional concentration for minibatch gradients through the von Mises-Fisher (VMF) distribution, and show that the directional uniformity of minibatch gradients increases over the course of SGD. 
We empirically verify our result using deep convolutional networks and observe a higher correlation between the gradient stochasticity and the proposed directional uniformity than that against the gradient norm stochasticity, suggesting that the directional statistics of minibatch gradients is a major factor behind SGD.", "keywords": "directional statistics;deep learning;SNR;gradient stochasticity;SGD;stochastic gradient;von Mises-Fisher;angle", "primary_area": "", "supplementary_material": "", "author": "Cheolhyoung Lee;Kyunghyun Cho;Wanmo Kang", "authorids": "bloodwass@kaist.ac.kr;kyunghyun.cho@nyu.edu;wanmo.kang@kaist.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nlee2019directional,\ntitle={Directional Analysis of Stochastic Gradient Descent via von Mises-Fisher Distributions in Deep Learning},\nauthor={Cheolhyoung Lee and Kyunghyun Cho and Wanmo Kang},\nyear={2019},\nurl={https://openreview.net/forum?id=rkeT8iR9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=rkeT8iR9Y7", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;3;3", "wc_review": "413;707;252", "wc_reply_reviewers": "0;181;0", "wc_reply_authors": "420;1257;216", "reply_reviewers": "0;1;0", "reply_authors": "1;3;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 457.3333333333333, 188.3796403247678 ], "wc_reply_reviewers_avg": [ 60.333333333333336, 85.32421826317673 ], "wc_reply_authors_avg": [ 631.0, 450.4153638587387 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1670161511942786393&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rkeUrjCcYQ", "title": "Monge-Amp\\`ere Flow for Generative Modeling", "track": "main", "status": "Reject", "tldr": "A gradient flow based dynamical system for invertible generative modeling", "abstract": "We present a deep generative model, named Monge-Amp\\`ere flow, which builds on continuous-time gradient flow arising from the Monge-Amp\\`ere equation in optimal transport theory. The generative map from the latent space to the data space follows a dynamical system, where a learnable potential function guides a compressible fluid to flow towards the target density distribution. Training of the model amounts to solving an optimal control problem. The Monge-Amp\\`ere flow has tractable likelihoods and supports efficient sampling and inference. One can easily impose symmetry constraints in the generative model by designing suitable scalar potential functions. We apply the approach to unsupervised density estimation of the MNIST dataset and variational calculation of the two-dimensional Ising model at the critical point. This approach brings insights and techniques from Monge-Amp\\`ere equation, optimal transport, and fluid dynamics into reversible flow-based generative models. 
", "keywords": "generative modeling;Monge-Amp\\`ere equation;dynamical system;optimal transport;density estimation;free energy calculation", "primary_area": "", "supplementary_material": "", "author": "Linfeng Zhang;Weinan E;Lei Wang", "authorids": "linfengz@princeton.edu;weinan@math.princeton.edu;wanglei@iphy.ac.cn", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2019mongeampere,\ntitle={Monge-Amp\\`ere Flow for Generative Modeling},\nauthor={Linfeng Zhang and Weinan E and Lei Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=rkeUrjCcYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rkeUrjCcYQ", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "wc_review": "797;676;778", "wc_reply_reviewers": "0;192;0", "wc_reply_authors": "1085;1081;757", "reply_reviewers": "0;1;0", "reply_authors": "2;2;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 750.3333333333334, 53.1308656892478 ], "wc_reply_reviewers_avg": [ 64.0, 90.50966799187809 ], "wc_reply_authors_avg": [ 974.3333333333334, 153.68654968980064 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 49, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7247383870657032281&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rkeX-3Rqtm", "title": "Training Hard-Threshold Networks with Combinatorial Search in a Discrete Target Propagation Setting", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning deep neural networks with hard-threshold activation has recently become an important problem due to the proliferation of resource-constrained computing devices. In order to circumvent the inability to train with backpropagation in the present of hard-threshold activations, \\cite{friesen2017} introduced a discrete target propagation framework for training hard-threshold networks in a layer-by-layer fashion. Rather than using a gradient-based target heuristic, we explore the use of search methods for solving the target setting problem. Building on both traditional combinatorial optimization algorithms and gradient-based techniques, we develop a novel search algorithm Guided Random Local Search (GRLS). We demonstrate the effectiveness of our algorithm in training small networks on several datasets and evaluate our target-setting algorithm compared to simpler search methods and gradient-based techniques. Our results indicate that combinatorial optimization is a viable method for training hard-threshold networks that may have the potential to eventually surpass gradient-based methods in many settings. 
", "keywords": "hard-threshold network;combinatorial optimization;search;target propagation", "primary_area": "", "supplementary_material": "", "author": "Lukas Nabergall;Justin Toth;Leah Cousins", "authorids": "lnaberga@uwaterloo.ca;wjtoth@uwaterloo.ca;lm2cousi@uwaterloo.ca", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnabergall2019training,\ntitle={Training Hard-Threshold Networks with Combinatorial Search in a Discrete Target Propagation Setting},\nauthor={Lukas Nabergall and Justin Toth and Leah Cousins},\nyear={2019},\nurl={https://openreview.net/forum?id=rkeX-3Rqtm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rkeX-3Rqtm", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;5", "wc_review": "619;126;1211", "wc_reply_reviewers": "8;0;10", "wc_reply_authors": "138;0;636", "reply_reviewers": "1;0;1", "reply_authors": "1;0;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 652.0, 443.56359934812804 ], "wc_reply_reviewers_avg": [ 6.0, 4.320493798938574 ], "wc_reply_authors_avg": [ 258.0, 273.1592941856455 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:DhOmILPp-qIJ:scholar.google.com/&scioq=Training+Hard-Threshold+Networks+with+Combinatorial+Search+in+a+Discrete+Target+Propagation+Setting&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rkeYUsRqKQ", "title": "An Adversarial Learning Framework for a Persona-based Multi-turn Dialogue Model", "track": "main", "status": "Reject", "tldr": "This paper develops an adversarial learning framework for neural conversation models with persona", "abstract": "In this paper, we extend the persona-based sequence-to-sequence (Seq2Seq) neural network conversation model to a multi-turn dialogue scenario by modifying the state-of-the-art hredGAN architecture to simultaneously capture utterance attributes such as speaker identity, dialogue topic, speaker sentiments and so on. The proposed system, phredGAN has a persona-based HRED generator (PHRED) and a conditional discriminator. We also explore two approaches to accomplish the conditional discriminator: (1) $phredGAN_a$, a system that passes the attribute representation as an additional input into a traditional adversarial discriminator, and (2) $phredGAN_d$, a dual discriminator system which in addition to the adversarial discriminator, collaboratively predicts the attribute(s) that generated the input utterance. To demonstrate the superior performance of phredGAN over the persona SeqSeq model, we experiment with two conversational datasets, the Ubuntu Dialogue Corpus (UDC) and TV series transcripts from the Big Bang Theory and Friends. Performance comparison is made with respect to a variety of quantitative measures as well as crowd-sourced human evaluation. 
We also explore the trade-offs from using either variant of $phredGAN$ on datasets with many but weak attribute modalities (such as Big Bang Theory and Friends) and ones with few but strong attribute modalities (customer-agent interactions in the Ubuntu dataset).", "keywords": "conversation model;dialogue system;adversarial net;persona", "primary_area": "", "supplementary_material": "", "author": "Oluwatobi O. Olabiyi;Anish Khazane;Alan Salimov;Erik T.Mueller", "authorids": "oluwatobi.olabiyi@capitalone.com;anish.khazan@capitalone.com;alan.salimov@capitalone.com;erik.mueller@capitalone.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nolabiyi2019an,\ntitle={An Adversarial Learning Framework for a Persona-based Multi-turn Dialogue Model},\nauthor={Oluwatobi O. Olabiyi and Anish Khazane and Alan Salimov and Erik T.Mueller},\nyear={2019},\nurl={https://openreview.net/forum?id=rkeYUsRqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkeYUsRqKQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "wc_review": "580;633;216", "wc_reply_reviewers": "135;0;0", "wc_reply_authors": "581;527;26", "reply_reviewers": "1;0;0", "reply_authors": "2;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 476.3333333333333, 185.35071860904367 ], "wc_reply_reviewers_avg": [ 45.0, 63.63961030678928 ], "wc_reply_authors_avg": [ 378.0, 249.87596923273753 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1024358769837049963&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Small nonlinearities in activation functions create bad local minima in neural networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/808", "id": "rke_YiRct7", "author_site": "Chulhee Yun, Suvrit Sra, Ali Jadbabaie", "tldr": "We constructively prove that even the slightest nonlinear activation functions introduce spurious local minima, for general datasets and activation functions.", "abstract": "We investigate the loss surface of neural networks. We prove that even for one-hidden-layer networks with \"slightest\" nonlinearity, the empirical risks have spurious local minima in most cases. Our results thus indicate that in general \"no spurious local minima\" is a property limited to deep linear networks, and insights obtained from linear networks may not be robust. Specifically, for ReLU(-like) networks we constructively prove that for almost all practical datasets there exist infinitely many local minima. We also present a counterexample for more general activations (sigmoid, tanh, arctan, ReLU, etc.), for which there exists a bad local minimum. Our results make the least restrictive assumptions relative to existing results on spurious local optima in neural networks. 
We complete our discussion by presenting a comprehensive characterization of global optimality for deep linear networks, which unifies other results on this topic.", "keywords": "spurious local minima;loss surface;optimization landscape;neural network", "primary_area": "", "supplementary_material": "", "author": "Chulhee Yun;Suvrit Sra;Ali Jadbabaie", "authorids": "chulheey@mit.edu;suvrit@mit.edu;jadbabai@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nyun2018small,\ntitle={Small nonlinearities in activation functions create bad local minima in neural networks},\nauthor={Chulhee Yun and Suvrit Sra and Ali Jadbabaie},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rke_YiRct7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "3;3;4", "wc_review": "279;249;396", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "329;280;334", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 308.0, 63.419239982831705 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 314.3333333333333, 24.36299561949547 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12267478949647511567&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rke_YiRct7", "pdf": "https://openreview.net/pdf?id=rke_YiRct7", "email": ";;", "author_num": 3 }, { "id": "rkelDoCqFX", "title": "Transfer Learning via Unsupervised Task Discovery for Visual Question Answering", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "We study how to leverage off-the-shelf visual and linguistic data to cope with out-of-vocabulary answers in visual question answering. Existing large-scale visual data with annotations such as image class labels, bounding boxes and region descriptions are good sources for learning rich and diverse visual concepts. However, it is not straightforward how the visual concepts should be captured and transferred to visual question answering models due to missing link between question dependent answering models and visual data without question or task specification. We tackle this problem in two steps: 1) learning a task conditional visual classifier based on unsupervised task discovery and 2) transferring and adapting the task conditional visual classifier to visual question answering models. Specifically, we employ linguistic knowledge sources such as structured lexical database (e.g. Wordnet) and visual descriptions for unsupervised task discovery, and adapt a learned task conditional visual classifier to answering unit in a visual question answering model. 
We empirically show that the proposed algorithm generalizes to unseen answers successfully using the knowledge transferred from the visual data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Hyeonwoo Noh;Taehoon Kim;Jonghwan Mun;Bohyung Han", "authorids": "shgusdngogo@postech.ac.kr;carpedm20@gmail.com;choco1916@postech.ac.kr;bhhan@snu.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkelDoCqFX", "pdf_size": 0, "rating": "4;5;8", "confidence": "5;5;5", "wc_review": "442;548;347", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.666666666666667, 1.699673171197595 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 445.6666666666667, 82.09885640020951 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9442012245217169205&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "title": "Information Theoretic lower bounds on negative log likelihood", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1056", "id": "rkemqsC9Fm", "author_site": "Luis Lastras", "tldr": "Use rate-distortion theory to bound how much a latent variable model can be improved", "abstract": "In this article we use rate-distortion theory, a branch of information theory devoted to the problem of lossy compression, to shed light on an important problem in latent variable modeling of data: is there room to improve the model? One way to address this question is to find an upper bound on the probability (equivalently a lower bound on the negative log likelihood) that the model can assign to some data as one varies the prior and/or the likelihood function in a latent variable model. The core of our contribution is to formally show that the problem of optimizing priors in latent variable models is exactly an instance of the variational optimization problem that information theorists solve when computing rate-distortion functions, and then to use this to derive a lower bound on negative log likelihood. Moreover, we will show that if changing the prior can improve the log likelihood, then there is a way to change the likelihood function instead and attain the same log likelihood, and thus rate-distortion theory is of relevance to both optimizing priors as well as optimizing likelihood functions. We will experimentally argue for the usefulness of quantities derived from rate-distortion theory in latent variable modeling by applying them to a problem in image modeling.", "keywords": "latent variable modeling;rate-distortion theory;log likelihood bounds", "primary_area": "", "supplementary_material": "", "author": "Luis A. 
Lastras-Monta\u00f1o", "authorids": "lastrasl@us.ibm.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\nlastras-monta\u00f1o2018information,\ntitle={Information Theoretic lower bounds on negative log likelihood},\nauthor={Luis A. Lastras-Monta\u00f1o},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkemqsC9Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;3", "wc_review": "388;314;845", "wc_reply_reviewers": "0;59;110", "wc_reply_authors": "507;569;648", "reply_reviewers": "0;1;1", "reply_authors": "1;2;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 515.6666666666666, 234.8252305202507 ], "wc_reply_reviewers_avg": [ 56.333333333333336, 44.94688223027513 ], "wc_reply_authors_avg": [ 574.6666666666666, 57.70230112877264 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13031039824552411051&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rkemqsC9Fm", "pdf": "https://openreview.net/pdf?id=rkemqsC9Fm", "email": "", "author_num": 1 }, { "id": "rkeqCoA5tX", "title": "LEARNING GENERATIVE MODELS FOR DEMIXING OF STRUCTURED SIGNALS FROM THEIR SUPERPOSITION USING GANS", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recently, Generative Adversarial Networks (GANs) have emerged as a popular alternative for modeling complex high dimensional distributions. Most of the existing works implicitly assume that the clean samples from the target distribution are easily available. However, in many applications, this assumption is violated. In this paper, we consider the problem of learning GANs under the observation setting when the samples from the target distribution are given by the superposition of two structured components. We propose two novel frameworks: denoising-GAN and demixing-GAN. The denoising-GAN assumes access to clean samples from the second component and tries to learn the other distribution, whereas the demixing-GAN learns the distribution of the components at the same time. Through comprehensive numerical experiments, we demonstrate that the proposed frameworks can generate clean samples from unknown distributions, and provide competitive performance in tasks such as denoising, demixing, and compressive sensing.", "keywords": "Generative Models;GANs;Denosing;Demixing;Structured Recovery", "primary_area": "", "supplementary_material": "", "author": "Mohammadreza Soltani;Swayambhoo Jain;Abhinav V. 
Sambasivan", "authorids": "msoltani@iastate.edu;swayambhoo.jain@technicolor.com;samba014@umn.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsoltani2019learning,\ntitle={{LEARNING} {GENERATIVE} {MODELS} {FOR} {DEMIXING} {OF} {STRUCTURED} {SIGNALS} {FROM} {THEIR} {SUPERPOSITION} {USING} {GANS}},\nauthor={Mohammadreza Soltani and Swayambhoo Jain and Abhinav V. Sambasivan},\nyear={2019},\nurl={https://openreview.net/forum?id=rkeqCoA5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkeqCoA5tX", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;4", "wc_review": "416;196;492", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "772;365;407", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 368.0, 125.51759504813127 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 514.6666666666666, 182.76821994598026 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6pw9s14fCbsJ:scholar.google.com/&scioq=LEARNING+GENERATIVE+MODELS+FOR+DEMIXING+OF+STRUCTURED+SIGNALS+FROM+THEIR+SUPERPOSITION+USING+GANS&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rket4i0qtX", "title": "The meaning of \"most\" for visual question answering models", "track": "main", "status": "Reject", "tldr": "Psychology-inspired evaluation of quantifier understanding for visual question answering models", "abstract": "The correct interpretation of quantifier statements in the context of a visual scene requires non-trivial inference mechanisms. For the example of \"most\", we discuss two strategies which rely on fundamentally different cognitive concepts. Our aim is to identify what strategy deep learning models for visual question answering learn when trained on such questions. To this end, we carefully design data to replicate experiments from psycholinguistics where the same question was investigated for humans. Focusing on the FiLM visual question answering model, our experiments indicate that a form of approximate number system emerges whose performance declines with more difficult scenes as predicted by Weber's law. 
Moreover, we identify confounding factors, like spatial arrangement of the scene, which impede the effectiveness of this system.", "keywords": "quantifier;evaluation methodology;psycholinguistics;visual question answering", "primary_area": "", "supplementary_material": "", "author": "Alexander Kuhnle;Ann Copestake", "authorids": "aok25@cam.ac.uk;aac10@cam.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkuhnle2019the,\ntitle={The meaning of \"most\" for visual question answering models},\nauthor={Alexander Kuhnle and Ann Copestake},\nyear={2019},\nurl={https://openreview.net/forum?id=rket4i0qtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rket4i0qtX", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;5;4", "wc_review": "628;595;453", "wc_reply_reviewers": "0;231;0", "wc_reply_authors": "845;572;273", "reply_reviewers": "0;1;0", "reply_authors": "2;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 558.6666666666666, 75.92247507966414 ], "wc_reply_reviewers_avg": [ 77.0, 108.89444430272832 ], "wc_reply_authors_avg": [ 563.3333333333334, 233.59842084702163 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Preferences Implicit in the State of the World", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1092", "id": "rkevMnRqYQ", "author_site": "Rohin Shah, Dmitrii Krasheninnikov, Jordan Alexander, Pieter Abbeel, Anca Dragan", "tldr": "When a robot is deployed in an environment that humans have been acting in, the state of the environment is already optimized for what humans want, and we can use this to infer human preferences.", "abstract": "Reinforcement learning (RL) agents optimize only the features specified in a reward function and are indifferent to anything left out inadvertently. This means that we must not only specify what to do, but also the much larger space of what not to do. It is easy to forget these preferences, since these preferences are already satisfied in our environment. This motivates our key insight: when a robot is deployed in an environment that humans act in, the state of the environment is already optimized for what humans want. We can therefore use this implicit preference information from the state to fill in the blanks. We develop an algorithm based on Maximum Causal Entropy IRL and use it to evaluate the idea in a suite of proof-of-concept environments designed to show its properties. We find that information from the initial state can be used to infer both side effects that should be avoided as well as preferences for how the environment should be organized. 
Our code can be found at https://github.com/HumanCompatibleAI/rlsp.", "keywords": "Preference learning;Inverse reinforcement learning;Inverse optimal stochastic control;Maximum entropy reinforcement learning;Apprenticeship learning", "primary_area": "", "supplementary_material": "", "author": "Rohin Shah;Dmitrii Krasheninnikov;Jordan Alexander;Pieter Abbeel;Anca Dragan", "authorids": "rohinmshah@berkeley.edu;dmitrii.krasheninnikov@student.uva.nl;jfalex@stanford.edu;pabbeel@cs.berkeley.edu;anca@berkeley.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nshah2018the,\ntitle={The Implicit Preference Information in an Initial State},\nauthor={Rohin Shah and Dmitrii Krasheninnikov and Jordan Alexander and Pieter Abbeel and Anca Dragan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkevMnRqYQ},\n}", "github": "[![github](/images/github_icon.svg) HumanCompatibleAI/rlsp](https://github.com/HumanCompatibleAI/rlsp)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7;7", "confidence": "3;4;4;3", "wc_review": "597;226;650;378", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "1284;661;722;774", "reply_reviewers": "0;0;0;0", "reply_authors": "2;1;1;1", "rating_avg": [ 6.5, 0.5 ], "confidence_avg": [ 3.5, 0.5 ], "wc_review_avg": [ 462.75, 170.52767370723146 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 860.25, 247.89955122992862 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.25, 0.4330127018922193 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 72, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9659325123261489202&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rkevMnRqYQ", "pdf": "https://openreview.net/pdf?id=rkevMnRqYQ", "email": ";;;;", "author_num": 5 }, { "id": "rkg5fh0ctQ", "title": "Transferring SLU Models in Novel Domains", "track": "main", "status": "Reject", "tldr": "v3", "abstract": "Spoken language understanding (SLU) is a critical component in building dialogue systems. When building models for novel natural language domains, a major challenge is the lack of data in the new domains, no matter whether the data is annotated or not. Recognizing and annotating ``intent'' and ``slot'' of natural languages is a time-consuming process. Therefore, spoken language understanding in low resource domains remains a crucial problem to address. In this paper, we address this problem by proposing a transfer-learning method, whereby a SLU model is transferred to a novel but data-poor domain via a deep neural network framework. We also introduce meta-learning in our work to bridge the semantic relations between seen and unseen data, allowing new intents to be recognized and new slots to be filled with much lower new training effort. We show the performance improvement with extensive experimental results for spoken language understanding in low resource domains. We show that our method can also handle novel intent recognition and slot-filling tasks. 
Our methodology provides a feasible solution for alleviating data shortages in spoken language understanding.", "keywords": "transfer learning;semantic representation;spoken language understanding", "primary_area": "", "supplementary_material": "", "author": "Yaohua Tang;Kaixiang Mo;Qian Xu;Chao Zhang;Qiang Yang", "authorids": "yaohuatang@webank.com;kxmo@connect.ust.hk;fleurxq@outlook.com;carlzzhang@webank.com;qyang@cse.ust.hk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntang2019transferring,\ntitle={Transferring {SLU} Models in Novel Domains},\nauthor={Yaohua Tang and Kaixiang Mo and Qian Xu and Chao Zhang and Qiang Yang},\nyear={2019},\nurl={https://openreview.net/forum?id=rkg5fh0ctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkg5fh0ctQ", "pdf_size": 0, "rating": "4;5;6", "confidence": "3;3;4", "wc_review": "301;284;240", "wc_reply_reviewers": "0;0;25", "wc_reply_authors": "546;307;234", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 275.0, 25.703436864876 ], "wc_reply_reviewers_avg": [ 8.333333333333334, 11.785113019775793 ], "wc_reply_authors_avg": [ 362.3333333333333, 133.24747235459623 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5TBh0uQrT-kJ:scholar.google.com/&scioq=Transferring+SLU+Models+in+Novel+Domains&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "A Kernel Random Matrix-Based Approach for Sparse PCA", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/785", "id": "rkgBHoCqYX", "author_site": "Mohamed El Amine Seddik, mohamed Tamaazousti, Romain Couillet", "tldr": "", "abstract": "In this paper, we present a random matrix approach to recover sparse principal components from n p-dimensional vectors. Specifically, considering the large dimensional setting where n, p \u2192 \u221e with p/n \u2192 c \u2208 (0, \u221e) and under Gaussian vector observations, we study kernel random matrices of the type f (\u0108), where f is a three-times continuously differentiable function applied entry-wise to the sample covariance matrix \u0108 of the data. 
Then, assuming that the principal components are sparse, we show that taking f in such a way that f'(0) = f''(0) = 0 allows for powerful recovery of the principal components, thereby generalizing previous ideas involving more specific f functions such as the soft-thresholding function.", "keywords": "Random Matrix Theory;Concentration of Measure;Sparse PCA;Covariance Thresholding", "primary_area": "", "supplementary_material": "", "author": "Mohamed El Amine Seddik;Mohamed Tamaazousti;Romain Couillet", "authorids": "melaseddik@gmail.com;mohamed.tamaazousti@cea.fr;romain.couillet@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nseddik2018a,\ntitle={A Kernel Random Matrix-Based Approach for Sparse {PCA}},\nauthor={Mohamed El Amine Seddik and Mohamed Tamaazousti and Romain Couillet},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgBHoCqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;4;2", "wc_review": "369;200;302", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "615;421;325", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 290.3333333333333, 69.48541013926751 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 453.6666666666667, 120.62430195537806 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14436367511979422240&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=rkgBHoCqYX", "pdf": "https://openreview.net/pdf?id=rkgBHoCqYX", "email": ";;", "author_num": 3 }, { "title": "Bayesian Prediction of Future Street Scenes using Synthetic Likelihoods", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/670", "id": "rkgK3oC5Fm", "author_site": "Apratim Bhattacharyya, Mario Fritz, Bernt Schiele", "tldr": "Dropout based Bayesian inference is extended to deal with multi-modality and is evaluated on scene anticipation tasks.", "abstract": "For autonomous agents to successfully operate in the real world, the ability to anticipate future scene states is a key competence. In real-world scenarios, future states become increasingly uncertain and multi-modal, particularly on long time horizons. Dropout based Bayesian inference provides a computationally tractable, theoretically well grounded approach to learn different hypotheses/models to deal with uncertain futures and make predictions that correspond well to observations -- are well calibrated. However, it turns out that such approaches fall short to capture complex real-world scenes, even falling behind in accuracy when compared to the plain deterministic approaches. This is because the used log-likelihood estimate discourages diversity. In this work, we propose a novel Bayesian formulation for anticipating future scene states which leverages synthetic likelihoods that encourage the learning of diverse models to accurately capture the multi-modal nature of future scene states. 
We show that our approach achieves accurate state-of-the-art predictions and calibrated probabilities through extensive experiments for scene anticipation on Cityscapes dataset. Moreover, we show that our approach generalizes across diverse tasks such as digit generation and precipitation forecasting.", "keywords": "bayesian inference;segmentation;anticipation;multi-modality", "primary_area": "", "supplementary_material": "", "author": "Apratim Bhattacharyya;Mario Fritz;Bernt Schiele", "authorids": "abhattac@mpi-inf.mpg.de;mfritz@mpi-inf.mpg.de;schiele@mpi-inf.mpg.de", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nbhattacharyya2018bayesian,\ntitle={Bayesian Prediction of Future Street Scenes using Synthetic Likelihoods},\nauthor={Apratim Bhattacharyya and Mario Fritz and Bernt Schiele},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgK3oC5Fm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;8", "confidence": "2;4;4", "wc_review": "139;829;335", "wc_reply_reviewers": "0;125;33", "wc_reply_authors": "42;1565;409", "reply_reviewers": "0;1;1", "reply_authors": "1;3;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 434.3333333333333, 290.31630260038025 ], "wc_reply_reviewers_avg": [ 52.666666666666664, 52.891923348991156 ], "wc_reply_authors_avg": [ 672.0, 648.9781711788669 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8839058001244993401&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=rkgK3oC5Fm", "pdf": "https://openreview.net/pdf?id=rkgK3oC5Fm", "email": ";;", "author_num": 3 }, { "title": "There Are Many Consistent Explanations of Unlabeled Data: Why You Should Average", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/903", "id": "rkgKBhA5Y7", "author_site": "Ben Athiwaratkun, Marc A Finzi, Pavel Izmailov, Andrew G Wilson", "tldr": "Consistency-based models for semi-supervised learning do not converge to a single point but continue to explore a diverse set of plausible solutions on the perimeter of a flat region. Weight averaging helps improve generalization performance.", "abstract": "Presently the most successful approaches to semi-supervised learning are based on consistency regularization, whereby a model is trained to be robust to small perturbations of its inputs and parameters. To understand consistency regularization, we conceptually explore how loss geometry interacts with training procedures. The consistency loss dramatically improves generalization performance over supervised-only training; however, we show that SGD struggles to converge on the consistency loss and continues to make large steps that lead to changes in predictions on the test data. 
Motivated by these observations, we propose to train consistency-based methods with Stochastic Weight Averaging (SWA), a recent approach which averages weights along the trajectory of SGD with a modified learning rate schedule. We also propose fast-SWA, which further accelerates convergence by averaging multiple points within each cycle of a cyclical learning rate schedule. With weight averaging, we achieve the best known semi-supervised results on CIFAR-10 and CIFAR-100, over many different quantities of labeled training data. For example, we achieve 5.0% error on CIFAR-10 with only 4000 labels, compared to the previous best result in the literature of 6.3%.", "keywords": "semi-supervised learning;computer vision;classification;consistency regularization;flatness;weight averaging;stochastic weight averaging", "primary_area": "", "supplementary_material": "", "author": "Ben Athiwaratkun;Marc Finzi;Pavel Izmailov;Andrew Gordon Wilson", "authorids": "pa338@cornell.edu;maf388@cornell.edu;izmailovpavel@gmail.com;andrew@cornell.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nathiwaratkun2018there,\ntitle={There Are Many Consistent Explanations of Unlabeled Data: Why You Should Average},\nauthor={Ben Athiwaratkun and Marc Finzi and Pavel Izmailov and Andrew Gordon Wilson},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgKBhA5Y7},\n}", "github": "[![github](/images/github_icon.svg) benathi/fastswa-semi-sup](https://github.com/benathi/fastswa-semi-sup) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rkgKBhA5Y7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;8", "confidence": "1;4;4", "wc_review": "299;445;392", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "404;1353;489", "reply_reviewers": "0;0;0", "reply_authors": "1;3;1", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "wc_review_avg": [ 378.6666666666667, 60.34530268012214 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 748.6666666666666, 428.73483128334175 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.49999999999999994, "gs_citation": 294, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16133183473908875555&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rkgKBhA5Y7", "pdf": "https://openreview.net/pdf?id=rkgKBhA5Y7", "email": ";;;", "author_num": 4 }, { "id": "rkgMNnC9YQ", "title": "ATTENTIVE EXPLAINABILITY FOR PATIENT TEMPORAL EMBEDDING", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning explainable patient temporal embeddings from observational data has mostly ignored the use of RNN architecture that excel in capturing temporal data dependencies but at the expense of explainability. This paper addresses this problem by introducing and applying an information theoretic approach to estimate the degree of explainability of such architectures. 
Using a communication paradigm, we formalize metrics of explainability by estimating the amount of information that an AI model needs to convey to a human end user to explain and rationalize its outputs. A key aspect of this work is to model human prior knowledge at the receiving end and measure the lack of explainability as a deviation from human prior knowledge. We apply this paradigm to medical concept representation problems by regularizing loss functions of temporal autoencoders according to the derived explainability metrics to guide the learning process towards models producing explainable outputs. We illustrate the approach with convincing experimental results for the generation of explainable temporal embeddings for critical care patient data.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Daby Sow;Mohamed Ghalwash;Zach Shahn;Sanjoy Dey;Moulay Draidia;Li-wei Lehmann", "authorids": "sowdaby@us.ibm.com;mohamed.ghalwash@ibm.com;zach.shahn@ibm.com;deysa@us.ibm.com;mzdraidia@berkeley.edu;lilehman@mit.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nsow2019attentive,\ntitle={{ATTENTIVE} {EXPLAINABILITY} {FOR} {PATIENT} {TEMPORAL} {EMBEDDING}},\nauthor={Daby Sow and Mohamed Ghalwash and Zach Shahn and Sanjoy Dey and Moulay Draidia and Li-wei Lehmann},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgMNnC9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkgMNnC9YQ", "pdf_size": 0, "rating": "2;3;4", "confidence": "3;4;4", "wc_review": "196;393;264", "wc_reply_reviewers": "31;0;0", "wc_reply_authors": "242;516;50", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 284.3333333333333, 81.69999320005412 ], "wc_reply_reviewers_avg": [ 10.333333333333334, 14.613540144521982 ], "wc_reply_authors_avg": [ 269.3333333333333, 191.2229646831735 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:w9EJVZd5sjIJ:scholar.google.com/&scioq=ATTENTIVE+EXPLAINABILITY+FOR+PATIENT+TEMPORAL+EMBEDDING&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "Large-Scale Answerer in Questioner's Mind for Visual Dialog Question Generation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1039", "id": "rkgT3jRct7", "author_site": "Sang-Woo Lee, Tong Gao, Sohee Yang, Jaejun Yoo, Jung-Woo Ha", "tldr": "", "abstract": "Answerer in Questioner's Mind (AQM) is an information-theoretic framework that has been recently proposed for task-oriented dialog systems. AQM benefits from asking a question that would maximize the information gain when it is asked. However, due to its intrinsic nature of explicitly calculating the information gain, AQM has a limitation when the solution space is very large. To address this, we propose AQM+ that can deal with a large-scale problem and ask a question that is more coherent to the current context of the dialog. 
We evaluate our method on GuessWhich, a challenging task-oriented visual dialog problem, where the number of candidate classes is near 10K. Our experimental results and ablation studies show that AQM+ outperforms the state-of-the-art models by a remarkable margin with a reasonable approximation. In particular, the proposed AQM+ reduces more than 60% of error as the dialog proceeds, while the comparative algorithms diminish the error by less than 6%. Based on our results, we argue that AQM+ is a general task-oriented dialog algorithm that can be applied for non-yes-or-no responses. ", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Sang-Woo Lee;Tong Gao;Sohee Yang;Jaejun Yoo;Jung-Woo Ha", "authorids": "sang.woo.lee@navercorp.com;tong.gao@navercorp.com;sh.yang@navercorp.com;jaejun.yoo@navercorp.com;jungwoo.ha@navercorp.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nlee2018largescale,\ntitle={Large-Scale Answerer in Questioner's Mind for Visual Dialog Question Generation},\nauthor={Sang-Woo Lee and Tong Gao and Sohee Yang and Jaejun Yoo and Jung-Woo Ha},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgT3jRct7},\n}", "github": "[![github](/images/github_icon.svg) naver/aqm-plus](https://github.com/naver/aqm-plus)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "2;4;5", "wc_review": "275;720;823", "wc_reply_reviewers": "30;301;12", "wc_reply_authors": "375;1062;1422", "reply_reviewers": "1;1;1", "reply_authors": "2;3;3", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 606.0, 237.79963554780034 ], "wc_reply_reviewers_avg": [ 114.33333333333333, 132.19766345220413 ], "wc_reply_authors_avg": [ 953.0, 434.32936810673993 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 18, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.7559289460184545, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7353352535802475325&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rkgT3jRct7", "pdf": "https://openreview.net/pdf?id=rkgT3jRct7", "email": ";;;;", "author_num": 5 }, { "title": "Graph HyperNetworks for Neural Architecture Search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/740", "id": "rkgW0oA9FX", "author_site": "Chris Zhang, Mengye Ren, Raquel Urtasun", "tldr": "", "abstract": "Neural architecture search (NAS) automatically finds the best task-specific neural network topology, outperforming many manual architecture designs. However, it can be prohibitively expensive as the search requires training thousands of different networks, while each training run can last for hours. In this work, we propose the Graph HyperNetwork (GHN) to amortize the search cost: given an architecture, it directly generates the weights by running inference on a graph neural network. GHNs model the topology of an architecture and therefore can predict network performance more accurately than regular hypernetworks and premature early stopping. 
To perform NAS, we randomly sample architectures and use the validation accuracy of networks with GHN generated weights as the surrogate search signal. GHNs are fast - they can search nearly 10\u00d7 faster than other random search methods on CIFAR-10 and ImageNet. GHNs can be further extended to the anytime prediction setting, where they have found networks with better speed-accuracy tradeoff than the state-of-the-art manual designs.", "keywords": "neural;architecture;search;graph;network;hypernetwork;meta;learning;anytime;prediction", "primary_area": "", "supplementary_material": "", "author": "Chris Zhang;Mengye Ren;Raquel Urtasun", "authorids": "cjzhang@edu.uwaterloo.ca;mren@cs.toronto.edu;urtasun@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhang2018graph,\ntitle={Graph HyperNetworks for Neural Architecture Search},\nauthor={Chris Zhang and Mengye Ren and Raquel Urtasun},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgW0oA9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "345;906;329", "wc_reply_reviewers": "157;0;0", "wc_reply_authors": "431;612;200", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 526.6666666666666, 268.30869452098557 ], "wc_reply_reviewers_avg": [ 52.333333333333336, 74.01050976419197 ], "wc_reply_authors_avg": [ 414.3333333333333, 168.61066263897888 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 331, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18192411628962893749&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rkgW0oA9FX", "pdf": "https://openreview.net/pdf?id=rkgW0oA9FX", "email": ";;", "author_num": 3 }, { "id": "rkgWBi09Ym", "title": "Multi-Modal Generative Adversarial Networks for Diverse Datasets", "track": "main", "status": "Withdraw", "tldr": "Multi modal Guassian distribution of latent space in GAN models improves performance and allows to trade-off quality vs. diversity", "abstract": "Generative Adversarial Networks (GANs) have been shown to produce realistically looking synthetic images with remarkable success, yet their performance seems less impressive when the training set is highly diverse. In order to provide a better fit to the target data distribution when the dataset includes many different classes, we propose a variant of the basic GAN model, a Multi-Modal Gaussian-Mixture GAN (GM-GAN), where the probability distribution over the latent space is a mixture of Gaussians. We also propose a supervised variant which is capable of conditional sample synthesis. In order to evaluate the model's performance, we propose a new scoring method which separately takes into account two (typically conflicting) measures - diversity vs. quality of the generated data. 
Through a series of experiments, using both synthetic and real-world datasets, we quantitatively show that GM-GANs outperform baselines, both when evaluated using the commonly used Inception Score, and when evaluated using our own alternative scoring method. In addition, we qualitatively demonstrate how the unsupervised variant of GM-GAN tends to map latent vectors sampled from different Gaussians in the latent space to samples of different classes in the data space. We show how this phenomenon can be exploited for the task of unsupervised clustering, and provide quantitative evaluation showing the superiority of our method for the unsupervised clustering of image datasets. Finally, we demonstrate a feature which further sets our model apart from other GAN models: the option to control the quality-diversity trade-off by altering, post-training, the probability distribution of the latent space. This allows one to sample higher quality and lower diversity samples, or vice versa, according to one's needs.", "keywords": "generative adversarial networks;generative models;clustering;visual object recognition", "primary_area": "", "supplementary_material": "", "author": "Matan Ben-Yosef;Daphna Weinshall", "authorids": "matan.benyosef@mail.huji.ac.il;daphna@cs.huji.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkgWBi09Ym", "pdf_size": 0, "rating": "4;6", "confidence": "4;4", "wc_review": "400;198", "wc_reply_reviewers": "0;0", "wc_reply_authors": "0;0", "reply_reviewers": "0;0", "reply_authors": "0;0", "rating_avg": [ 5.0, 1.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 299.0, 101.0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 3, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HC0glpYF0w4J:scholar.google.com/&scioq=Multi-Modal+Generative+Adversarial+Networks+for+Diverse+Datasets&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rkgZ3oR9FX", "title": "Learning to Refer to 3D Objects with Natural Language", "track": "main", "status": "Reject", "tldr": "How to build neural-speakers/listeners that learn fine-grained characteristics of 3D objects, from referential language.", "abstract": "Human world knowledge is both structured and flexible. When people see an object, they represent it not as a pixel array but as a meaningful arrangement of semantic parts. Moreover, when people refer to an object, they provide descriptions that are not merely true but also relevant in the current context. Here, we combine these two observations in order to learn fine-grained correspondences between language and contextually relevant geometric properties of 3D objects. To do this, we employed an interactive communication task with human participants to construct a large dataset containing natural utterances referring to 3D objects from ShapeNet in a wide variety of contexts. Using this dataset, we developed neural listener and speaker models with strong capacity for generalization. 
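A NumPy sketch of the mixture-of-Gaussians latent sampling described in the GM-GAN entry above, including the post-training quality-diversity knob: rescaling each component's standard deviation concentrates samples near the component means (higher quality, lower diversity) or spreads them out. The component count, means and scales below are illustrative placeholders, not values from the paper.

import numpy as np

def sample_gm_latent(n, means, stds, weights, sigma_scale=1.0, rng=None):
    """Draw n latent vectors from a Gaussian mixture.
    means: (K, D) component means; stds: (K, D) component std devs;
    weights: (K,) mixing probabilities. sigma_scale < 1 trades diversity for quality."""
    rng = np.random.default_rng() if rng is None else rng
    ks = rng.choice(len(weights), size=n, p=weights)
    eps = rng.standard_normal((n, means.shape[1]))
    return means[ks] + sigma_scale * stds[ks] * eps, ks

# Example: 10 components in a 128-dim latent space, as stand-ins for class modes.
K, D = 10, 128
means = np.random.default_rng(0).standard_normal((K, D))
stds = np.ones((K, D))
weights = np.full(K, 1.0 / K)
z, components = sample_gm_latent(64, means, stds, weights, sigma_scale=0.7)
# z would then be fed to the generator, e.g. fake_images = G(z)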
By performing targeted lesions of visual and linguistic input, we discovered that the neural listener depends heavily on part-related words and associates these words correctly with the corresponding geometric properties of objects, suggesting that it has learned task-relevant structure linking the two input modalities. We further show that a neural speaker that is `listener-aware' --- that plans its utterances according to how an imagined listener would interpret its words in context --- produces more discriminative referring expressions than an `listener-unaware' speaker, as measured by human performance in identifying the correct object.", "keywords": "Referential Language;3D Objects;Part-Awareness;Neural Speakers;Neural Listeners", "primary_area": "", "supplementary_material": "", "author": "Panos Achlioptas;Judy E. Fan;Robert X.D. Hawkins;Noah D. Goodman;Leo Guibas", "authorids": "optas@cs.stanford.edu;jefan@stanford.edu;rxdh@stanford.edu;ngoodman@stanford.edu;guibas@cs.stanford.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nachlioptas2019learning,\ntitle={Learning to Refer to 3D Objects with Natural Language},\nauthor={Panos Achlioptas and Judy E. Fan and Robert X.D. Hawkins and Noah D. Goodman and Leo Guibas},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgZ3oR9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkgZ3oR9FX", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;3;4", "wc_review": "417;144;323", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 294.6666666666667, 113.23819536220492 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ZD3pw10cCSwJ:scholar.google.com/&scioq=Learning+to+Refer+to+3D+Objects+with+Natural+Language&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "DELTA: DEEP LEARNING TRANSFER USING FEATURE MAP WITH ATTENTION FOR CONVOLUTIONAL NETWORKS", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/644", "id": "rkgbwsAcYm", "author_site": "Xingjian Li, Haoyi Xiong, Hanchao Wang, Yuxuan Rao, Liping Liu, Luke Huan", "tldr": "improving deep transfer learning with regularization using attention based feature maps", "abstract": "Transfer learning through fine-tuning a pre-trained neural network with an extremely large dataset, such as ImageNet, can significantly accelerate training while the accuracy is frequently bottlenecked by the limited dataset size of the new target task. To solve the problem, some regularization methods, constraining the outer layer weights of the target network using the starting point as references (SPAR), have been studied. In this paper, we propose a novel regularized transfer learning framework DELTA, namely DEep Learning Transfer using Feature Map with Attention. 
Instead of constraining the weights of the neural network, DELTA aims to preserve the outer layer outputs of the target network. Specifically, in addition to minimizing the empirical loss, DELTA intends to align the outer layer outputs of two networks, through constraining a subset of feature maps that are precisely selected by attention that has been learned in a supervised learning manner. We evaluate DELTA against state-of-the-art algorithms, including L2 and L2-SP. The experimental results show that our proposed method outperforms these baselines with higher accuracy for new tasks.", "keywords": "transfer learning;deep learning;regularization;attention;cnn", "primary_area": "", "supplementary_material": "", "author": "Xingjian Li;Haoyi Xiong;Hanchao Wang;Yuxuan Rao;Liping Liu;Jun Huan", "authorids": "1762778193@qq.com;xhyccc@gmail.com;wanghanchao01@baidu.com;yrao4@illinois.edu;liuliping@baidu.com;huanjun@baidu.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nli2018delta,\ntitle={{DELTA}: {DEEP} {LEARNING} {TRANSFER} {USING} {FEATURE} {MAP} {WITH} {ATTENTION} {FOR} {CONVOLUTIONAL} {NETWORKS}},\nauthor={Xingjian Li and Haoyi Xiong and Hanchao Wang and Yuxuan Rao and Liping Liu and Jun Huan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgbwsAcYm},\n}", "github": "[![github](/images/github_icon.svg) lixingjian/DELTA](https://github.com/lixingjian/DELTA) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rkgbwsAcYm)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;4;3", "wc_review": "275;148;126", "wc_reply_reviewers": "0;0;118", "wc_reply_authors": "552;456;763", "reply_reviewers": "0;0;2", "reply_authors": "1;1;3", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 183.0, 65.67089664887078 ], "wc_reply_reviewers_avg": [ 39.333333333333336, 55.62573345334173 ], "wc_reply_authors_avg": [ 590.3333333333334, 128.2298283898442 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 207, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1065820725505324380&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rkgbwsAcYm", "pdf": "https://openreview.net/pdf?id=rkgbwsAcYm", "email": ";;;;;", "author_num": 6 }, { "id": "rkgd0iA9FQ", "title": "Convergence Guarantees for RMSProp and ADAM in Non-Convex Optimization and an Empirical Comparison to Nesterov Acceleration", "track": "main", "status": "Reject", "tldr": "In this paper we prove convergence to criticality of (stochastic and deterministic) RMSProp and deterministic ADAM for smooth non-convex objectives and we demonstrate an interesting beta_1 sensitivity for ADAM on autoencoders. ", "abstract": "RMSProp and ADAM continue to be extremely popular algorithms for training neural nets but their theoretical convergence properties have remained unclear.
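A hedged PyTorch sketch of the kind of feature-map regularizer the DELTA entry above describes: the discrepancy between corresponding feature maps of the frozen pre-trained network and the network being fine-tuned is penalized, weighted per channel by attention values. Here the attention weights are passed in as a fixed tensor, whereas the paper learns them in a supervised manner, so this approximates the idea rather than reproducing the method.

import torch

def feature_alignment_loss(target_feats, source_feats, channel_attention):
    """target_feats, source_feats: (B, C, H, W) activations from the same layer of
    the fine-tuned and the frozen pre-trained network.
    channel_attention: (C,) non-negative weights, larger = preserve that map more."""
    diff = (target_feats - source_feats).pow(2).mean(dim=(2, 3))   # (B, C) per-channel discrepancy
    return (diff * channel_attention).sum(dim=1).mean()            # attention-weighted sum, batch mean

# Usage inside a fine-tuning step (sketch):
# with torch.no_grad():
#     source_feats = source_layer(x)        # frozen pre-trained backbone
# target_feats = target_layer(x)            # network being fine-tuned
# loss = task_loss + beta * feature_alignment_loss(target_feats, source_feats, attn)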
Further, recent work has seemed to suggest that these algorithms have worse generalization properties when compared to carefully tuned stochastic gradient descent or its momentum variants. In this work, we make progress towards a deeper understanding of ADAM and RMSProp in two ways. First, we provide proofs that these adaptive gradient algorithms are guaranteed to reach criticality for smooth non-convex objectives, and we give bounds on the running time.\n\nNext we design experiments to empirically study the convergence and generalization properties of RMSProp and ADAM against Nesterov's Accelerated Gradient method on a variety of common autoencoder setups and on VGG-9 with CIFAR-10. Through these experiments we demonstrate the interesting sensitivity that ADAM has to its momentum parameter \\beta_1. We show that at very high values of the momentum parameter (\\beta_1 = 0.99) ADAM outperforms a carefully tuned NAG on most of our experiments, in terms of getting lower training and test losses. On the other hand, NAG can sometimes do better when ADAM's \\beta_1 is set to the most commonly used value: \\beta_1 = 0.9, indicating the importance of tuning the hyperparameters of ADAM to get better generalization performance.\n\nWe also report experiments on different autoencoders to demonstrate that NAG has better abilities in terms of reducing the gradient norms, and it also produces iterates which exhibit an increasing trend for the minimum eigenvalue of the Hessian of the loss function at the iterates. ", "keywords": "adaptive gradient descent;deeplearning;ADAM;RMSProp;autoencoders", "primary_area": "", "supplementary_material": "", "author": "Soham De;Anirbit Mukherjee;Enayat Ullah", "authorids": "sohamde@cs.umd.edu;amukhe14@jhu.edu;enayat@jhu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nde2019convergence,\ntitle={Convergence Guarantees for {RMSP}rop and {ADAM} in Non-Convex Optimization and an Empirical Comparison to Nesterov Acceleration},\nauthor={Soham De and Anirbit Mukherjee and Enayat Ullah},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgd0iA9FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkgd0iA9FQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;3", "wc_review": "77;299;448", "wc_reply_reviewers": "0;92;0", "wc_reply_authors": "209;756;431", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 274.6666666666667, 152.43432101145143 ], "wc_reply_reviewers_avg": [ 30.666666666666668, 43.36921591277491 ], "wc_reply_authors_avg": [ 465.3333333333333, 224.6275930413616 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13669345530223257280&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "rkgfWh0qKX", "title": "Do Language Models Have Common Sense?", "track": "main", "status": "Reject", "tldr": "We present evidence that LMs do capture common sense with state-of-the-art results on both Winograd Schema Challenge and Commonsense Knowledge Mining.", "abstract": "It has 
been argued that current machine learning models do not have commonsense, and therefore must be hard-coded with prior knowledge (Marcus, 2018). Here we show surprising evidence that language models can already learn to capture certain common sense knowledge. Our key observation is that a language model can compute the probability of any statement, and this probability can be used to evaluate the truthfulness of that statement. On the Winograd Schema Challenge (Levesque et al., 2011), language models are 11% higher in accuracy than previous state-of-the-art supervised methods. Language models can also be fine-tuned for the task of Mining Commonsense Knowledge on ConceptNet to achieve an F1 score of 0.912 and 0.824, outperforming previous best results (Jastrzebski et al., 2018). Further analysis demonstrates that language models can discover unique features of Winograd Schema contexts that decide the correct answers without explicit supervision.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Trieu H. Trinh;Quoc V. Le", "authorids": "thtrieu@google.com;qvl@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ntrinh2019do,\ntitle={Do Language Models Have Common Sense?},\nauthor={Trieu H. Trinh and Quoc V. Le},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgfWh0qKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkgfWh0qKX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "wc_review": "631;246;1334", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 737.0, 450.4538452124331 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6691265244874630262&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rkglvsC9Ym", "title": "Log Hyperbolic Cosine Loss Improves Variational Auto-Encoder", "track": "main", "status": "Reject", "tldr": "We propose to train VAE with a new reconstruction loss, the log hyperbolic cosine (log-cosh) loss, which can significantly improve the performance of VAE and its variants in output quality, measured by sharpness and FID score.", "abstract": "In Variational Auto-Encoder (VAE), the default choice of reconstruction loss function between the decoded sample and the input is the squared $L_2$. We propose to replace it with the log hyperbolic cosine (log-cosh) loss, which behaves as $L_2$ at small values and as $L_1$ at large values, and is differentiable everywhere. Compared with $L_2$, the log-cosh loss improves the reconstruction without damaging the latent space optimization, thus automatically keeping a balance between the reconstruction and the generation. Extensive experiments on MNIST and CelebA datasets show that the log-cosh reconstruction loss significantly improves the performance of VAE and its variants in output quality, measured by sharpness and FID score.
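A minimal sketch of the scoring idea in the "Do Language Models Have Common Sense?" entry above: substitute each candidate referent into the pronoun slot of a Winograd-style sentence and keep the version to which the language model assigns the higher probability. `next_token_logprobs` is an assumed generic interface (a callable returning log-probabilities of the next token given a prefix of token ids), not the API of any particular library.

def sentence_logprob(tokens, next_token_logprobs):
    """tokens: list of token ids for the full statement.
    Sums log p(token_i | token_<i) over the sentence (chain rule)."""
    total = 0.0
    for i in range(1, len(tokens)):
        total += next_token_logprobs(tokens[:i])[tokens[i]]
    return total

def resolve(candidates_tokenized, next_token_logprobs):
    """Return the index of the candidate substitution the LM finds most probable."""
    scores = [sentence_logprob(toks, next_token_logprobs) for toks in candidates_tokenized]
    return max(range(len(scores)), key=lambda i: scores[i])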
In addition, the gradient of the log-cosh is a simple tanh function, which makes the implementation of gradient descent as simple as adding a single line of code. ", "keywords": "Unsupervised Generative Model;VAE;log hyperbolic cosine loss", "primary_area": "", "supplementary_material": "", "author": "Pengfei Chen;Guangyong Chen;Shengyu Zhang", "authorids": "chenpf.cuhk@gmail.com;gycchen@tencent.com;shengyuzhang@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nchen2019log,\ntitle={Log Hyperbolic Cosine Loss Improves Variational Auto-Encoder},\nauthor={Pengfei Chen and Guangyong Chen and Shengyu Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=rkglvsC9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rkglvsC9Ym", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "wc_review": "313;209;789", "wc_reply_reviewers": "0;0;113", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;1", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 437.0, 252.49686466700268 ], "wc_reply_reviewers_avg": [ 37.666666666666664, 53.268710849386586 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=332921554686170495&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "textTOvec: DEEP CONTEXTUALIZED NEURAL AUTOREGRESSIVE TOPIC MODELS OF LANGUAGE WITH DISTRIBUTED COMPOSITIONAL PRIOR", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1069", "id": "rkgoyn09KQ", "author_site": "Pankaj Gupta, Yatin Chaudhary, Florian Buettner, Hinrich Schuetze", "tldr": "Unified neural model of topic and language modeling to introduce language structure in topic models for contextualized topic vectors ", "abstract": "We address two challenges of probabilistic topic modelling in order to better estimate\nthe probability of a word in a given context, i.e., P(word|context): (1) No\nLanguage Structure in Context: Probabilistic topic models ignore word order by\nsummarizing a given context as a \u201cbag-of-word\u201d and consequently the semantics\nof words in the context is lost. In this work, we incorporate language structure\nby combining a neural autoregressive topic model (TM) with an LSTM-based language\nmodel (LSTM-LM) in a single probabilistic framework. The LSTM-LM\nlearns a vector-space representation of each word by accounting for word order\nin local collocation patterns, while the TM simultaneously learns a latent representation\nfrom the entire document. In addition, the LSTM-LM models complex\ncharacteristics of language (e.g., syntax and semantics), while the TM discovers\nthe underlying thematic structure in a collection of documents. We unite two complementary\nparadigms of learning the meaning of word occurrences by combining\na topic model and a language model in a unified probabilistic framework, named\nas ctx-DocNADE.
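A numerically stable PyTorch sketch of the log-cosh reconstruction loss from the log-cosh VAE entry above, using the identity log cosh(t) = |t| + softplus(-2|t|) - log 2 so that large residuals do not overflow; as that abstract notes, the gradient with respect to the residual is simply tanh(t). This is a generic sketch, not the authors' reference implementation.

import math
import torch
import torch.nn.functional as F

def log_cosh_loss(x_recon, x, reduction="mean"):
    """Elementwise log(cosh(x_recon - x)), computed stably.
    Behaves like 0.5*t^2 near zero and like |t| - log 2 for large residuals t."""
    t = x_recon - x
    val = t.abs() + F.softplus(-2.0 * t.abs()) - math.log(2.0)
    return val.mean() if reduction == "mean" else val.sum()

# Drop-in replacement for the squared-L2 reconstruction term of a VAE (sketch):
# recon = log_cosh_loss(decoder(z), x, reduction="sum")
# loss = recon + kl_divergence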
(2) Limited Context and/or Smaller training corpus of documents:\nIn settings with a small number of word occurrences (i.e., lack of context)\nin short text or data sparsity in a corpus of few documents, the application of TMs\nis challenging. We address this challenge by incorporating external knowledge\ninto neural autoregressive topic models via a language modelling approach: we\nuse word embeddings as input of a LSTM-LM with the aim to improve the wordtopic\nmapping on a smaller and/or short-text corpus. The proposed DocNADE\nextension is named as ctx-DocNADEe.\n\nWe present novel neural autoregressive topic model variants coupled with neural\nlanguage models and embeddings priors that consistently outperform state-of-theart\ngenerative topic models in terms of generalization (perplexity), interpretability\n(topic coherence) and applicability (retrieval and classification) over 6 long-text\nand 8 short-text datasets from diverse domains.", "keywords": "neural topic model;natural language processing;text representation;language modeling;information retrieval;deep learning", "primary_area": "", "supplementary_material": "", "author": "Pankaj Gupta;Yatin Chaudhary;Florian Buettner;Hinrich Schuetze", "authorids": "pankaj_gupta96@yahoo.com;yatinchaudhary91@gmail.com;fbuettner.phys@gmail.com;hinrich@hotmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ngupta2018texttovec,\ntitle={text{TO}vec: {DEEP} {CONTEXTUALIZED} {NEURAL} {AUTOREGRESSIVE} {TOPIC} {MODELS} {OF} {LANGUAGE} {WITH} {DISTRIBUTED} {COMPOSITIONAL} {PRIOR}},\nauthor={Pankaj Gupta and Yatin Chaudhary and Florian Buettner and Hinrich Schuetze},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgoyn09KQ},\n}", "github": "[![github](/images/github_icon.svg) pgcool/textTOvec](https://github.com/pgcool/textTOvec)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "wc_review": "698;553;387", "wc_reply_reviewers": "0;303;0", "wc_reply_authors": "762;1763;965", "reply_reviewers": "0;2;0", "reply_authors": "1;5;3", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 546.0, 127.06166481935716 ], "wc_reply_reviewers_avg": [ 101.0, 142.8355697996826 ], "wc_reply_authors_avg": [ 1163.3333333333333, 432.05118009585647 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 3.0, 1.632993161855452 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16604775897027080889&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rkgoyn09KQ", "pdf": "https://openreview.net/pdf?id=rkgoyn09KQ", "email": ";;;", "author_num": 4 }, { "id": "rkgpCoRctm", "title": "Detecting Out-Of-Distribution Samples Using Low-Order Deep Features Statistics", "track": "main", "status": "Reject", "tldr": "Detecting out-of-distribution samples by using low-order feature statistics without requiring any change in underlying DNN.", "abstract": "The ability to detect when an input sample was not drawn from the training distribution is an important desirable property of deep neural networks. 
In this paper, we show that a simple ensembling of first and second order deep feature statistics can be exploited to effectively differentiate in-distribution and out-of-distribution samples. Specifically, we observe that the mean and standard deviation within feature maps differs greatly between in-distribution and out-of-distribution samples. Based on this observation, we propose a simple and efficient plug-and-play detection procedure that does not require re-training, pre-processing or changes to the model. The proposed method outperforms the state-of-the-art by a large margin in all standard benchmarking tasks, while being much simpler to implement and execute. Notably, our method improves the true negative rate from 39.6% to 95.3% when 95% of in-distribution (CIFAR-100) are correctly detected using a DenseNet and the out-of-distribution dataset is TinyImageNet resize. The source code of our method will be made publicly available.", "keywords": "computer vision;out-of-distribution detection;image classification", "primary_area": "", "supplementary_material": "", "author": "Igor M. Quintanilha;Roberto de M. E. Filho;Jos\u00e9 Lezama;Mauricio Delbracio;Leonardo O. Nunes", "authorids": "igormq@poli.ufrj.br;robertomest@poli.ufrj.br;jlezama@fing.edu.uy;mdelbra@fing.edu.uy;lnunes@microsoft.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nquintanilha2019detecting,\ntitle={Detecting Out-Of-Distribution Samples Using Low-Order Deep Features Statistics},\nauthor={Igor M. Quintanilha and Roberto de M. E. Filho and Jos\u00e9 Lezama and Mauricio Delbracio and Leonardo O. Nunes},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgpCoRctm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rkgpCoRctm", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;4;4", "wc_review": "243;611;360", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "319;2184;107", "reply_reviewers": "0;0;0", "reply_authors": "1;4;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 404.6666666666667, 153.51945226003843 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 870.0, 933.1605792502525 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 24, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2342305393478348598&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Amortized Bayesian Meta-Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/940", "id": "rkgpy3C5tX", "author_site": "Sachin Ravi, Alex Beatson", "tldr": "We propose a meta-learning method which efficiently amortizes hierarchical variational inference across training episodes.", "abstract": "Meta-learning, or learning-to-learn, has proven to be a successful strategy in attacking problems in supervised learning and reinforcement learning that involve small amounts of data. State-of-the-art solutions involve learning an initialization and/or learning algorithm using a set of training episodes so that the meta learner can generalize to an evaluation episode quickly. 
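A NumPy sketch of the low-order feature-statistics idea in the out-of-distribution entry above: record the per-channel spatial mean and standard deviation of feature maps on in-distribution data, then score a test input by how strongly its own statistics deviate. The aggregation used here (average absolute z-score) is a simplification; the paper's exact ensembling and thresholding may differ.

import numpy as np

def feature_stats(feat):
    """feat: (B, C, H, W) activations. Returns (B, 2C): per-channel spatial mean and std."""
    mean = feat.mean(axis=(2, 3))
    std = feat.std(axis=(2, 3))
    return np.concatenate([mean, std], axis=1)

def fit_reference(in_dist_feats):
    """Summarize the statistics observed on in-distribution data."""
    stats = feature_stats(in_dist_feats)
    return stats.mean(axis=0), stats.std(axis=0) + 1e-8

def ood_score(test_feats, ref_mean, ref_std):
    """Larger score = more out-of-distribution (mean absolute z-score of the statistics)."""
    stats = feature_stats(test_feats)
    return np.abs((stats - ref_mean) / ref_std).mean(axis=1)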
These methods perform well but often lack good quantification of uncertainty, which can be vital to real-world applications when data is lacking. We propose a meta-learning method which efficiently amortizes hierarchical variational inference across tasks, learning a prior distribution over neural network weights so that a few steps of Bayes by Backprop will produce a good task-specific approximate posterior. We show that our method produces good uncertainty estimates on contextual bandit and few-shot learning benchmarks.", "keywords": "variational inference;meta-learning;few-shot learning;uncertainty quantification", "primary_area": "", "supplementary_material": "", "author": "Sachin Ravi;Alex Beatson", "authorids": "sachinr@princeton.edu;abeatson@cs.princeton.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nravi2018amortized,\ntitle={Amortized Bayesian Meta-Learning},\nauthor={Sachin Ravi and Alex Beatson},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgpy3C5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;3;4", "wc_review": "263;1412;139", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "761;1599;397", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 604.6666666666666, 573.1110034035485 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 919.0, 503.271960938285 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 170, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15296806669050127926&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rkgpy3C5tX", "pdf": "https://openreview.net/pdf?id=rkgpy3C5tX", "email": ";", "author_num": 2 }, { "id": "rkgqCiRqKQ", "title": "Inferring Reward Functions from Demonstrators with Unknown Biases", "track": "main", "status": "Reject", "tldr": "When we infer preferences from behavior, we can try to improve accuracy by jointly learning a bias model and preferences, though this requires new assumptions to make progress.", "abstract": "Our goal is to infer reward functions from demonstrations. In order to infer the correct reward function, we must account for the systematic ways in which the demonstrator is suboptimal. Prior work in inverse reinforcement learning can account for specific, known biases, but cannot handle demonstrators with unknown biases. In this work, we explore the idea of learning the demonstrator's planning algorithm (including their unknown biases), along with their reward function. What makes this challenging is that any demonstration could be explained either by positing a term in the reward function, or by positing a particular systematic bias. We explore what assumptions are sufficient for avoiding this impossibility result: either access to tasks with known rewards which enable estimating the planner separately, or that the demonstrator is sufficiently close to optimal that this can serve as a regularizer. 
In our exploration with synthetic models of human biases, we find that it is possible to adapt to different biases and perform better than assuming a fixed model of the demonstrator, such as Boltzmann rationality.", "keywords": "Inverse reinforcement learning;differentiable planning", "primary_area": "", "supplementary_material": "", "author": "Rohin Shah;Noah Gundotra;Pieter Abbeel;Anca Dragan", "authorids": "rohinmshah@berkeley.edu;noah.gundotra@berkeley.edu;pabbeel@cs.berkeley.edu;anca@berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nshah2019inferring,\ntitle={Inferring Reward Functions from Demonstrators with Unknown Biases},\nauthor={Rohin Shah and Noah Gundotra and Pieter Abbeel and Anca Dragan},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgqCiRqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkgqCiRqKQ", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;4", "wc_review": "530;407;417", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 451.3333333333333, 55.77534301901593 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10613803224759792124&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rkgs0oAqFQ", "title": "Rethinking Knowledge Graph Propagation for Zero-Shot Learning", "track": "main", "status": "Withdraw", "tldr": "We rethink the way information can be exploited more efficiently in the knowledge graph in order to improve performance on the Zero-Shot Learning task and propose a dense graph propagation (DGP) module for this purpose.", "abstract": "Graph convolutional neural networks have recently shown great potential for the task of zero-shot learning. These models are highly sample efficient as related concepts in the graph structure share statistical strength allowing generalization to new classes when faced with a lack of data. However, we find that the extensive use of Laplacian smoothing at each layer in current approaches can easily dilute the knowledge from distant nodes and consequently decrease the performance in zero-shot learning. In order to still enjoy the benefit brought by the graph structure while preventing the dilution of knowledge from distant nodes, we propose a Dense Graph Propagation (DGP) module with carefully designed direct links among distant nodes. DGP allows us to exploit the hierarchical graph structure of the knowledge graph through additional connections. These connections are added based on a node's relationship to its ancestors and descendants. A weighting scheme is further used to weigh their contribution depending on the distance to the node. 
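A NumPy sketch of the distance-weighted dense propagation described in the zero-shot knowledge-graph entry above: each node aggregates features from its ancestors and descendants at every hop distance, with a per-distance weight that typically decays with distance. The normalization and the separation into ancestor/descendant passes are simplified relative to the paper.

import numpy as np

def dgp_layer(x, hops_adjacency, distance_weights, W):
    """x: (N, F) node features.
    hops_adjacency: dict {distance d: (N, N) 0/1 matrix linking each node to its
    ancestors/descendants exactly d hops away}.
    distance_weights: dict {d: scalar alpha_d}, usually decreasing in d.
    W: (F, F_out) learnable projection. Returns propagated node features."""
    out = np.zeros((x.shape[0], W.shape[1]))
    for d, A in hops_adjacency.items():
        deg = A.sum(axis=1, keepdims=True) + 1e-8        # simple row normalization
        out += distance_weights[d] * ((A / deg) @ x @ W)
    return np.maximum(out, 0.0)                          # ReLU nonlinearity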
Combined with finetuning of the representations in a two-stage training approach our method outperforms state-of-the-art zero-shot learning approaches.", "keywords": "Dense graph propagation;zero-shot learning", "primary_area": "", "supplementary_material": "", "author": "Michael Kampffmeyer;Yinbo Chen;Xiaodan Liang;Hao Wang;Yujia Zhang;Eric P. Xing", "authorids": "michael.c.kampffmeyer@uit.no;cyvius96@gmail.com;xdliang328@gmail.com;hwang87@mit.edu;zhangyujia2014@ia.ac.cn;epxing@cs.cmu.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkgs0oAqFQ", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;3;4", "wc_review": "317;340;126", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "498;401;9", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 261.0, 95.9201056435337 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 302.6666666666667, 211.3958897950058 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 399, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1266179195913592313&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 18 }, { "id": "rkgsvoA9K7", "title": "Dirichlet Variational Autoencoder", "track": "main", "status": "Reject", "tldr": "", "abstract": "This paper proposes Dirichlet Variational Autoencoder (DirVAE) using a Dirichlet prior for a continuous latent variable that exhibits the characteristic of the categorical probabilities. To infer the parameters of DirVAE, we utilize the stochastic gradient method by approximating the Gamma distribution, which is a component of the Dirichlet distribution, with the inverse Gamma CDF approximation. Additionally, we reshape the component collapsing issue by investigating two problem sources, which are decoder weight collapsing and latent value collapsing, and we show that DirVAE has no component collapsing; while Gaussian VAE exhibits the decoder weight collapsing and Stick-Breaking VAE shows the latent value collapsing. The experimental results show that 1) DirVAE models the latent representation result with the best log-likelihood compared to the baselines; and 2) DirVAE produces more interpretable latent values with no collapsing issues which the baseline models suffer from. Also, we show that the learned latent representation from the DirVAE achieves the best classification accuracy in the semi-supervised and the supervised classification tasks on MNIST, OMNIGLOT, and SVHN compared to the baseline VAEs. 
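A NumPy sketch of the reparameterized Dirichlet sampling the DirVAE entry above builds on: each component is drawn from an approximate Gamma(alpha_k, 1) via a small-shape inverse-CDF approximation, F^{-1}(u) ≈ (u * alpha * Gamma(alpha))^(1/alpha), and the draws are normalized onto the simplex. The approximation is only accurate for small concentration parameters, and the exact composition used in the paper may differ.

import numpy as np
from math import gamma as gamma_fn

def approx_gamma_sample(alpha, u):
    """Inverse-CDF approximation of a Gamma(alpha, 1) draw from uniform noise u,
    valid for small shape parameters alpha; differentiable in alpha and u."""
    return (u * alpha * gamma_fn(alpha)) ** (1.0 / alpha)

def approx_dirichlet_sample(alphas, rng=None):
    """alphas: (K,) concentration parameters. Returns an approximate Dirichlet draw."""
    rng = np.random.default_rng() if rng is None else rng
    u = rng.uniform(size=len(alphas))
    g = np.array([approx_gamma_sample(a, ui) for a, ui in zip(alphas, u)])
    return g / g.sum()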
Finally, we demonstrated that the DirVAE augmented topic models show better performances in most cases.", "keywords": "Variational autoencoder;Unsupervised learning;(Semi-)Supervised learning;Topic modeling", "primary_area": "", "supplementary_material": "", "author": "Weonyoung Joo;Wonsung Lee;Sungrae Park;and Il-Chul Moon", "authorids": "weonyoungjoo@gmail.com;aporia@kaist.ac.kr;sungraepark@kaist.ac.kr;icmoon@kaist.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\njoo2019dirichlet,\ntitle={Dirichlet Variational Autoencoder},\nauthor={Weonyoung Joo and Wonsung Lee and Sungrae Park and and Il-Chul Moon},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgsvoA9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkgsvoA9K7", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;4;3", "wc_review": "228;711;309", "wc_reply_reviewers": "72;329;9", "wc_reply_authors": "530;947;146", "reply_reviewers": "1;1;1", "reply_authors": "2;4;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 416.0, 211.20132575341472 ], "wc_reply_reviewers_avg": [ 136.66666666666666, 138.41082167069 ], "wc_reply_authors_avg": [ 541.0, 327.099373279742 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 116, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18327625695001891985&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "rkgv9oRqtQ", "title": "Compound Density Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite the huge success of deep neural networks (NNs), finding good mechanisms for quantifying their prediction uncertainty is still an open problem. It was recently shown, that using an ensemble of NNs trained with a proper scoring rule leads to results competitive to those of Bayesian NNs. This ensemble method can be understood as finite mixture model with uniform mixing weights. We build on this mixture model approach and increase its flexibility by replacing the fixed mixing weights by an adaptive, input-dependent distribution (specifying the probability of each component) represented by an NN, and by considering uncountably many mixture components. The resulting model can be seen as the continuous counterpart to mixture density networks and is therefore referred to as compound density network. 
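A PyTorch sketch of the contrast the compound-density-network entry above draws with fixed-weight ensembles: a gating network produces input-dependent mixing probabilities over K Gaussian predictive components, trained with the mixture negative log-likelihood (a proper scoring rule). The continuous, uncountable-component construction of the paper is not reproduced here; this is the finite mixture-density special case.

import math
import torch
import torch.nn as nn

class InputDependentMixture(nn.Module):
    """K Gaussian predictive components whose mixing weights depend on the input."""
    def __init__(self, in_dim, out_dim, n_components=5, hidden=64):
        super().__init__()
        self.gate = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU(),
                                  nn.Linear(hidden, n_components))
        self.heads = nn.ModuleList([nn.Linear(in_dim, 2 * out_dim)
                                    for _ in range(n_components)])

    def forward(self, x):
        log_pi = torch.log_softmax(self.gate(x), dim=-1)            # (B, K) input-dependent weights
        params = [head(x).chunk(2, dim=-1) for head in self.heads]  # per-component (mu, log_sigma)
        mu = torch.stack([p[0] for p in params], dim=1)             # (B, K, D)
        log_sigma = torch.stack([p[1] for p in params], dim=1)      # (B, K, D)
        return log_pi, mu, log_sigma

def mixture_nll(log_pi, mu, log_sigma, y):
    """Negative log-likelihood of targets y (B, D) under the input-dependent mixture."""
    y = y.unsqueeze(1)                                              # (B, 1, D)
    log_comp = -0.5 * (((y - mu) / log_sigma.exp()) ** 2
                       + 2.0 * log_sigma + math.log(2.0 * math.pi)).sum(dim=-1)
    return -torch.logsumexp(log_pi + log_comp, dim=1).mean()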
We empirically show that the proposed model results in better uncertainty estimates and is more robust to adversarial examples than previous approaches.", "keywords": "uncertainty in neural networks;ensemble;mixture model", "primary_area": "", "supplementary_material": "", "author": "Agustinus Kristiadi;Asja Fischer", "authorids": "kristiadi@protonmail.com;asja.fischer@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nkristiadi2019compound,\ntitle={Compound Density Networks},\nauthor={Agustinus Kristiadi and Asja Fischer},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgv9oRqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rkgv9oRqtQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "wc_review": "786;666;732", "wc_reply_reviewers": "264;64;162", "wc_reply_authors": "1065;582;1080", "reply_reviewers": "1;1;1", "reply_authors": "2;2;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 728.0, 49.07137658554119 ], "wc_reply_reviewers_avg": [ 163.33333333333334, 81.65510122188053 ], "wc_reply_authors_avg": [ 909.0, 231.3049934610146 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:11aehUwnEz8J:scholar.google.com/&scioq=Compound+Density+Networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rkgwuiA9F7", "title": "Cramer-Wold AutoEncoder", "track": "main", "status": "Reject", "tldr": "Inspired by prior work on Sliced-Wasserstein Autoencoders (SWAE) and kernel smoothing we construct a new generative model \u2013 Cramer-Wold AutoEncoder (CWAE).", "abstract": "Assessing the distance between the true and the sample distribution is a key component of many state-of-the-art generative models, such as Wasserstein Autoencoder (WAE). Inspired by prior work on Sliced-Wasserstein Autoencoders (SWAE) and\nkernel smoothing we construct a new generative model \u2013 Cramer-Wold AutoEncoder (CWAE). The CWAE cost function, based on the introduced Cramer-Wold distance between samples, has a simple closed form in the case of a normal prior.
As a consequence, while simplifying the optimization procedure (no sampling is needed to evaluate the distance function in the training loop), CWAE performance matches quantitatively and qualitatively that of WAE-MMD (WAE using maximum mean discrepancy based distance function) and often improves upon SWAE.", "keywords": "autoencoder;generative models;deep neural networks", "primary_area": "", "supplementary_material": "", "author": "Jacek Tabor;Szymon Knop;Przemys\u0142aw Spurek;Igor Podolak;Marcin Mazur;Stanis\u0142aw Jastrz\u0119bski", "authorids": "jacek.tabor@uj.edu.pl;szymon.knop@doctoral.uj.edu.pl;przemyslaw.spurek@uj.edu.pl;igor.podolak@uj.edu.pl;marcin.mazur@uj.edu.pl;staszek.jastrzebski@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\ntabor2019cramerwold,\ntitle={Cramer-Wold AutoEncoder},\nauthor={Jacek Tabor and Szymon Knop and Przemys\u0142aw Spurek and Igor Podolak and Marcin Mazur and Stanis\u0142aw Jastrz\u0119bski},\nyear={2019},\nurl={https://openreview.net/forum?id=rkgwuiA9F7},\n}", "github": "[![github](/images/github_icon.svg) gmum/cwae](https://github.com/gmum/cwae) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rkgwuiA9F7)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rkgwuiA9F7", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "wc_review": "323;502;595", "wc_reply_reviewers": "0;165;0", "wc_reply_authors": "346;687;667", "reply_reviewers": "0;2;0", "reply_authors": "2;2;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 473.3333333333333, 112.87849908443837 ], "wc_reply_reviewers_avg": [ 55.0, 77.78174593052023 ], "wc_reply_authors_avg": [ 566.6666666666666, 156.24837776935655 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 43, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8265045554582236991&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10 }, { "id": "rkl3-hA5Y7", "title": "Towards Decomposed Linguistic Representation with Holographic Reduced Representation", "track": "main", "status": "Reject", "tldr": "Holographic Reduced Representation enables language model to discover linguistic roles.", "abstract": "The vast majority of neural models in Natural Language Processing adopt a form of structureless distributed representations. While these models are powerful at making predictions, the representational form is rather crude and does not provide insights into linguistic structures. In this paper we introduce novel language models with representations informed by the framework of Holographic Reduced Representation (HRR). This allows us to inject structures directly into our word-level and chunk-level representations.
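A NumPy sketch of the two core Holographic Reduced Representation operations behind the entry above: binding a role to a filler with circular convolution (computed via FFT) and approximately recovering the filler with circular correlation. How these operations are wired into the word- and chunk-level language models is not shown; the vectors and dimensionality below are illustrative.

import numpy as np

def bind(role, filler):
    """Circular convolution: composes a role vector with a filler vector."""
    return np.real(np.fft.ifft(np.fft.fft(role) * np.fft.fft(filler)))

def unbind(trace, role):
    """Circular correlation: approximately recovers the filler bound to `role`."""
    return np.real(np.fft.ifft(np.conj(np.fft.fft(role)) * np.fft.fft(trace)))

# Tiny demo with random high-dimensional vectors (components ~ N(0, 1/d)).
d = 1024
rng = np.random.default_rng(0)
role, filler, other = rng.normal(0, 1 / np.sqrt(d), size=(3, d))
trace = bind(role, filler) + bind(other, rng.normal(0, 1 / np.sqrt(d), d))
recovered = unbind(trace, role)
cos = recovered @ filler / (np.linalg.norm(recovered) * np.linalg.norm(filler))
# `cos` is large for the filler originally bound to `role` and near 0 for unrelated vectors.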
Our analyses show that by using HRR as a structured compositional representation, our models are able to discover crude linguistic roles, which roughly resembles a classic division between syntax and semantics.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Jiaming Luo;Yuan Cao;Yonghui Wu", "authorids": "j_luo@csail.mit.edu;yuancao@google.com;yonghui@google.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nluo2019towards,\ntitle={Towards Decomposed Linguistic Representation with Holographic Reduced Representation},\nauthor={Jiaming Luo and Yuan Cao and Yonghui Wu},\nyear={2019},\nurl={https://openreview.net/forum?id=rkl3-hA5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkl3-hA5Y7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;3", "wc_review": "175;505;1194", "wc_reply_reviewers": "0;0;398", "wc_reply_authors": "415;957;2301", "reply_reviewers": "0;0;1", "reply_authors": "1;2;4", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 624.6666666666666, 424.5235237560131 ], "wc_reply_reviewers_avg": [ 132.66666666666666, 187.61899927483063 ], "wc_reply_authors_avg": [ 1224.3333333333333, 792.8216837487622 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16889725118865118861&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rkl42iA5t7", "title": "NETWORK COMPRESSION USING CORRELATION ANALYSIS OF LAYER RESPONSES", "track": "main", "status": "Reject", "tldr": "We propose an easy to implement, yet effective method for neural network compression. PFA exploits the intrinsic correlation between filter responses within network layers to recommend a smaller network footprints.", "abstract": "Principal Filter Analysis (PFA) is an easy to implement, yet effective method for neural network compression. PFA exploits the intrinsic correlation between filter responses within network layers to recommend a smaller network footprint. We propose two compression algorithms: the first allows a user to specify the proportion of the original spectral energy that should be preserved in each layer after compression, while the second is a heuristic that leads to a parameter-free approach that automatically selects the compression used at each layer. Both algorithms are evaluated against several architectures and datasets, and we show considerable compression rates without compromising accuracy, e.g., for VGG-16 on CIFAR-10, CIFAR-100 and ImageNet, PFA achieves a compression rate of 8x, 3x, and 1.4x with an accuracy gain of 0.4%, 1.4% points, and 2.4% respectively. In our tests we also demonstrate that networks compressed with PFA achieve an accuracy that is very close to the empirical upper bound for a given compression ratio. 
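A NumPy sketch of the spectral-energy recommendation in the PFA entry above: take the eigen-decomposition of the covariance of a layer's filter responses and keep the smallest number of principal components whose cumulative energy reaches a user-chosen ratio; that count is the suggested filter budget for the compressed layer. Filter selection and the subsequent fine-tuning are omitted.

import numpy as np

def pfa_keep_count(responses, energy_to_keep=0.95):
    """responses: (N, C) matrix of per-filter activations collected over N inputs
    (e.g., spatially averaged feature maps). Returns how many filters are needed
    to preserve `energy_to_keep` of the total spectral energy."""
    centered = responses - responses.mean(axis=0, keepdims=True)
    cov = np.cov(centered, rowvar=False)
    eigvals = np.sort(np.linalg.eigvalsh(cov))[::-1]      # descending eigenvalues
    ratio = np.cumsum(eigvals) / eigvals.sum()
    return int(np.searchsorted(ratio, energy_to_keep) + 1)

# Example: a 256-filter layer whose responses are highly correlated would yield
# a recommended width well below 256 for energy_to_keep=0.95.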
Finally, we show how PFA is an effective tool for simultaneous compression and domain adaptation.", "keywords": "Artificial Intelligence;Deep learning;Machine learning;Compression", "primary_area": "", "supplementary_material": "", "author": "Xavier Suau;Luca Zappella;Nicholas Apostoloff", "authorids": "xsuaucuadros@apple.com;lzappella@apple.com;napostoloff@apple.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsuau2019network,\ntitle={{NETWORK} {COMPRESSION} {USING} {CORRELATION} {ANALYSIS} {OF} {LAYER} {RESPONSES}},\nauthor={Xavier Suau and Luca Zappella and Nicholas Apostoloff},\nyear={2019},\nurl={https://openreview.net/forum?id=rkl42iA5t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkl42iA5t7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;3", "wc_review": "700;227;233", "wc_reply_reviewers": "109;0;0", "wc_reply_authors": "1246;741;110", "reply_reviewers": "1;0;0", "reply_authors": "3;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 386.6666666666667, 221.5736647006789 ], "wc_reply_reviewers_avg": [ 36.333333333333336, 51.383092766222454 ], "wc_reply_authors_avg": [ 699.0, 464.71998737591076 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7317331873042800252&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rkl4M3R5K7", "title": "Optimal Attacks against Multiple Classifiers", "track": "main", "status": "Reject", "tldr": "Paper analyzes the problem of designing adversarial attacks against multiple classifiers, introducing algorithms that are optimal for linear classifiers and which provide state-of-the-art results for deep learning.", "abstract": "We study the problem of designing provably optimal adversarial noise algorithms that induce misclassification in settings where a learner aggregates decisions from multiple classifiers. Given the demonstrated vulnerability of state-of-the-art models to adversarial examples, recent efforts within the field of robust machine learning have focused on the use of ensemble classifiers as a way of boosting the robustness of individual models. In this paper, we design provably optimal attacks against a set of classifiers. We demonstrate how this problem can be framed as finding strategies at equilibrium in a two player, zero sum game between a learner and an adversary and consequently illustrate the need for randomization in adversarial attacks. The main technical challenge we consider is the design of best response oracles that can be implemented in a Multiplicative Weight Updates framework to find equilibrium strategies in the zero-sum game. We develop a series of scalable noise generation algorithms for deep neural networks, and show that it outperforms state-of-the-art attacks on various image classification tasks. Although there are generally no guarantees for deep learning, we show this is a well-principled approach in that it is provably optimal for linear classifiers. 
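A NumPy sketch of the Multiplicative Weight Updates loop the "Optimal Attacks against Multiple Classifiers" entry above frames the problem with: one player maintains a distribution over its pure strategies (here, which classifier to deploy), the opponent replies through a best-response oracle, and the weights are updated multiplicatively from the incurred losses. The best-response oracle itself, the paper's main technical contribution, is abstracted as a callable, so this only illustrates the outer game-solving loop.

import numpy as np

def mwu_equilibrium(loss_against_best_response, n_classifiers, rounds=100, eta=0.1):
    """loss_against_best_response(weights) -> (L,) per-classifier loss when the
    adversary best-responds to the mixture `weights`.
    Returns the time-averaged mixture, an approximate equilibrium strategy."""
    w = np.full(n_classifiers, 1.0 / n_classifiers)
    avg = np.zeros(n_classifiers)
    for _ in range(rounds):
        losses = loss_against_best_response(w)
        w = w * np.exp(-eta * losses)        # multiplicative weights update
        w /= w.sum()
        avg += w
    return avg / rounds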
The main insight is a geometric characterization of the decision space that reduces the problem of designing best response oracles to minimizing a quadratic function over a set of convex polytopes.", "keywords": "online learning;nonconvex optimization;robust optimization", "primary_area": "", "supplementary_material": "", "author": "Juan C. Perdomo;Yaron Singer", "authorids": "jcperdomo@berkeley.edu;yaron@seas.harvard.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nperdomo2019optimal,\ntitle={Optimal Attacks against Multiple Classifiers},\nauthor={Juan C. Perdomo and Yaron Singer},\nyear={2019},\nurl={https://openreview.net/forum?id=rkl4M3R5K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkl4M3R5K7", "pdf_size": 0, "rating": "4;5;6;6", "confidence": "4;4;3;4", "wc_review": "234;780;158;359", "wc_reply_reviewers": "0;400;0;0", "wc_reply_authors": "471;705;95;718", "reply_reviewers": "0;1;0;0", "reply_authors": "1;1;1;1", "rating_avg": [ 5.25, 0.82915619758885 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "wc_review_avg": [ 382.75, 240.31788843113614 ], "wc_reply_reviewers_avg": [ 100.0, 173.20508075688772 ], "wc_reply_authors_avg": [ 497.25, 252.18284537216246 ], "reply_reviewers_avg": [ 0.25, 0.4330127018922193 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dwHSXPmjhD0J:scholar.google.com/&scioq=Optimal+Attacks+against+Multiple+Classifiers&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rkl5CjC9Fm", "title": "Dual Importance Weight GAN", "track": "main", "status": "Reject", "tldr": "", "abstract": "Generative Adversarial Networks (GAN) are trained to generate a sample image of interest. To this end, generative network of GAN learns implicit distribution of true dataset from the classification samples with candidate generated samples. However, in real implementation of GAN, training the generative network with limited number of candidate samples guarantees to properly represent neither true distribution nor the distribution of generator outputs. In this paper, we propose dual importance weights for the candidate samples represented in the latent space of auto-encoder. The auto-encoder is pre-trained with real target dataset. Therefore, the latent space representation allows us to compare real distribution and the distribution of generated samples explicitly. Dual importance weights iteratively maximize the representation of generated samples for both distributions: current generator outputs and real dataset. Proposed generative model not only resolves mode collapse problem of GAN but also improves the convergence on target distribution. Experimental evaluation shows that the proposed network learns complete modes of target distribution more stable and faster than state of the art methods. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Gahye Lee;Seungkyu Lee", "authorids": "waldstein94@gmail.com;seungkyu@khu.ac.kr", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkl5CjC9Fm", "pdf_size": 0, "rating": "3;4;5", "confidence": "5;4;4", "wc_review": "664;811;166", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 547.0, 276.0108693511906 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Probabilistic Recursive Reasoning for Multi-Agent Reinforcement Learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/653", "id": "rkl6As0cF7", "author_site": "Ying Wen, Yaodong Yang, Rui Luo, Jun Wang, Wei Pan", "tldr": "We proposed a novel probabilisitic recursive reasoning (PR2) framework for multi-agent deep reinforcement learning tasks.", "abstract": "Humans are capable of attributing latent mental contents such as beliefs, or intentions to others. The social skill is critical in everyday life to reason about the potential consequences of their behaviors so as to plan ahead. It is known that humans use this reasoning ability recursively, i.e. considering what others believe about their own beliefs. In this paper, we start from level-$1$ recursion and introduce a probabilistic recursive reasoning (PR2) framework for multi-agent reinforcement learning. Our hypothesis is that it is beneficial for each agent to account for how the opponents would react to its future behaviors. Under the PR2 framework, we adopt variational Bayes methods to approximate the opponents' conditional policy, to which each agent finds the best response and then improve their own policy. We develop decentralized-training-decentralized-execution algorithms, PR2-Q and PR2-Actor-Critic, that are proved to converge in the self-play scenario when there is one Nash equilibrium. Our methods are tested on both the matrix game and the differential game, which have a non-trivial equilibrium where common gradient-based methods fail to converge. Our experiments show that it is critical to reason about how the opponents believe about what the agent believes. We expect our work to contribute a new idea of modeling the opponents to the multi-agent reinforcement learning community. 
\n", "keywords": "Multi-agent Reinforcement Learning;Recursive Reasoning", "primary_area": "", "supplementary_material": "", "author": "Ying Wen;Yaodong Yang;Rui Luo;Jun Wang;Wei Pan", "authorids": "ying.wen@cs.ucl.ac.uk;yaodong.yang@cs.ucl.ac.uk;rui.luo@cs.ucl.ac.uk;jun.wang@cs.ucl.ac.uk;wei.pan@tudelft.nl", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nwen2018probabilistic,\ntitle={Probabilistic Recursive Reasoning for Multi-Agent Reinforcement Learning},\nauthor={Ying Wen and Yaodong Yang and Rui Luo and Jun Wang and Wei Pan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkl6As0cF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4", "pdf_size": 0, "rating": "7;7;8", "confidence": "4;4;3", "wc_review": "658;558;598", "wc_reply_reviewers": "0;41;82", "wc_reply_authors": "1149;813;610", "reply_reviewers": "0;1;1", "reply_authors": "2;2;2", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 604.6666666666666, 41.09609335312651 ], "wc_reply_reviewers_avg": [ 41.0, 33.47635981803677 ], "wc_reply_authors_avg": [ 857.3333333333334, 222.2676064767174 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 196, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12761322458577853549&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rkl6As0cF7", "pdf": "https://openreview.net/pdf?id=rkl6As0cF7", "email": ";;;;", "author_num": 5 }, { "id": "rkl85oRqYX", "title": "Exploiting Invariant Structures for Compression in Neural Networks", "track": "main", "status": "Withdraw", "tldr": "Compression of neural networks which improves the state-of-the-art low rank approximation techniques and is complementary to most of other compression techniques. ", "abstract": "Modern neural networks often require deep compositions of high-dimensional nonlinear functions (wide architecture) to achieve high test accuracy, and thus can have overwhelming number of parameters. Repeated high cost in prediction at test-time makes neural networks ill-suited for devices with constrained memory or computational power. We introduce an efficient mechanism, reshaped tensor decomposition, to compress neural networks by exploiting three types of invariant structures: periodicity, modulation and low rank. Our reshaped tensor decomposition method exploits such invariance structures using a technique called tensorization (reshaping the layers into higher-order tensors) combined with higher order tensor decompositions on top of the tensorized layers. Our compression method improves low rank approximation methods and can be incorporated to (is complementary to) most of the existing compression methods for neural networks to achieve better compression. 
Experiments on LeNet-5 (MNIST), ResNet-32 (CI- FAR10) and ResNet-50 (ImageNet) demonstrate that our reshaped tensor decomposition outperforms (5% test accuracy improvement universally on CIFAR10) the state-of-the-art low-rank approximation techniques under same compression rate, besides achieving orders of magnitude faster convergence rates.", "keywords": "Neural Network Compression;Low Rank Approximation;Higher Order Tensor Decomposition", "primary_area": "", "supplementary_material": "", "author": "Jiahao Su;Jingling Li;Bobby Bhattacharjee;Furong Huang", "authorids": "jiahaosu@terpmail.umd.edu;jingling@cs.umd.edu;bobby@cs.umd.edu;furongh@cs.umd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkl85oRqYX", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "wc_review": "266;484;657", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;769;0", "reply_reviewers": "0;0;0", "reply_authors": "0;1;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 469.0, 159.9770816919307 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 256.3333333333333, 362.51007648830335 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:x3tFXpry7fEJ:scholar.google.com/&scioq=Exploiting+Invariant+Structures+for+Compression+in+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rklEUjR5tm", "title": "SHE2: Stochastic Hamiltonian Exploration and Exploitation for Derivative-Free Optimization", "track": "main", "status": "Reject", "tldr": "a new derivative-free optimization algorithms derived from Nesterov's accelerated gradient methods and Hamiltonian dynamics", "abstract": "Derivative-free optimization (DFO) using trust region methods is frequently used for machine learning applications, such as (hyper-)parameter optimization without the derivatives of objective functions known. Inspired by the recent work in continuous-time minimizers, our work models the common trust region methods with the exploration-exploitation using a dynamical system coupling a pair of dynamical processes. While the first exploration process searches the minimum of the blackbox function through minimizing a time-evolving surrogation function, another exploitation process updates the surrogation function time-to-time using the points traversed by the exploration process. The efficiency of derivative-free optimization thus depends on ways the two processes couple. In this paper, we propose a novel dynamical system, namely \\ThePrev---\\underline{S}tochastic \\underline{H}amiltonian \\underline{E}xploration and \\underline{E}xploitation, that surrogates the subregions of blackbox function using a time-evolving quadratic function, then explores and tracks the minimum of the quadratic functions using a fast-converging Hamiltonian system. The \\ThePrev\\ algorithm is later provided as a discrete-time numerical approximation to the system. To further accelerate optimization, we present \\TheName\\ that parallelizes multiple \\ThePrev\\ threads for concurrent exploration and exploitation. 
Experiment results based on a wide range of machine learning applications show that \\TheName\\ outperform a boarder range of derivative-free optimization algorithms with faster convergence speed under the same settings.", "keywords": "derivative-free optimization", "primary_area": "", "supplementary_material": "", "author": "Haoyi Xiong;Wenqing Hu;Zhanxing Zhu;Xinjian Li;Yunchao Zhang;Jun Huan", "authorids": "xhyccc@gmail.com;huwenqing.pku@gmail.com;zhanxing.zhu@pku.edu.cn;lixingjian@baidu.com;yzgv7@mst.edu;huanjun@baidu.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nxiong2019she,\ntitle={{SHE}2: Stochastic Hamiltonian Exploration and Exploitation for Derivative-Free Optimization},\nauthor={Haoyi Xiong and Wenqing Hu and Zhanxing Zhu and Xinjian Li and Yunchao Zhang and Jun Huan},\nyear={2019},\nurl={https://openreview.net/forum?id=rklEUjR5tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rklEUjR5tm", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;3;4", "wc_review": "1159;706;307", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 724.0, 348.0603395964556 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:48jc-gg9hQkJ:scholar.google.com/&scioq=SHE2:+Stochastic+Hamiltonian+Exploration+and+Exploitation+for+Derivative-Free+Optimization&hl=en&as_sdt=0,14", "gs_version_total": 0 }, { "id": "rklNwjCcYm", "title": "Understanding and Improving Sequence-Labeling NER with Self-Attentive LSTMs", "track": "main", "status": "Withdraw", "tldr": "We provide insightful understanding of sequence-labeling NER and propose to use two types of cross structures, both of which bring theoretical and empirical improvements.", "abstract": "This paper improves upon the line of research that formulates named entity recognition (NER) as a sequence-labeling problem. We use so-called black-box long short-term memory (LSTM) encoders to achieve state-of-the-art results while providing insightful understanding of what the auto-regressive model learns with a parallel self-attention mechanism. Specifically, we decouple the sequence-labeling problem of NER into entity chunking, e.g., Barack_B Obama_E was_O elected_O, and entity typing, e.g., Barack_PERSON Obama_PERSON was_NONE elected_NONE, and analyze how the model learns to, or has difficulties in, capturing text patterns for each of the subtasks. 
The insights we gain then lead us to explore a more sophisticated deep cross-Bi-LSTM encoder, which proves better at capturing global interactions given both empirical results and a theoretical justification.", "keywords": "interpretability;sequence labeling;named entity recognition;LSTM;attention", "primary_area": "", "supplementary_material": "", "author": "Peng-Hsuan Li;Wei-Yun Ma", "authorids": "jacobvsdanniel@iis.sinica.edu.tw;ma@iis.sinica.edu.tw", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rklNwjCcYm", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;4", "wc_review": "559;271;174", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "434;487;15", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 3.3333333333333335, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 334.6666666666667, 163.4958375277147 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 312.0, 211.12239735913067 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15355282512118722804&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rklQas09tm", "title": "Learning Corresponded Rationales for Text Matching", "track": "main", "status": "Reject", "tldr": "We propose a novel self-explaining architecture to predict matches between two sequences of texts. Specifically, we introduce the notion of corresponded rationales and learn to extract them by the distal supervision from the downstream task.", "abstract": "The ability to predict matches between two sources of text has a number of applications including natural language inference (NLI) and question answering (QA). While flexible neural models have become effective tools in solving these tasks, they are rarely transparent in terms of the mechanism that mediates the prediction. In this paper, we propose a self-explaining architecture where the model is forced to highlight, in a dependent manner, how spans of one side of the input match corresponding segments of the other side in order to arrive at the overall decision. The text spans are regularized to be coherent and concise, and their correspondence is captured explicitly. The text spans -- rationales -- are learned entirely as latent mechanisms, guided only by the distal supervision from the end-to-end task. We evaluate our model on both NLI and QA using three publicly available datasets. Experimental results demonstrate quantitatively and qualitatively that our method delivers interpretable justification of the prediction without sacrificing state-of-the-art performance. Our code and data split will be publicly available. 
", "keywords": "interpretability;rationalization;text matching;dependent selection", "primary_area": "", "supplementary_material": "", "author": "Mo Yu;Shiyu Chang;Tommi S Jaakkola", "authorids": "shiyu.chang@ibm.com;yum@us.ibm.com;tommi@csail.mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyu2019learning,\ntitle={Learning Corresponded Rationales for Text Matching},\nauthor={Mo Yu and Shiyu Chang and Tommi S Jaakkola},\nyear={2019},\nurl={https://openreview.net/forum?id=rklQas09tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rklQas09tm", "pdf_size": 0, "rating": "3;4;6", "confidence": "5;4;4", "wc_review": "386;601;323", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "267;587;228", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 436.6666666666667, 119.01353797876199 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 360.6666666666667, 160.83186527827405 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15954980995283494850&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rklXaoAcFX", "title": "Geomstats: a Python Package for Riemannian Geometry in Machine Learning", "track": "main", "status": "Reject", "tldr": "We introduce geomstats, an efficient Python package for Riemannian modelization and optimization over manifolds compatible with both numpy and tensorflow .", "abstract": "We introduce geomstats, a Python package for Riemannian modelization and optimization over manifolds such as hyperspheres, hyperbolic spaces, SPD matrices or Lie groups of transformations. Our contribution is threefold. First, geomstats allows the flexible modeling of many a machine learning problem through an efficient and extensively unit-tested implementations of these manifolds, as well as the set of useful Riemannian metrics, exponential and logarithm maps that we provide. Moreover, the wide choice of loss functions and our implementation of the corresponding gradients allow fast and easy optimization over manifolds. Finally, geomstats is the only package to provide a unified framework for Riemannian geometry, as the operations implemented in geomstats are available with different computing backends (numpy,tensorflow and keras), as well as with a GPU-enabled mode\u2013-thus considerably facilitating the application of Riemannian geometry in machine learning. 
In this paper, we present geomstats through a review of the utility and advantages of manifolds in machine learning, using the concrete examples that they span to show the efficiency and practicality of their implementation using our package", "keywords": "Riemannian geometry;Python package;machine learning;deep learning", "primary_area": "", "supplementary_material": "", "author": "Nina Miolane;Johan Mathe;Claire Donnat;Mikael Jorda;Xavier Pennec", "authorids": "nmiolane@stanford.edu;johan@froglabs.ai;cdonnat@stanford.edu;mjorda@stanford.edu;xavier.pennec@inria.fr", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nmiolane2019geomstats,\ntitle={Geomstats: a Python Package for Riemannian Geometry in Machine Learning},\nauthor={Nina Miolane and Johan Mathe and Claire Donnat and Mikael Jorda and Xavier Pennec},\nyear={2019},\nurl={https://openreview.net/forum?id=rklXaoAcFX},\n}", "github": "[![github](/images/github_icon.svg) geomstats/geomstats](https://github.com/geomstats/geomstats)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4;AnonReviewer3", "site": "https://openreview.net/forum?id=rklXaoAcFX", "pdf_size": 0, "rating": "3;4;4;8", "confidence": "5;5;4;2", "wc_review": "230;352;373;77", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "0;0;0;0", "reply_reviewers": "0;0;0;0", "reply_authors": "0;0;0;0", "rating_avg": [ 4.75, 1.920286436967152 ], "confidence_avg": [ 4.0, 1.224744871391589 ], "wc_review_avg": [ 258.0, 117.90462247087686 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9566892062149209, "gs_citation": 119, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2134246845110162381&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 30 }, { "title": "Learning Neural PDE Solvers with Convergence Guarantees", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/948", "id": "rklaWn0qK7", "author_site": "Jun-Ting Hsieh, Shengjia Zhao, Stephan Eismann, Lucia Mirabella, Stefano Ermon", "tldr": "We learn a fast neural solver for PDEs that has convergence guarantees.", "abstract": "Partial differential equations (PDEs) are widely used across the physical and computational sciences. Decades of research and engineering went into designing fast iterative solution methods. Existing solvers are general purpose, but may be sub-optimal for specific classes of problems. In contrast to existing hand-crafted solutions, we propose an approach to learn a fast iterative solver tailored to a specific domain. We achieve this goal by learning to modify the updates of an existing solver using a deep neural network. Crucially, our approach is proven to preserve strong correctness and convergence guarantees. 
After training on a single geometry, our model generalizes to a wide variety of geometries and boundary conditions, and achieves 2-3 times speedup compared to state-of-the-art solvers.", "keywords": "Partial differential equation;deep learning", "primary_area": "", "supplementary_material": "", "author": "Jun-Ting Hsieh;Shengjia Zhao;Stephan Eismann;Lucia Mirabella;Stefano Ermon", "authorids": "junting@stanford.edu;sjzhao@stanford.edu;seismann@stanford.edu;lucia.mirabella@siemens.com;ermon@cs.stanford.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nhsieh2018learning,\ntitle={Learning Neural {PDE} Solvers with Convergence Guarantees},\nauthor={Jun-Ting Hsieh and Shengjia Zhao and Stephan Eismann and Lucia Mirabella and Stefano Ermon},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rklaWn0qK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;4", "wc_review": "359;331;398", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "717;441;568", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 362.6666666666667, 27.47524137999317 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 575.3333333333334, 112.79578400316605 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 157, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12683471981515520201&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rklaWn0qK7", "pdf": "https://openreview.net/pdf?id=rklaWn0qK7", "email": ";;;;", "author_num": 5 }, { "id": "rkle3i09K7", "title": "Robust Determinantal Generative Classifier for Noisy Labels and Adversarial Attacks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Large-scale datasets may contain significant proportions of noisy (incorrect) class labels, and it is well-known that modern deep neural networks poorly generalize from such noisy training datasets. In this paper, we propose a novel inference method, Deep Determinantal Generative Classifier (DDGC), which can obtain a more robust decision boundary under any softmax neural classifier pre-trained on noisy datasets. Our main idea is inducing a generative classifier on top of hidden feature spaces of the discriminative deep model. By estimating the parameters of generative classifier using the minimum covariance determinant estimator, we significantly improve the classification accuracy, with neither re-training of the deep model nor changing its architectures. In particular, we show that DDGC not only generalizes well from noisy labels, but also is robust against adversarial perturbations due to its large margin property. Finally, we propose the ensemble version of DDGC to improve its performance, by investigating the layer-wise characteristics of generative classifier. Our extensive experimental results demonstrate the superiority of DDGC given different learning models optimized by various training techniques to handle noisy labels or adversarial samples. 
For instance, on CIFAR-10 dataset containing 45% noisy training labels, we improve the test accuracy of a deep model optimized by the state-of-the-art noise-handling training method from 33.34% to 43.02%.", "keywords": "Noisy Labels;Adversarial Attacks;Generative Models", "primary_area": "", "supplementary_material": "", "author": "Kimin Lee;Sukmin Yun;Kibok Lee;Honglak Lee;Bo Li;Jinwoo Shin", "authorids": "kiminlee@kaist.ac.kr;sm3199@kaist.ac.kr;kibok@umich.edu;honglak@eecs.umich.edu;lxbosky@gmail.com;jinwoos@kaist.ac.kr", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nlee2019robust,\ntitle={Robust Determinantal Generative Classifier for Noisy Labels and Adversarial Attacks},\nauthor={Kimin Lee and Sukmin Yun and Kibok Lee and Honglak Lee and Bo Li and Jinwoo Shin},\nyear={2019},\nurl={https://openreview.net/forum?id=rkle3i09K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkle3i09K7", "pdf_size": 0, "rating": "3;4;7", "confidence": "4;4;5", "wc_review": "138;774;530", "wc_reply_reviewers": "0;0;168", "wc_reply_authors": "212;605;573", "reply_reviewers": "0;0;1", "reply_authors": "2;2;2", "rating_avg": [ 4.666666666666667, 1.699673171197595 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 480.6666666666667, 261.97879473134634 ], "wc_reply_reviewers_avg": [ 56.0, 79.19595949289332 ], "wc_reply_authors_avg": [ 463.3333333333333, 178.1990148370324 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.9707253433941508, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8671087106105474623&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "rklhb2R9Y7", "title": "Reinforced Imitation Learning from Observations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Imitation learning is an effective alternative approach to learn a policy when the reward function is sparse. In this paper, we consider a challenging setting where an agent has access to a sparse reward function and state-only expert observations. We propose a method which gradually balances between the imitation learning cost and the reinforcement learning objective. Built upon an existing imitation learning method, our approach works with state-only observations. We show, through navigation scenarios, that (i) an agent is able to efficiently leverage sparse rewards to outperform standard state-only imitation learning, (ii) it can learn a policy even when learner's actions are different from the expert, and (iii) the performance of the agent is not bounded by that of the expert due to the optimized usage of sparse rewards.", "keywords": "imitation learning;state-only observations;self-exploration", "primary_area": "", "supplementary_material": "", "author": "Konrad Zolna;Negar Rostamzadeh;Yoshua Bengio;Sungjin Ahn;Pedro O. 
Pinheiro", "authorids": "konrad.zolna@gmail.com;negar.rostamzadeh@gmail.com;yoshua.umontreal@gmail.com;sjn.ahn@gmail.com;pedro@opinheiro.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\n\u017co\u0142na2019reinforced,\ntitle={Reinforced Imitation Learning from Observations},\nauthor={Konrad \u017bo\u0142na and Negar Rostamzadeh and Yoshua Bengio and Sungjin Ahn and Pedro O. Pinheiro},\nyear={2019},\nurl={https://openreview.net/forum?id=rklhb2R9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rklhb2R9Y7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;2", "wc_review": "276;221;394", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "192;377;330", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 297.0, 72.17109301283075 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 299.6666666666667, 78.5125609200351 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.6546536707079772, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18147401407643802976&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "A new dog learns old tricks: RL finds classic optimization algorithms", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1034", "id": "rkluJ2R9KQ", "author_site": "Weiwei Kong, Christopher Liaw, Aranyak Mehta, D. Sivakumar", "tldr": "By combining ideas from traditional algorithms design and reinforcement learning, we introduce a novel framework for learning algorithms that solve online combinatorial optimization problems.", "abstract": "This paper introduces a novel framework for learning algorithms to solve online combinatorial optimization problems. Towards this goal, we introduce a number of key ideas from traditional algorithms and complexity theory. First, we draw a new connection between primal-dual methods and reinforcement learning. Next, we introduce the concept of adversarial distributions (universal and high-entropy training sets), which are distributions that encourage the learner to find algorithms that work well in the worst case. We test our new ideas on a number of optimization problem such as the AdWords problem, the online knapsack problem, and the secretary problem. Our results indicate that the models have learned behaviours that are consistent with the traditional optimal algorithms for these problems.", "keywords": "reinforcement learning;algorithms;adwords;knapsack;secretary", "primary_area": "", "supplementary_material": "", "author": "Weiwei Kong;Christopher Liaw;Aranyak Mehta;D. Sivakumar", "authorids": "wkong37@gatech.edu;cvliaw@cs.ubc.ca;aranyak@google.com;siva@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nkong2018a,\ntitle={A new dog learns old tricks: {RL} finds classic optimization algorithms},\nauthor={Weiwei Kong and Christopher Liaw and Aranyak Mehta and D. 
Sivakumar},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkluJ2R9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;5", "wc_review": "745;472;719", "wc_reply_reviewers": "0;126;123", "wc_reply_authors": "1403;1264;1451", "reply_reviewers": "0;1;2", "reply_authors": "2;2;4", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 645.3333333333334, 123.02393624367937 ], "wc_reply_reviewers_avg": [ 83.0, 58.70264048575669 ], "wc_reply_authors_avg": [ 1372.6666666666667, 79.29831159755057 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 0.9428090415820634 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 1.0, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12269274176119395887&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rkluJ2R9KQ", "pdf": "https://openreview.net/pdf?id=rkluJ2R9KQ", "email": ";;;", "author_num": 4 }, { "id": "rklvnjRqY7", "title": "A PRIVACY-PRESERVING IMAGE CLASSIFICATION FRAMEWORK WITH A LEARNABLE OBFUSCATOR", "track": "main", "status": "Withdraw", "tldr": "We proposed a novel deep learning image classification framework that can both accurately classify images and protect users' privacy.", "abstract": "Real world images often contain large amounts of private / sensitive information that should be carefully protected without reducing their utilities. In this paper, we propose a privacy-preserving deep learning framework with a learnable obfuscator for the image classification task. Our framework consists of three models: learnable obfuscator, classifier and reconstructor. The learnable obfuscator is used to remove the sensitive information in the images and extract the feature maps from them. The reconstructor plays the role as an attacker, which tries to recover the image from the feature maps extracted by the obfuscator. In order to best protect users\u2019 privacy in images, we design an adversarial training methodology for our framework to optimize the obfuscator. 
Through extensive evaluations on real world datasets, both the numerical metrics and the visualization results demonstrate that our framework is qualified to protect users\u2019 privacy and achieve a relatively high accuracy on the image classification task.", "keywords": "privacy-preserving;image classification;adversarial training;learnable obfuscator", "primary_area": "", "supplementary_material": "", "author": "Xiangyi Meng;Zixuan Huang;Yuefeng Du;Antoni Chan;Cong Wang", "authorids": "xy.meng@my.cityu.edu.hk;zixuhuang3-c@my.cityu.edu.hk;yf.du@my.cityu.edu.hk;abchan@cityu.edu.hk;congwang@cityu.edu.hk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=rklvnjRqY7", "pdf_size": 0, "rating": "5;5;5", "confidence": "5;4;4", "wc_review": "1077;296;490", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 621.0, 332.02509945283754 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vf29jYhlw6wJ:scholar.google.com/&scioq=A+PRIVACY-PRESERVING+IMAGE+CLASSIFICATION+FRAMEWORK+WITH+A+LEARNABLE+OBFUSCATOR&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rklwwo05Ym", "title": "Pushing the bounds of dropout", "track": "main", "status": "Reject", "tldr": "", "abstract": "We show that dropout training is best understood as performing MAP estimation concurrently for a family of conditional models whose objectives are themselves lower bounded by the original dropout objective. This discovery allows us to pick any model from this family after training, which leads to a substantial improvement on regularisation-heavy language modelling. The family includes models that compute a power mean over the sampled dropout masks, and their less stochastic subvariants with tighter and higher lower bounds than the fully stochastic dropout objective. We argue that since the deterministic subvariant's bound is equal to its objective, and the highest amongst these models, the predominant view of it as a good approximation to MC averaging is misleading. 
Rather, deterministic dropout is the best available approximation to the true objective.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "G\u00e1bor Melis;Charles Blundell;Tom\u00e1\u0161 Ko\u010disk\u00fd;Karl Moritz Hermann;Chris Dyer;Phil Blunsom", "authorids": "melisgl@google.com;cblundell@google.com;tkocisky@google.com;kmh@google.com;cdyer@google.com;pblunsom@google.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nmelis2019pushing,\ntitle={Pushing the bounds of dropout},\nauthor={G\u00e1bor Melis and Charles Blundell and Tom\u00e1\u0161 Ko\u010disk\u00fd and Karl Moritz Hermann and Chris Dyer and Phil Blunsom},\nyear={2019},\nurl={https://openreview.net/forum?id=rklwwo05Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer4", "site": "https://openreview.net/forum?id=rklwwo05Ym", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;2;3", "wc_review": "553;108;253", "wc_reply_reviewers": "0;0;108", "wc_reply_authors": "273;137;254", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 304.6666666666667, 185.30755216366356 ], "wc_reply_reviewers_avg": [ 36.0, 50.91168824543142 ], "wc_reply_authors_avg": [ 221.33333333333334, 60.13503323539634 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13779156736675663380&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "title": "Deep Graph Infomax", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/782", "id": "rklz9iAcKQ", "author_site": "Petar Veli\u010dkovi\u0107, William Fedus, William L Hamilton, Pietro Li\u00f2, Yoshua Bengio, R Devon Hjelm", "tldr": "A new method for unsupervised representation learning on graphs, relying on maximizing mutual information between local and global representations in a graph. State-of-the-art results, competitive with supervised learning.", "abstract": "We present Deep Graph Infomax (DGI), a general approach for learning node representations within graph-structured data in an unsupervised manner. DGI relies on maximizing mutual information between patch representations and corresponding high-level summaries of graphs---both derived using established graph convolutional network architectures. The learnt patch representations summarize subgraphs centered around nodes of interest, and can thus be reused for downstream node-wise learning tasks. In contrast to most prior approaches to unsupervised learning with GCNs, DGI does not rely on random walk objectives, and is readily applicable to both transductive and inductive learning setups. We demonstrate competitive performance on a variety of node classification benchmarks, which at times even exceeds the performance of supervised learning.", "keywords": "Unsupervised Learning;Graph Neural Networks;Graph Convolutions;Mutual Information;Infomax;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Petar Veli\u010dkovi\u0107;William Fedus;William L. 
Hamilton;Pietro Li\u00f2;Yoshua Bengio;R Devon Hjelm", "authorids": "petar.velickovic@cst.cam.ac.uk;liam.fedus@gmail.com;wleif@stanford.edu;pietro.lio@cst.cam.ac.uk;yoshua.umontreal@gmail.com;devon.hjelm@microsoft.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nveli\u010dkovi\u01072018deep,\ntitle={Deep Graph Infomax},\nauthor={Petar Veli\u010dkovi\u0107 and William Fedus and William L. Hamilton and Pietro Li\u00f2 and Yoshua Bengio and R Devon Hjelm},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rklz9iAcKQ},\n}", "github": "[![github](/images/github_icon.svg) PetarV-/DGI](https://github.com/PetarV-/DGI) + [![Papers with Code](/images/pwc_icon.svg) 10 community implementations](https://paperswithcode.com/paper/?openreview=rklz9iAcKQ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;7;9", "confidence": "4;3;4", "wc_review": "298;526;108", "wc_reply_reviewers": "215;192;0", "wc_reply_authors": "604;679;36", "reply_reviewers": "1;2;0", "reply_authors": "2;2;1", "rating_avg": [ 7.0, 1.632993161855452 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 310.6666666666667, 170.8826758009392 ], "wc_reply_reviewers_avg": [ 135.66666666666666, 96.38925712385634 ], "wc_reply_authors_avg": [ 439.6666666666667, 287.07296788253836 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1037, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4789130045609264841&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rklz9iAcKQ", "pdf": "https://openreview.net/pdf?id=rklz9iAcKQ", "email": ";;;;;", "author_num": 6 }, { "id": "rkx0g3R5tX", "title": "Partially Mutual Exclusive Softmax for Positive and Unlabeled data", "track": "main", "status": "Reject", "tldr": "Defining a partially mutual exclusive softmax loss for postive data and implementing a cooperative based sampling scheme", "abstract": "In recent years, softmax together with its fast approximations has become the de-facto loss function for deep neural networks with multiclass predictions. However, softmax is used in many problems that do not fully fit the multiclass framework and where the softmax assumption of mutually exclusive outcomes can lead to biased results. This is often the case for applications such as language modeling, next event prediction and matrix factorization, where many of the potential outcomes are not mutually exclusive, but are more likely to be independent conditionally on the state. To this end, for the set of problems with positive and unlabeled data, we propose a relaxation of the original softmax formulation, where, given the observed state, each of the outcomes are conditionally independent but share a common set of negatives. Since we operate in a regime where explicit negatives are missing, we create an adversarially-trained model of negatives and derive a new negative sampling and weighting scheme which we denote as Cooperative Importance Sampling (CIS). 
We show empirically the advantages of our newly introduced negative sampling scheme by pluging it in the Word2Vec algorithm and benching it extensively against other negative sampling schemes on both language modeling and matrix factorization tasks and show large lifts in performance.", "keywords": "Negative Sampling;Sampled Softmax;Word embeddings;Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Ugo Tanielian;Flavian vasile;Mike Gartrell", "authorids": "u.tanielian@criteo.com;f.vasile@criteo.com;m.gartrell@criteo.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ntanielian2019partially,\ntitle={Partially Mutual Exclusive Softmax for Positive and Unlabeled data},\nauthor={Ugo Tanielian and Flavian vasile and Mike Gartrell},\nyear={2019},\nurl={https://openreview.net/forum?id=rkx0g3R5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=rkx0g3R5tX", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;4", "wc_review": "280;737;308", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "489;499;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 441.6666666666667, 209.1448195761864 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 329.3333333333333, 232.90961527787746 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5ahZvX9EAOAJ:scholar.google.com/&scioq=Partially+Mutual+Exclusive+Softmax+for+Positive+and+Unlabeled+data&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rkx1m2C5YQ", "title": "Recurrent Kalman Networks: Factorized Inference in High-Dimensional Deep Feature Spaces", "track": "main", "status": "Reject", "tldr": "Kalman Filter based recurrent model for efficient state estimation, principled uncertainty handling and end to end learning of dynamic models in high dimensional spaces.", "abstract": "In order to integrate uncertainty estimates into deep time-series modelling, Kalman Filters (KFs) (Kalman et al., 1960) have been integrated with deep learning models. Yet, such approaches typically rely on approximate inference techniques such as variational inference which makes learning more complex and often less scalable due to approximation errors. We propose a new deep approach to Kalman filtering which can be learned directly in an end-to-end manner using backpropagation without additional approximations. Our approach uses a high-dimensional factorized latent state representation for which the Kalman updates simplify to scalar operations and thus avoids hard to backpropagate, computationally heavy and potentially unstable matrix inversions. Moreover, we use locally linear dynamic models to efficiently propagate the latent state to the next time\nstep. While our locally linear modelling and factorization assumptions are in general not true for the original low-dimensional state space of the system, the network finds a high-dimensional latent space where these assumptions hold to perform efficient inference. 
This state representation is learned jointly with the transition and noise models. The resulting network architecture, which we call Recurrent Kalman Network (RKN), can be used for any time-series data, similar to a LSTM (Hochreiter and Schmidhuber, 1997) but uses an explicit representation of uncertainty. As shown by our experiments, the RKN obtains much more accurate uncertainty estimates than an LSTM or Gated Recurrent Units (GRUs) (Cho et al., 2014) while also showing a slightly improved prediction performance and outperforms various recent generative models on an image imputation task.", "keywords": "state estimation;recurrent neural networks;Kalman Filter;deep learning", "primary_area": "", "supplementary_material": "", "author": "Philipp Becker;Harit Pandya;Gregor H.W. Gebhardt;Cheng Zhao;Gerhard Neumann", "authorids": "philippbecker93@googlemail.com;hpandya@lincoln.ac.uk;gebhardt@ias.tu-darmstadt.de;irobotcheng@gmail.com;gneumann@lincoln.ac.uk", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nbecker2019recurrent,\ntitle={Recurrent Kalman Networks: Factorized Inference in High-Dimensional Deep Feature Spaces},\nauthor={Philipp Becker and Harit Pandya and Gregor H.W. Gebhardt and Cheng Zhao and Gerhard Neumann},\nyear={2019},\nurl={https://openreview.net/forum?id=rkx1m2C5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkx1m2C5YQ", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;4", "wc_review": "306;304;230", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "419;526;437", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 280.0, 35.364765892999586 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 460.6666666666667, 46.77843757782235 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 128, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1178874295303321595&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "rkx8l3Cctm", "title": "Safe Policy Learning from Observations", "track": "main", "status": "Reject", "tldr": "An algorithm for learning to improve upon the behavior demonstrated by multiple unknown policies, by combining imitation learning and a novel safe policy improvement step that is resilient to value estimation errors.", "abstract": "In this paper, we consider the problem of learning a policy by observing numerous non-expert agents. Our goal is to extract a policy that, with high-confidence, acts better than the agents' average performance. Such a setting is important for real-world problems where expert data is scarce but non-expert data can easily be obtained, e.g. by crowdsourcing. Our approach is to pose this problem as safe policy improvement in reinforcement learning. First, we evaluate an average behavior policy and approximate its value function. Then, we develop a stochastic policy improvement algorithm that safely improves the average behavior. 
The primary advantages of our approach, termed Rerouted Behavior Improvement (RBI), over other safe learning methods are its stability in the presence of value estimation errors and the elimination of a policy search process. We demonstrate these advantages in the Taxi grid-world domain and in four games from the Atari learning environment.", "keywords": "learning from observations;safe reinforcement learning;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Elad Sarafian;Aviv Tamar;Sarit Kraus", "authorids": "elad.sarafian@gmail.com;avivt@berkeley.edu;sarit@cs.biu.ac.il", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsarafian2019safe,\ntitle={Safe Policy Learning from Observations},\nauthor={Elad Sarafian and Aviv Tamar and Sarit Kraus},\nyear={2019},\nurl={https://openreview.net/forum?id=rkx8l3Cctm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rkx8l3Cctm", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;4", "wc_review": "187;276;430", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "734;616;1257", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 297.6666666666667, 100.3803876373379 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 869.0, 278.5546026664551 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7995923158853913270&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rkxJus0cFX", "title": "RedSync : Reducing Synchronization Traffic for Distributed Deep Learning", "track": "main", "status": "Reject", "tldr": "We proposed an implementation to accelerate DNN data parallel training by reducing communication bandwidth requirement.", "abstract": "Data parallelism has become a dominant method to scale Deep Neural Network (DNN) training across multiple nodes. Since the synchronization of the local models or gradients can be a bottleneck for large-scale distributed training, compressing communication traffic has gained widespread attention recently. Among several recent proposed compression algorithms, \nResidual Gradient Compression (RGC) is one of the most successful approaches---it can significantly compress the transmitting message size (0.1% of the gradient size) of each node and still preserve accuracy. However, the literature on compressing deep networks focuses almost exclusively on achieving good compression rate, while the efficiency of RGC in real implementation has been less investigated. In this paper, we develop an RGC method that achieves significant training time improvement in real-world multi-GPU systems. Our proposed RGC system design called RedSync, introduces a set of optimizations to reduce communication bandwidth while introducing limited overhead. We examine the performance of RedSync on two different multiple GPU platforms, including a supercomputer and a multi-card server. Our test cases include image classification on Cifar10 and ImageNet, and language modeling tasks on Penn Treebank and Wiki2 datasets. 
For DNNs featured with high communication to computation ratio, which has long been considered with poor scalability, RedSync shows significant performance improvement.", "keywords": "Data parallel;Deep Learning;Multiple GPU system;Communication Compression;Sparsification;Quantization", "primary_area": "", "supplementary_material": "", "author": "Jiarui Fang;Cho-Jui Hsieh", "authorids": "fang_jiarui@163.com;rainfarmer@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nfang2019redsync,\ntitle={RedSync : Reducing Synchronization Traffic for Distributed Deep Learning},\nauthor={Jiarui Fang and Cho-Jui Hsieh},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxJus0cFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkxJus0cFX", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;4;3", "wc_review": "254;174;236", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "38;116;88", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 221.33333333333334, 34.26692606905706 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 80.66666666666667, 32.262809686834295 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "Theoretical Analysis of Auto Rate-Tuning by Batch Normalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/960", "id": "rkxQ-nA9FX", "author_site": "Sanjeev Arora, Zhiyuan Li, Kaifeng Lyu", "tldr": "We give a theoretical analysis of the ability of batch normalization to automatically tune learning rates, in the context of finding stationary points for a deep learning objective.", "abstract": "Batch Normalization (BN) has become a cornerstone of deep learning across diverse architectures, appearing to help optimization as well as generalization. While the idea makes intuitive sense, theoretical analysis of its effectiveness has been lacking. Here theoretical support is provided for one of its conjectured properties, namely, the ability to allow gradient descent to succeed with less tuning of learning rates. It is shown that even if we fix the learning rate of scale-invariant parameters (e.g., weights of each layer with BN) to a constant (say, 0.3), gradient descent still approaches a stationary point (i.e., a solution where gradient is zero) in the rate of T^{\u22121/2} in T iterations, asymptotically matching the best bound for gradient descent with well-tuned learning rates. 
A similar result with convergence rate T^{\u22121/4} is also shown for stochastic gradient descent.", "keywords": "batch normalization;scale invariance;learning rate;stationary point", "primary_area": "", "supplementary_material": "", "author": "Sanjeev Arora;Zhiyuan Li;Kaifeng Lyu", "authorids": "arora@cs.princeton.edu;zhiyuanli@cs.princeton.edu;vfleaking@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\narora2018theoretical,\ntitle={Theoretical Analysis of Auto Rate-Tuning by Batch Normalization},\nauthor={Sanjeev Arora and Zhiyuan Li and Kaifeng Lyu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxQ-nA9FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer4", "pdf_size": 0, "rating": "7;7;7", "confidence": "4;2;2", "wc_review": "995;124;233", "wc_reply_reviewers": "175;1;0", "wc_reply_authors": "507;147;23", "reply_reviewers": "1;1;0", "reply_authors": "2;1;1", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 450.6666666666667, 387.4655540245552 ], "wc_reply_reviewers_avg": [ 58.666666666666664, 82.26110191876164 ], "wc_reply_authors_avg": [ 225.66666666666666, 205.27271832586254 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 139, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12820662183792985320&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rkxQ-nA9FX", "pdf": "https://openreview.net/pdf?id=rkxQ-nA9FX", "email": ";;", "author_num": 3 }, { "title": "Per-Tensor Fixed-Point Quantization of the Back-Propagation Algorithm", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/747", "id": "rkxaNjA9Ym", "author_site": "Charbel Sakr, Naresh Shanbhag", "tldr": "We analyze and determine the precision requirements for training neural networks when all tensors, including back-propagated signals and weight accumulators, are quantized to fixed-point format.", "abstract": "The high computational and parameter complexity of neural networks makes their training very slow and difficult to deploy on energy and storage-constrained comput- ing systems. Many network complexity reduction techniques have been proposed including fixed-point implementation. However, a systematic approach for design- ing full fixed-point training and inference of deep neural networks remains elusive. We describe a precision assignment methodology for neural network training in which all network parameters, i.e., activations and weights in the feedforward path, gradients and weight accumulators in the feedback path, are assigned close to minimal precision. The precision assignment is derived analytically and enables tracking the convergence behavior of the full precision training, known to converge a priori. Thus, our work leads to a systematic methodology of determining suit- able precision for fixed-point training. The near optimality (minimality) of the resulting precision assignment is validated empirically for four networks on the CIFAR-10, CIFAR-100, and SVHN datasets. 
The complexity reduction arising from our approach is compared with other fixed-point neural network designs.", "keywords": "deep learning;reduced precision;fixed-point;quantization;back-propagation algorithm", "primary_area": "", "supplementary_material": "", "author": "Charbel Sakr;Naresh Shanbhag", "authorids": "sakr2@illinois.edu;shanbhag@illinois.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nsakr2018pertensor,\ntitle={Per-Tensor Fixed-Point Quantization of the Back-Propagation Algorithm},\nauthor={Charbel Sakr and Naresh Shanbhag},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxaNjA9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3", "pdf_size": 0, "rating": "3;7;8", "confidence": "2;3;4", "wc_review": "219;681;227", "wc_reply_reviewers": "175;0;0", "wc_reply_authors": "951;1386;512", "reply_reviewers": "1;0;0", "reply_authors": "3;3;2", "rating_avg": [ 6.0, 2.160246899469287 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 375.6666666666667, 215.92797152960267 ], "wc_reply_reviewers_avg": [ 58.333333333333336, 82.49579113843053 ], "wc_reply_authors_avg": [ 949.6666666666666, 356.81025147206117 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 57, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14777632566024300455&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=rkxaNjA9Ym", "pdf": "https://openreview.net/pdf?id=rkxaNjA9Ym", "email": ";", "author_num": 2 }, { "title": "FUNCTIONAL VARIATIONAL BAYESIAN NEURAL NETWORKS", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1035", "id": "rkxacs0qY7", "author_site": "Shengyang Sun, Guodong Zhang, Jiaxin Shi, Roger Grosse", "tldr": "We perform functional variational inference on the stochastic processes defined by Bayesian neural networks.", "abstract": "Variational Bayesian neural networks (BNN) perform variational inference over weights, but it is difficult to specify meaningful priors and approximating posteriors in a high-dimensional weight space. We introduce functional variational Bayesian neural networks (fBNNs), which maximize an Evidence Lower BOund (ELBO) defined directly on stochastic processes, i.e. distributions over functions. We prove that the KL divergence between stochastic processes is equal to the supremum of marginal KL divergences over all finite sets of inputs. Based on this, we introduce a practical training objective which approximates the functional ELBO using finite measurement sets and the spectral Stein gradient estimator. With fBNNs, we can specify priors which entail rich structure, including Gaussian processes and implicit stochastic processes. 
Empirically, we find that fBNNs extrapolate well using various structured priors, provide reliable uncertainty estimates, and can scale to large datasets.", "keywords": "functional variational inference;Bayesian neural networks;stochastic processes", "primary_area": "", "supplementary_material": "", "author": "Shengyang Sun;Guodong Zhang;Jiaxin Shi;Roger Grosse", "authorids": "ssy@cs.toronto.edu;gdzhang.cs@gmail.com;shijx15@mails.tsinghua.edu.cn;rgrosse@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nsun2018functional,\ntitle={{FUNCTIONAL} {VARIATIONAL} {BAYESIAN} {NEURAL} {NETWORKS}},\nauthor={Shengyang Sun and Guodong Zhang and Jiaxin Shi and Roger Grosse},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxacs0qY7},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rkxacs0qY7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;4", "wc_review": "1409;495;634", "wc_reply_reviewers": "203;0;0", "wc_reply_authors": "755;684;732", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 846.0, 402.12518780432873 ], "wc_reply_reviewers_avg": [ 67.66666666666667, 95.69511772057942 ], "wc_reply_authors_avg": [ 723.6666666666666, 29.578520735305357 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 325, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11345668122445712961&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rkxacs0qY7", "pdf": "https://openreview.net/pdf?id=rkxacs0qY7", "email": ";;;", "author_num": 4 }, { "title": "NADPEx: An on-policy temporally consistent exploration method for deep reinforcement learning", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/723", "id": "rkxciiC9tm", "author_site": "Sirui Xie, Junning Huang, Lanxin Lei, Chunxiao Liu, Zheng Ma, Wei Zhang, Liang Lin", "tldr": "", "abstract": "Reinforcement learning agents need exploratory behaviors to escape from local optima. These behaviors may include both immediate dithering perturbation and temporally consistent exploration. To achieve these, a stochastic policy model that is inherently consistent through a period of time is in desire, especially for tasks with either sparse rewards or long term information. In this work, we introduce a novel on-policy temporally consistent exploration strategy - Neural Adaptive Dropout Policy Exploration (NADPEx) - for deep reinforcement learning agents. Modeled as a global random variable for conditional distribution, dropout is incorporated to reinforcement learning policies, equipping them with inherent temporal consistency, even when the reward signals are sparse. Two factors, gradients' alignment with the objective and KL constraint in policy space, are discussed to guarantee NADPEx policy's stable improvement. 
Our experiments demonstrate that NADPEx solves tasks with sparse reward while naive exploration and parameter noise fail. It yields as well or even faster convergence in the standard mujoco benchmark for continuous control. ", "keywords": "Reinforcement learning;exploration", "primary_area": "", "supplementary_material": "", "author": "Sirui Xie;Junning Huang;Lanxin Lei;Chunxiao Liu;Zheng Ma;Wei Zhang;Liang Lin", "authorids": "xiesirui@sensetime.com;huangjunning@sensetime.com;leilanxin@sensetime.com;liuchunxiao@sensetime.com;mazheng@sensetime.com;wayne.zhang@sensetime.com;linliang@ieee.org", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nxie2018nadpex,\ntitle={{NADPE}x: An on-policy temporally consistent exploration method for deep reinforcement learning},\nauthor={Sirui Xie and Junning Huang and Lanxin Lei and Chunxiao Liu and Zheng Ma and Wei Zhang and Liang Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxciiC9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;3;3", "wc_review": "416;235;194", "wc_reply_reviewers": "290;0;0", "wc_reply_authors": "1151;455;280", "reply_reviewers": "2;0;0", "reply_authors": "3;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 281.6666666666667, 96.45148463807537 ], "wc_reply_reviewers_avg": [ 96.66666666666667, 136.7073110293992 ], "wc_reply_authors_avg": [ 628.6666666666666, 376.19173598342405 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16547616825713719585&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rkxciiC9tm", "pdf": "https://openreview.net/pdf?id=rkxciiC9tm", "email": ";;;;;;", "author_num": 7 }, { "id": "rkxd2oR9Y7", "title": "The Case for Full-Matrix Adaptive Regularization", "track": "main", "status": "Reject", "tldr": "fast, truly scalable full-matrix AdaGrad/Adam, with theory for adaptive stochastic non-convex optimization", "abstract": "Adaptive regularization methods pre-multiply a descent direction by a preconditioning matrix. Due to the large number of parameters of machine learning problems, full-matrix preconditioning methods are prohibitively expensive. We show how to modify full-matrix adaptive regularization in order to make it practical and effective. We also provide novel theoretical analysis\nfor adaptive regularization in non-convex optimization settings. The core of our algorithm, termed GGT, consists of efficient inverse computation of square roots of low-rank matrices. 
Our preliminary experiments underscore improved convergence rate of GGT across a variety of synthetic tasks and standard deep learning benchmarks.", "keywords": "adaptive regularization;non-convex optimization", "primary_area": "", "supplementary_material": "", "author": "Naman Agarwal;Brian Bullins;Xinyi Chen;Elad Hazan;Karan Singh;Cyril Zhang;Yi Zhang", "authorids": "namanagarwal@google.com;bbullins@cs.princeton.edu;xinyic@google.com;ehazan@cs.princeton.edu;karans@cs.princeton.edu;cyril.zhang@cs.princeton.edu;y.zhang@cs.princeton.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nagarwal2019the,\ntitle={The Case for Full-Matrix Adaptive Regularization},\nauthor={Naman Agarwal and Brian Bullins and Xinyi Chen and Elad Hazan and Karan Singh and Cyril Zhang and Yi Zhang},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxd2oR9Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkxd2oR9Y7", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;3", "wc_review": "456;341;344", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "480;355;130", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 380.3333333333333, 53.51842880935708 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 321.6666666666667, 144.81789330818972 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7809425922077876419&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "rkxdpiA5Ym", "title": "Diagnosing Language Inconsistency in Cross-Lingual Word Embeddings", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Cross-lingual embeddings encode meaning of words from different languages into a shared low-dimensional space. However, despite numerous applications, evaluation of such embeddings is limited. We focus on diagnosing the problem of words segregated by languages in cross-lingual embeddings. In an ideal cross-lingual embedding, word similarity should be independent of language---i.e., words within a language should not be more similar to each other than to words in another language. One test of this is modularity, a network measurement that measures the strength of clusters in a graph. When we apply this measure to a nearest neighbor graph, imperfect cross-lingual embeddings are sorted into modular, distinct regions. The correlation of this measurement with accuracy on two downstream tasks demonstrates that modularity can serve as an intrinsic metric of embedding quality.", "keywords": "cross-lingual embeddings;evaluation;graph-based metric;modularity", "primary_area": "", "supplementary_material": "", "author": "Yoshinari Fujinuma;Jordan Boyd-Graber;Michael J. 
Paul", "authorids": "yoshinari.fujinuma@colorado.edu;jbg@umiacs.umd.edu;michael.j.paul@colorado.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkxdpiA5Ym", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;4;4", "wc_review": "616;518;151", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 428.3333333333333, 200.14383716606304 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:vpruuzkF0vUJ:scholar.google.com/&scioq=Diagnosing+Language+Inconsistency+in+Cross-Lingual+Word+Embeddings&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "rkxfjjA5Km", "title": "Auto-Encoding Knockoff Generator for FDR Controlled Variable Selection", "track": "main", "status": "Reject", "tldr": "This paper provide model free method for generating Knockoffs, which is critical step in Model-X procedure to choose important variables with any supervised learning method under rigorous FDR control.", "abstract": "A new statistical procedure (Cande\u0300s,2018) has provided a way to identify important factors using any supervised learning method controlling for FDR. This line of research has shown great potential to expand the horizon of machine learning methods beyond the task of prediction, to serve the broader need for scientific researches for interpretable findings. However, the lack of a practical and flexible method to generate knockoffs remains the major obstacle for wide application of Model-X procedure. This paper fills in the gap by proposing a model-free knockoff generator which approximates the correlation structure between features through latent variable representation. 
We demonstrate our proposed method can achieve FDR control and better power than two existing methods in various simulated settings and a real data example for finding mutations associated with drug resistance in HIV-1 patients.\n\n", "keywords": "Model-X Knockoff Generator;model-free FDR control;variable selection", "primary_area": "", "supplementary_material": "", "author": "Ying Liu;Cheng Zheng", "authorids": "summeryingl@gmail.com;zzhengccheng@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nliu2019autoencoding,\ntitle={Auto-Encoding Knockoff Generator for {FDR} Controlled Variable Selection},\nauthor={Ying Liu and Cheng Zheng},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxfjjA5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rkxfjjA5Km", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;4;3", "wc_review": "370;464;220", "wc_reply_reviewers": "202;53;0", "wc_reply_authors": "1346;598;355", "reply_reviewers": "1;1;0", "reply_authors": "3;2;1", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 351.3333333333333, 100.4832766627805 ], "wc_reply_reviewers_avg": [ 85.0, 85.514131385793 ], "wc_reply_authors_avg": [ 766.3333333333334, 421.72054991691147 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14987795470203139640&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rkxhX209FX", "title": "An Active Learning Framework for Efficient Robust Policy Search", "track": "main", "status": "Reject", "tldr": "An Active Learning framework that leads to efficient robust RL and opens up possibilities in Multi-Task RL", "abstract": "Robust Policy Search is the problem of learning policies that do not degrade in performance when subject to unseen environment model parameters. It is particularly relevant for transferring policies learned in a simulation environment to the real world. Several existing approaches involve sampling large batches of trajectories which reflect the differences in various possible environments, and then selecting some subset of these to learn robust policies, such as the ones that result in the worst performance. We propose an active learning based framework, EffAcTS, to selectively choose model parameters for this purpose so as to collect only as much data as necessary to select such a subset. We apply this framework to an existing method, namely EPOpt, and experimentally validate the gains in sample efficiency and the performance of our approach on standard continuous control tasks. 
We also present a Multi-Task Learning perspective to the problem of Robust Policy Search, and draw connections from our proposed framework to existing work on Multi-Task Learning.", "keywords": "Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Sai Kiran Narayanaswami;Nandan Sudarsanam;Balaraman Ravindran", "authorids": "saikirann94@gmail.com;nandan@iitm.ac.in;ravi@cse.iitm.ac.in", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nnarayanaswami2019an,\ntitle={An Active Learning Framework for Efficient Robust Policy Search},\nauthor={Sai Kiran Narayanaswami and Nandan Sudarsanam and Balaraman Ravindran},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxhX209FX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkxhX209FX", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;3", "wc_review": "378;133;434", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "767;315;578", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 315.0, 130.70832669216858 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 553.3333333333334, 185.35071860904367 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10189614833388517731&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4 }, { "id": "rkxjnjA5KQ", "title": "Transfer Learning for Related Reinforcement Learning Tasks via Image-to-Image Translation", "track": "main", "status": "Reject", "tldr": "We propose a method of transferring knowledge between related RL tasks using visual mappings, and demonstrate its effectiveness on visual variants of the Atari Breakout game and different levels of Road Fighter, a Nintendo car driving game.", "abstract": "Deep Reinforcement Learning has managed to achieve state-of-the-art results in learning control policies directly from raw pixels. However, despite its remarkable success, it fails to generalize, a fundamental component required in a stable Artificial Intelligence system. Using the Atari game Breakout, we demonstrate the difficulty of a trained agent in adjusting to simple modifications in the raw image, ones that a human could adapt to trivially. In transfer learning, the goal is to use the knowledge gained from the source task to make the training of the target task faster and better. We show that using various forms of fine-tuning, a common method for transfer learning, is not effective for adapting to such small visual changes. In fact, it is often easier to re-train the agent from scratch than to fine-tune a trained agent. We suggest that in some cases transfer learning can be improved by adding a dedicated component whose goal is to learn to visually map between the known domain and the new one. Concretely, we use Unaligned Generative Adversarial Networks (GANs) to create a mapping function to translate images in the target task to corresponding images in the source task. 
These mapping functions allow us to transform between various variations of the Breakout game, as well as between different levels of a Nintendo game, Road Fighter. We show that learning this mapping is substantially more efficient than re-training. A visualization of a trained agent playing Breakout and Road Fighter, with and without the GAN transfer, can be seen in \\url{https://streamable.com/msgtm} and \\url{https://streamable.com/5e2ka}.", "keywords": "Transfer Learning;Reinforcement Learning;Generative Adversarial Networks;Video Games", "primary_area": "", "supplementary_material": "", "author": "Shani Gamrian;Yoav Goldberg", "authorids": "gamrianshani@gmail.com;yoav.goldberg@gmail.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ngamrian2019transfer,\ntitle={Transfer Learning for Related Reinforcement Learning Tasks via Image-to-Image Translation},\nauthor={Shani Gamrian and Yoav Goldberg},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxjnjA5KQ},\n}", "github": "[![github](/images/github_icon.svg) ShaniGam/RL-GAN](https://github.com/ShaniGam/RL-GAN) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rkxjnjA5KQ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkxjnjA5KQ", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;3;3", "wc_review": "1222;215;334", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1723;84;191", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 590.3333333333334, 449.2900572631844 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 666.0, 748.6872956493029 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 144, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9611056051873190205&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7 }, { "id": "rkxkHnA5tX", "title": "Learning from Noisy Demonstration Sets via Meta-Learned Suitability Assessor", "track": "main", "status": "Reject", "tldr": "We propose a framework to learn a good policy through imitation learning from a noisy demonstration set via meta-training a demonstration suitability assessor.", "abstract": "A noisy and diverse demonstration set may hinder the performances of an agent aiming to acquire certain skills via imitation learning. However, state-of-the-art imitation learning algorithms often assume the optimality of the given demonstration set.\nIn this paper, we address such optimal assumption by learning only from the most suitable demonstrations in a given set. Suitability of a demonstration is estimated by whether imitating it produce desirable outcomes for achieving the goals of the tasks. For more efficient demonstration suitability assessments, the learning agent should be capable of imitating a demonstration as quick as possible, which shares similar spirit with fast adaptation in the meta-learning regime. Our framework, thus built on top of Model-Agnostic Meta-Learning, evaluates how desirable the imitated outcomes are, after adaptation to each demonstration in the set. 
The resulting assessments hence enable us to select suitable demonstration subsets for acquiring better imitated skills. The videos related to our experiments are available at: https://sites.google.com/view/deepdj", "keywords": "Imitation Learning;Noisy Demonstration Set;Meta-Learning", "primary_area": "", "supplementary_material": "", "author": "Te-Lin Wu;Jaedong Hwang;Jingyun Yang;Shaofan Lai;Carl Vondrick;Joseph J. Lim", "authorids": "telinwu@usc.edu;jd730@snu.ac.kr;jingyuny@usc.edu;shaofanl@usc.edu;vondrick@cs.columbia.edu;limjj@usc.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nwu2019learning,\ntitle={Learning from Noisy Demonstration Sets via Meta-Learned Suitability Assessor},\nauthor={Te-Lin Wu and Jaedong Hwang and Jingyun Yang and Shaofan Lai and Carl Vondrick and Joseph J. Lim},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxkHnA5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer4", "site": "https://openreview.net/forum?id=rkxkHnA5tX", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;4;4", "wc_review": "172;444;267", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 294.3333333333333, 112.71300822097785 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:58dwkoFGRaEJ:scholar.google.com/&scioq=Learning+from+Noisy+Demonstration+Sets+via+Meta-Learned+Suitability+Assessor&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "rkxn7nR5KX", "title": "Incremental Few-Shot Learning with Attention Attractor Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Machine learning classifiers are often trained to recognize a set of pre-defined classes. However,\nin many real applications, it is often desirable to have the flexibility of learning additional\nconcepts, without re-training on the full training set. This paper addresses this problem,\nincremental few-shot learning, where a regular classification network has already been trained to\nrecognize a set of base classes; and several extra novel classes are being considered, each with\nonly a few labeled examples. After learning the novel classes, the model is then evaluated on the\noverall performance of both base and novel classes. To this end, we propose a meta-learning model,\nthe Attention Attractor Network, which regularizes the learning of novel classes. In each episode,\nwe train a set of new weights to recognize novel classes until they converge, and we show that the\ntechnique of recurrent back-propagation can back-propagate through the optimization process and\nfacilitate the learning of the attractor network regularizer. 
We demonstrate that the learned\nattractor network can recognize novel classes while remembering old classes without the need to\nreview the original training set, outperforming baselines that do not rely on an iterative\noptimization process.", "keywords": "meta-learning;few-shot learning;incremental learning", "primary_area": "", "supplementary_material": "", "author": "Mengye Ren;Renjie Liao;Ethan Fetaya;Richard S. Zemel", "authorids": "mren@cs.toronto.edu;rjliao@cs.toronto.edu;ethanf@cs.toronto.edu;zemel@cs.toronto.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nren2019incremental,\ntitle={Incremental Few-Shot Learning with Attention Attractor Networks},\nauthor={Mengye Ren and Renjie Liao and Ethan Fetaya and Richard S. Zemel},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxn7nR5KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rkxn7nR5KX", "pdf_size": 0, "rating": "5;5;5", "confidence": "4;3;5", "wc_review": "380;230;321", "wc_reply_reviewers": "459;0;152", "wc_reply_authors": "414;463;306", "reply_reviewers": "1;0;1", "reply_authors": "2;2;1", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 310.3333333333333, 61.69999099585743 ], "wc_reply_reviewers_avg": [ 203.66666666666666, 190.91417501647754 ], "wc_reply_authors_avg": [ 394.3333333333333, 65.5862451704285 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 217, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13601757233344695275&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "title": "SPIGAN: Privileged Adversarial Learning from Simulation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/779", "id": "rkxoNnC5FQ", "author_site": "Kuan-Hui Lee, German Ros, Jie Li, Adrien Gaidon", "tldr": "An unsupervised sim-to-real domain adaptation method for semantic segmentation using privileged information from a simulator with GAN-based image translation.", "abstract": "Deep Learning for Computer Vision depends mainly on the source of supervision. Photo-realistic simulators can generate large-scale automatically labeled synthetic data, but introduce a domain gap negatively impacting performance. We propose a new unsupervised domain adaptation algorithm, called SPIGAN, relying on Simulator Privileged Information (PI) and Generative Adversarial Networks (GAN). We use internal data from the simulator as PI during the training of a target task network. We experimentally evaluate our approach on semantic segmentation. We train the networks on real-world Cityscapes and Vistas datasets, using only unlabeled real-world images and synthetic labeled data with z-buffer (depth) PI from the SYNTHIA dataset. 
Our method improves over no adaptation and state-of-the-art unsupervised domain adaptation techniques.", "keywords": "domain adaptation;GAN;semantic segmentation;simulation;privileged information", "primary_area": "", "supplementary_material": "", "author": "Kuan-Hui Lee;German Ros;Jie Li;Adrien Gaidon", "authorids": "kuan.lee@tri.global;germanros1987@gmail.com;jie.li@tri.global;adrien.gaidon@tri.global", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlee2018spigan,\ntitle={{SPIGAN}: Privileged Adversarial Learning from Simulation},\nauthor={Kuan-Hui Lee and German Ros and Jie Li and Adrien Gaidon},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxoNnC5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;5", "wc_review": "511;300;765", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "905;623;660", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 525.3333333333334, 190.10581848597434 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 729.3333333333334, 125.13015446148684 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 150, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4636284905704356497&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rkxoNnC5FQ", "pdf": "https://openreview.net/pdf?id=rkxoNnC5FQ", "email": ";;;", "author_num": 4 }, { "id": "rkxraoRcF7", "title": "Learning Disentangled Representations with Reference-Based Variational Autoencoders", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning disentangled representations from visual data, where different high-level generative factors are independently encoded, is of importance for many computer vision tasks. Supervised approaches, however, require a significant annotation effort in order to label the factors of interest in a training set. To alleviate the annotation cost, we introduce a learning setting which we refer to as \"reference-based disentangling''. Given a pool of unlabelled images, the goal is to learn a representation where a set of target factors are disentangled from others. The only supervision comes from an auxiliary \"reference set'' that contains images where the factors of interest are constant. In order to address this problem, we propose reference-based variational autoencoders, a novel deep generative model designed to exploit the weak supervisory signal provided by the reference set. During training, we use the variational inference framework where adversarial learning is used to minimize the objective function. 
By addressing tasks such as feature learning, conditional image generation or attribute transfer, we validate the ability of the proposed model to learn disentangled representations from minimal supervision.\n\n", "keywords": "Disentangled representations;Variational Autoencoders;Adversarial Learning;Weakly-supervised learning", "primary_area": "", "supplementary_material": "", "author": "Adria Ruiz;Oriol Martinez;Xavier Binefa;Jakob Verbeek", "authorids": "adria.ruiz-ovejero@inria.fr;oriol.martinez@upf.edu;xavier.binefa@upf.edu;jakob.verbeek@inria.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nruiz2019learning,\ntitle={Learning Disentangled Representations with Reference-Based Variational Autoencoders},\nauthor={Adria Ruiz and Oriol Martinez and Xavier Binefa and Jakob Verbeek},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxraoRcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkxraoRcF7", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;3;4", "wc_review": "551;408;537", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1222;912;811", "reply_reviewers": "0;0;0", "reply_authors": "3;2;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 498.6666666666667, 64.36527704351849 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 981.6666666666666, 174.87201669284374 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3832926725845496396&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8 }, { "id": "rkxt8oC9FQ", "title": "Perfect Match: A Simple Method for Learning Representations For Counterfactual Inference With Neural Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "Learning representations for counterfactual inference from observational data is of high practical relevance for many domains, such as healthcare, public policy and economics. Counterfactual inference enables one to answer \"What if...?\" questions, such as \"What would be the outcome if we gave this patient treatment $t_1$?\". However, current methods for training neural networks for counterfactual inference on observational data are either overly complex, limited to settings with only two available treatment options, or both. Here, we present Perfect Match (PM), a method for training neural networks for counterfactual inference that is easy to implement, compatible with any architecture, does not add computational complexity or hyperparameters, and extends to any number of treatments. PM is based on the idea of augmenting samples within a minibatch with their propensity-matched nearest neighbours. 
Our experiments demonstrate that PM outperforms a number of more complex state-of-the-art methods in inferring counterfactual outcomes across several real-world and semi-synthetic datasets.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Patrick Schwab;Lorenz Linhardt;Walter Karlen", "authorids": "patrick.schwab@hest.ethz.ch;llorenz@student.ethz.ch;walter.karlen@hest.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nschwab2019perfect,\ntitle={Perfect Match: A Simple Method for Learning Representations For Counterfactual Inference With Neural Networks},\nauthor={Patrick Schwab and Lorenz Linhardt and Walter Karlen},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxt8oC9FQ},\n}", "github": "[![github](/images/github_icon.svg) d909b/perfect_match](https://github.com/d909b/perfect_match)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkxt8oC9FQ", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;3;4", "wc_review": "356;501;320", "wc_reply_reviewers": "0;656;0", "wc_reply_authors": "319;2434;391", "reply_reviewers": "0;3;0", "reply_authors": "1;4;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 392.3333333333333, 78.2318491550738 ], "wc_reply_reviewers_avg": [ 218.66666666666666, 309.2413656389168 ], "wc_reply_authors_avg": [ 1048.0, 980.4906934795455 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 134, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6433751529871513369&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rkxtl3C5YX", "title": "Understanding & Generalizing AlphaGo Zero", "track": "main", "status": "Reject", "tldr": "", "abstract": "AlphaGo Zero (AGZ) introduced a new {\\em tabula rasa} reinforcement learning algorithm that has achieved superhuman performance in the games of Go, Chess, and Shogi with no prior knowledge other than the rules of the game. This success naturally begs the question whether it is possible to develop similar high-performance reinforcement learning algorithms for generic sequential decision-making problems (beyond two-player games), using only the constraints of the environment as the ``rules.'' To address this challenge, we start by taking steps towards developing a formal understanding of AGZ. AGZ includes two key innovations: (1) it learns a policy (represented as a neural network) using {\\em supervised learning} with cross-entropy loss from samples generated via Monte-Carlo Tree Search (MCTS); (2) it uses {\\em self-play} to learn without training data. \n\nWe argue that the self-play in AGZ corresponds to learning a Nash equilibrium for the two-player game; and the supervised learning with MCTS is attempting to learn the policy corresponding to the Nash equilibrium, by establishing a novel bound on the difference between the expected return achieved by two policies in terms of the expected KL divergence (cross-entropy) of their induced distributions. 
To extend AGZ to generic sequential decision-making problems, we introduce a {\\em robust MDP} framework, in which the agent and nature effectively play a zero-sum game: the agent aims to take actions to maximize reward while nature seeks state transitions, subject to the constraints of that environment, that minimize the agent's reward. For a challenging network scheduling domain, we find that AGZ within the robust MDP framework provides near-optimal performance, matching one of the best known scheduling policies that has taken the networking community three decades of intensive research to develop.\n", "keywords": "reinforcement learning;AlphaGo Zero", "primary_area": "", "supplementary_material": "", "author": "Ravichandra Addanki;Mohammad Alizadeh;Shaileshh Bojja Venkatakrishnan;Devavrat Shah;Qiaomin Xie;Zhi Xu", "authorids": "addanki@mit.edu;alizadeh@csail.mit.edu;bjjvnkt@csail.mit.edu;devavrat@mit.edu;qxie@mit.edu;zhixu@mit.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\naddanki2019understanding,\ntitle={Understanding & Generalizing AlphaGo Zero},\nauthor={Ravichandra Addanki and Mohammad Alizadeh and Shaileshh Bojja Venkatakrishnan and Devavrat Shah and Qiaomin Xie and Zhi Xu},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxtl3C5YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rkxtl3C5YX", "pdf_size": 0, "rating": "5;5;7", "confidence": "3;5;4", "wc_review": "293;404;246", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "778;324;56", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 314.3333333333333, 66.24365797736581 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 386.0, 297.99776285513735 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:2olYUU8A6zwJ:scholar.google.com/&scioq=Understanding+%26+Generalizing+AlphaGo+Zero&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "rkxusjRctQ", "title": "Learning models for visual 3D localization with implicit mapping", "track": "main", "status": "Reject", "tldr": "We propose a generative approach based on Generative Query Networks + attention for localization with implicit mapping, and compare to a discriminative baseline with a similar architecture.", "abstract": "We consider learning based methods for visual localization that do not require the construction of explicit maps in the form of point clouds or voxels. The goal is to learn an implicit representation of the environment at a higher, more abstract level, for instance that of objects. We propose to use a generative approach based on Generative Query Networks (GQNs, Eslami et al. 2018), asking the following questions: 1) Can GQN capture more complex scenes than those it was originally demonstrated on? 2) Can GQN be used for localization in those scenes? To study this approach we consider procedurally generated Minecraft worlds, for which we can generate images of complex 3D scenes along with camera pose coordinates. 
We first show that GQNs, enhanced with a novel attention mechanism can capture the structure of 3D scenes in Minecraft, as evidenced by their samples. We then apply the models to the localization problem, comparing the results to a discriminative baseline, and comparing the ways each approach captures the task uncertainty. ", "keywords": "generative learning;generative models;generative query networks;camera re-localization", "primary_area": "", "supplementary_material": "", "author": "Dan Rosenbaum;Frederic Besse;Fabio Viola;Danilo J. Rezende;S. M. Ali Eslami", "authorids": "danro@google.com;fbesse@google.com;fviola@google.com;danilor@google.com;aeslami@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nrosenbaum2019learning,\ntitle={Learning models for visual 3D localization with implicit mapping},\nauthor={Dan Rosenbaum and Frederic Besse and Fabio Viola and Danilo J. Rezende and S. M. Ali Eslami},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxusjRctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rkxusjRctQ", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;4", "wc_review": "828;321;250", "wc_reply_reviewers": "308;22;0", "wc_reply_authors": "728;461;271", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 466.3333333333333, 257.37434906290787 ], "wc_reply_reviewers_avg": [ 110.0, 140.29492744453262 ], "wc_reply_authors_avg": [ 486.6666666666667, 187.45014151916652 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14504973804663357663&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "title": "Generating Multi-Agent Trajectories using Programmatic Weak Supervision", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/985", "id": "rkxw-hAcFQ", "author_site": "Eric Zhan, Stephan Zheng, Yisong Yue, Long Sha, Patrick Lucey", "tldr": "We blend deep generative models with programmatic weak supervision to generate coordinated multi-agent trajectories of significantly higher quality than previous baselines.", "abstract": "We study the problem of training sequential generative models for capturing coordinated multi-agent trajectory behavior, such as offensive basketball gameplay. When modeling such settings, it is often beneficial to design hierarchical models that can capture long-term coordination using intermediate variables. Furthermore, these intermediate variables should capture interesting high-level behavioral semantics in an interpretable and manipulable way. We present a hierarchical framework that can effectively learn such sequential generative models. Our approach is inspired by recent work on leveraging programmatically produced weak labels, which we extend to the spatiotemporal regime. In addition to synthetic settings, we show how to instantiate our framework to effectively model complex interactions between basketball players and generate realistic multi-agent trajectories of basketball gameplay over long time periods. 
We validate our approach using both quantitative and qualitative evaluations, including a user study comparison conducted with professional sports analysts.", "keywords": "deep learning;generative models;imitation learning;hierarchical methods;data programming;weak supervision;spatiotemporal", "primary_area": "", "supplementary_material": "", "author": "Eric Zhan;Stephan Zheng;Yisong Yue;Long Sha;Patrick Lucey", "authorids": "ezhan@caltech.edu;stzheng@caltech.edu;yyue@caltech.edu;lsha@stats.com;plucey@stats.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nzhan2018generating,\ntitle={Generating Multi-Agent Trajectories using Programmatic Weak Supervision},\nauthor={Eric Zhan and Stephan Zheng and Yisong Yue and Long Sha and Patrick Lucey},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxw-hAcFQ},\n}", "github": "[![github](/images/github_icon.svg) ezhan94/multiagent-programmatic-supervision](https://github.com/ezhan94/multiagent-programmatic-supervision) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rkxw-hAcFQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;3", "wc_review": "346;422;60", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "335;397;208", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 276.0, 155.85463312544374 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 313.3333333333333, 78.6652542246081 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18317571777385401636&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=rkxw-hAcFQ", "pdf": "https://openreview.net/pdf?id=rkxw-hAcFQ", "email": ";;;;", "author_num": 5 }, { "title": "Label super-resolution networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/673", "id": "rkxwShA9Ym", "author_site": "Nikolay Malkin, Caleb Robinson, Le Hou, Rachel Soobitsky, Jacob Czawlytko, Dimitris Samaras, Joel Saltz, Lucas Joppa, Nebojsa Jojic", "tldr": "Super-resolving coarse labels into pixel-level labels, applied to aerial imagery and medical scans.", "abstract": "We present a deep learning-based method for super-resolving coarse (low-resolution) labels assigned to groups of image pixels into pixel-level (high-resolution) labels, given the joint distribution between those low- and high-resolution labels. This method involves a novel loss function that minimizes the distance between a distribution determined by a set of model outputs and the corresponding distribution given by low-resolution labels over the same set of outputs. This setup does not require that the high-resolution classes match the low-resolution classes and can be used in high-resolution semantic segmentation tasks where high-resolution labeled data is not available. 
Furthermore, our proposed method is able to utilize both data with low-resolution labels and any available high-resolution labels, which we show improves performance compared to a network trained only with the same amount of high-resolution data.\nWe test our proposed algorithm in a challenging land cover mapping task to super-resolve labels at a 30m resolution to a separate set of labels at a 1m resolution. We compare our algorithm with models that are trained on high-resolution data and show that 1) we can achieve similar performance using only low-resolution data; and 2) we can achieve better performance when we incorporate a small amount of high-resolution data in our training. We also test our approach on a medical imaging problem, resolving low-resolution probability maps into high-resolution segmentation of lymphocytes with accuracy equal to that of fully supervised models.", "keywords": "weakly supervised segmentation;land cover mapping;medical imaging", "primary_area": "", "supplementary_material": "", "author": "Kolya Malkin;Caleb Robinson;Le Hou;Rachel Soobitsky;Jacob Czawlytko;Dimitris Samaras;Joel Saltz;Lucas Joppa;Nebojsa Jojic", "authorids": "kolya_malkin@hotmail.com;dcrobins@gatech.edu;le.hou@stonybrook.edu;rsoobitsky@chesapeakeconservancy.org;jczawlytko@chesapeakeconservancy.org;samaras@cs.stonybrook.edu;joel.saltz@stonybrookmedicine.edu;lujoppa@microsoft.com;jojic@microsoft.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@inproceedings{\nmalkin2018label,\ntitle={Label super-resolution networks},\nauthor={Kolya Malkin and Caleb Robinson and Le Hou and Nebojsa Jojic},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkxwShA9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;9", "confidence": "4;4;4", "wc_review": "130;303;442", "wc_reply_reviewers": "0;23;0", "wc_reply_authors": "212;756;398", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 7.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 291.6666666666667, 127.62531967529885 ], "wc_reply_reviewers_avg": [ 7.666666666666667, 10.842303978193728 ], "wc_reply_authors_avg": [ 455.3333333333333, 225.75699816887675 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=632480761848779697&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=rkxwShA9Ym", "pdf": "https://openreview.net/pdf?id=rkxwShA9Ym", "email": ";;;;;;;;", "author_num": 9 }, { "title": "ANYTIME MINIBATCH: EXPLOITING STRAGGLERS IN ONLINE DISTRIBUTED OPTIMIZATION", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/970", "id": "rkzDIiA5YQ", "author_site": "Nuwan Ferdinand, Haider Al-Lawati, Stark Draper, Matthew Nokleby", "tldr": "Accelerate distributed optimization by exploiting stragglers.", "abstract": "Distributed optimization is vital in solving large-scale machine learning problems. 
A widely-shared feature of distributed optimization techniques is the requirement that all nodes complete their assigned tasks in each computational epoch before the system can proceed to the next epoch. In such settings, slow nodes, called stragglers, can greatly slow progress. To mitigate the impact of stragglers, we propose an online distributed optimization method called Anytime Minibatch. In this approach, all nodes are given a fixed time to compute the gradients of as many data samples as possible. The result is a variable per-node minibatch size. Workers then get a fixed communication time to average their minibatch gradients via several rounds of consensus, which are then used to update primal variables via dual averaging. Anytime Minibatch prevents stragglers from holding up the system without wasting the work that stragglers can complete. We present a convergence analysis and analyze the wall time performance. Our numerical results show that our approach is up to 1.5 times faster in Amazon EC2 and it is up to five times faster when there is greater variability in compute node performance.", "keywords": "distributed optimization;gradient descent;minibatch;stragglers", "primary_area": "", "supplementary_material": "", "author": "Nuwan Ferdinand;Haider Al-Lawati;Stark Draper;Matthew Nokleby", "authorids": "nuwan.ferdinand@utoronto.ca;haider.al.lawati@mail.utoronto.ca;stark.draper@utoronto.ca;matthew.nokleby@target.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nferdinand2018anytime,\ntitle={{ANYTIME} {MINIBATCH}: {EXPLOITING} {STRAGGLERS} {IN} {ONLINE} {DISTRIBUTED} {OPTIMIZATION}},\nauthor={Nuwan Ferdinand and Haider Al-Lawati and Stark Draper and Matthew Nokleby},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkzDIiA5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;4;4", "wc_review": "84;510;368", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "311;905;378", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 320.6666666666667, 177.10511630729988 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 531.3333333333334, 265.6342514728768 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1554485746213937113&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rkzDIiA5YQ", "pdf": "https://openreview.net/pdf?id=rkzDIiA5YQ", "email": ";;;", "author_num": 4 }, { "id": "rkzUYjCcFm", "title": "FAST OBJECT LOCALIZATION VIA SENSITIVITY ANALYSIS", "track": "main", "status": "Reject", "tldr": "Proposing a novel object localization(detection) approach based on interpreting the deep CNN using internal representation and network's thoughts", "abstract": "Deep Convolutional Neural Networks (CNNs) have been repeatedly shown to perform well on image classification tasks, successfully recognizing a broad array of objects when given sufficient training data. 
Methods for object localization, however, are still in need of substantial improvement. Common approaches to this problem involve the use of a sliding window, sometimes at multiple scales, providing input to a deep CNN trained to classify the contents of the window. In general, these approaches are time consuming, requiring many classification calculations. In this paper, we offer a fundamentally different approach to the localization of recognized objects in images. Our method is predicated on the idea that a deep CNN capable of recognizing an object must implicitly contain knowledge about object location in its connection weights. We provide a simple method to interpret classifier weights in the context of individual classified images. This method involves the calculation of the derivative of network generated activation patterns, such as the activation of output class label units, with regard to each in- put pixel, performing a sensitivity analysis that identifies the pixels that, in a local sense, have the greatest influence on internal representations and object recognition. These derivatives can be efficiently computed using a single backward pass through the deep CNN classifier, producing a sensitivity map of the image. We demonstrate that a simple linear mapping can be learned from sensitivity maps to bounding box coordinates, localizing the recognized object. Our experimental results, using real-world data sets for which ground truth localization information is known, reveal competitive accuracy from our fast technique.", "keywords": "Internal Representations;Sensitivity Analysis;Object Detection", "primary_area": "", "supplementary_material": "", "author": "Mohammad K. Ebrahimpour;David C. Noelle", "authorids": "mebrahimpour@ucmerced.edu;dnoelle@ucmerced.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nebrahimpour2019fast,\ntitle={{FAST} {OBJECT} {LOCALIZATION} {VIA} {SENSITIVITY} {ANALYSIS}},\nauthor={Mohammad K. Ebrahimpour and David C. Noelle},\nyear={2019},\nurl={https://openreview.net/forum?id=rkzUYjCcFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rkzUYjCcFm", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;3;5", "wc_review": "751;564;746", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 687.0, 86.99808427009566 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.6546536707079772, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6024365070514419713&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "rkzcvoA9YX", "title": "Few-Shot Learning by Exploiting Object Relation", "track": "main", "status": "Withdraw", "tldr": "Few-shot learning by exploiting the object-level relation to learn the image-level relation (similarity)", "abstract": "\nFew-shot learning trains image classifiers over datasets with few examples per category. 
\nIt poses challenges for the optimization algorithms, which typically require many examples to fine-tune the model parameters for new categories. \nDistance-learning-based approaches avoid the optimization issue by embedding the images into a metric space and applying the nearest neighbor classifier for new categories. In this paper, we propose to exploit the object-level relation to learn the image relation feature, which is converted into a distance directly.\nFor a new category, even though its images are not seen by the model, some objects may appear in the training images. Hence, object-level relation is useful for inferring the relation of images from unseen categories. Consequently, our model generalizes well for new categories without fine-tuning.\nExperimental results on benchmark datasets show that our approach outperforms state-of-the-art methods.", "keywords": "few-shot learning;relation learning", "primary_area": "", "supplementary_material": "", "author": "Liangqu Long;Wei Wang;Jun Wen;Meihui Zhang;Qian Lin", "authorids": "liangqu.long@gmail.com;wangwei@comp.nus.edu.sg;jungel2star@gmail.com;meihui_zhang@bit.edu.cn;linqian@comp.nus.edu.sg", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rkzcvoA9YX", "pdf_size": 0, "rating": "4;4;6", "confidence": "3;4;4", "wc_review": "382;119;439", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 313.3333333333333, 139.37080835749725 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14216983897071794182&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rkzfuiA9F7", "title": "Projective Subspace Networks For Few-Shot Learning", "track": "main", "status": "Reject", "tldr": "We proposed Projective Subspace Networks for few-shot and semi-supervised few-shot learning", "abstract": "Generalization from limited examples, usually studied under the umbrella of meta-learning, equips learning techniques with the ability to adapt quickly in dynamical environments and proves to be an essential aspect of lifelong learning. In this paper, we introduce the Projective Subspace Networks (PSN), a deep learning paradigm that learns non-linear embeddings from limited supervision. In contrast to previous studies, the embedding in PSN deems samples of a given class to form an affine subspace. We will show that such modeling leads to robust solutions, yielding competitive results on supervised and semi-supervised few-shot classification. Moreover, our PSN approach has the ability of end-to-end learning. 
In contrast to previous works, our projective subspace can be thought of as a richer representation capturing higher-order information datapoints for modeling new concepts.", "keywords": "few-shot;one-shot;semi-supervised;meta-learning", "primary_area": "", "supplementary_material": "", "author": "Christian Simon;Piotr Koniusz;Mehrtash Harandi", "authorids": "christian.simon@anu.edu.au;piotr.koniusz@data61.csiro.au;mehrtash.harandi@monash.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsimon2019projective,\ntitle={Projective Subspace Networks For Few-Shot Learning},\nauthor={Christian Simon and Piotr Koniusz and Mehrtash Harandi},\nyear={2019},\nurl={https://openreview.net/forum?id=rkzfuiA9F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rkzfuiA9F7", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;4", "wc_review": "1074;293;455", "wc_reply_reviewers": "0;225;44", "wc_reply_authors": "1063;419;607", "reply_reviewers": "0;1;1", "reply_authors": "2;2;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 607.3333333333334, 336.54552275864387 ], "wc_reply_reviewers_avg": [ 89.66666666666667, 97.36643272823659 ], "wc_reply_authors_avg": [ 696.3333333333334, 270.393951280144 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1761381814802980360&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Sample Efficient Adaptive Text-to-Speech", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/786", "id": "rkzjUoAcFX", "author_site": "Yutian Chen, Yannis M Assael, Brendan Shillingford, David Budden, Scott Reed, Heiga Zen, Quan Wang, Luis C. Cobo, Andrew Trask, Ben Laurie, Caglar Gulcehre, Aaron van den Oord, Oriol Vinyals, Nando de Freitas", "tldr": "Sample efficient algorithms to adapt a text-to-speech model to a new voice style with the state-of-the-art performance.", "abstract": "We present a meta-learning approach for adaptive text-to-speech (TTS) with few data. During training, we learn a multi-speaker model using a shared conditional WaveNet core and independent learned embeddings for each speaker. The aim of training is not to produce a neural network with fixed weights, which is then deployed as a TTS system. Instead, the aim is to produce a network that requires few data at deployment time to rapidly adapt to new speakers. 
We introduce and benchmark three strategies:\n(i) learning the speaker embedding while keeping the WaveNet core fixed,\n(ii) fine-tuning the entire architecture with stochastic gradient descent, and\n(iii) predicting the speaker embedding with a trained neural network encoder.\nThe experiments show that these approaches are successful at adapting the multi-speaker neural network to new speakers, obtaining state-of-the-art results in both sample naturalness and voice similarity with merely a few minutes of audio data from new speakers.", "keywords": "few shot;meta learning;text to speech;wavenet", "primary_area": "", "supplementary_material": "", "author": "Yutian Chen;Yannis Assael;Brendan Shillingford;David Budden;Scott Reed;Heiga Zen;Quan Wang;Luis C. Cobo;Andrew Trask;Ben Laurie;Caglar Gulcehre;A\u00e4ron van den Oord;Oriol Vinyals;Nando de Freitas", "authorids": "yutianc@google.com;yannisassael@google.com;shillingford@google.com;budden@google.com;reedscot@google.com;heigazen@google.com;quanw@google.com;luisca@google.com;atrask@google.com;benl@google.com;caglarg@google.com;avdnoord@google.com;vinyals@google.com;nandodefreitas@google.com", "gender": ";;;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;;", "aff": ";;;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;;", "position": ";;;;;;;;;;;;;", "bibtex": "@inproceedings{\nchen2018sample,\ntitle={Sample Efficient Adaptive Text-to-Speech},\nauthor={Yutian Chen and Yannis Assael and Brendan Shillingford and David Budden and Scott Reed and Heiga Zen and Quan Wang and Luis C. Cobo and Andrew Trask and Ben Laurie and Caglar Gulcehre and A\u00e4ron van den Oord and Oriol Vinyals and Nando de Freitas},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rkzjUoAcFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;4;4", "wc_review": "360;964;182", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "537;244;197", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 502.0, 334.6679946852801 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 326.0, 150.42827748354586 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 14, 0 ], "corr_rating_confidence": -0.9999999999999998, "gs_citation": 177, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14180263255450614943&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 9, "openreview": "https://openreview.net/forum?id=rkzjUoAcFX", "pdf": "https://openreview.net/pdf?id=rkzjUoAcFX", "email": ";;;;;;;;;;;;;", "author_num": 14 }, { "title": "Practical lossless compression with latent variables using bits back coding", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/689", "id": "ryE98iR5tm", "author_site": "James Townsend, Thomas Bird, David Barber", "tldr": "We do lossless compression of large image datasets using a VAE, beat existing compression algorithms.", "abstract": "Deep latent variable models have seen recent success in many data domains. 
Lossless compression is an application of these models which, despite having the potential to be highly useful, has yet to be implemented in a practical manner. We present '`Bits Back with ANS' (BB-ANS), a scheme to perform lossless compression with latent variable models at a near optimal rate. We demonstrate this scheme by using it to compress the MNIST dataset with a variational auto-encoder model (VAE), achieving compression rates superior to standard methods with only a simple VAE. Given that the scheme is highly amenable to parallelization, we conclude that with a sufficiently high quality generative model this scheme could be used to achieve substantial improvements in compression rate with acceptable running time. We make our implementation available open source at https://github.com/bits-back/bits-back .", "keywords": "compression;variational auto-encoders;deep latent gaussian models;lossless compression;latent variables;approximate inference;variational inference", "primary_area": "", "supplementary_material": "", "author": "James Townsend;Thomas Bird;David Barber", "authorids": "james.townsend@cs.ucl.ac.uk;thomas.bird@cs.ucl.ac.uk;david.barber@ucl.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\ntownsend2018practical,\ntitle={Practical lossless compression with latent variables using bits back coding},\nauthor={James Townsend and Thomas Bird and David Barber},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryE98iR5tm},\n}", "github": "[![github](/images/github_icon.svg) bits-back/bits-back](https://github.com/bits-back/bits-back) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=ryE98iR5tm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "6;6;8", "confidence": "3;4;5", "wc_review": "222;560;717", "wc_reply_reviewers": "0;8;15", "wc_reply_authors": "713;1684;1411", "reply_reviewers": "0;1;1", "reply_authors": "1;3;2", "rating_avg": [ 6.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 499.6666666666667, 206.53705613171587 ], "wc_reply_reviewers_avg": [ 7.666666666666667, 6.128258770283412 ], "wc_reply_authors_avg": [ 1269.3333333333333, 408.8702592374369 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 174, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1443052248345328520&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ryE98iR5tm", "pdf": "https://openreview.net/pdf?id=ryE98iR5tm", "email": ";;", "author_num": 3 }, { "id": "ryEkcsActX", "title": "Teacher Guided Architecture Search", "track": "main", "status": "Reject", "tldr": "Faster architecture search by maximizing representational similarity with a teacher network", "abstract": "Strong improvements in neural network performance in vision tasks have resulted from the search of alternative network architectures, and prior work has shown that this search process can be automated and guided by evaluating candidate network performance following limited training (\u201cPerformance 
Guided Architecture Search\u201d or PGAS). However, because of the large architecture search spaces and the high computational cost associated with evaluating each candidate model, further gains in computational efficiency are needed. Here we present a method termed Teacher Guided Search for Architectures by Generation and Evaluation (TG-SAGE) that produces up to an order of magnitude in search efficiency over PGAS methods. Specifically, TG-SAGE guides each step of the architecture search by evaluating the similarity of internal representations of the candidate networks with those of the (fixed) teacher network. We show that this procedure leads to significant reduction in required per-sample training and that, this advantage holds for two different search spaces of architectures, and two different search algorithms. We further show that in the space of convolutional cells for visual categorization, TG-SAGE finds a cell structure with similar performance as was previously found using other methods but at a total computational cost that is two orders of magnitude lower than Neural Architecture Search (NAS) and more than four times lower than progressive neural architecture search (PNAS). These results suggest that TG-SAGE can be used to accelerate network architecture search in cases where one has access to some or all of the internal representations of a teacher network of interest, such as the brain. ", "keywords": "hyperparameter search;architecture search;convolutional neural networks", "primary_area": "", "supplementary_material": "", "author": "Pouya Bashivan;Mark Tensen;James J DiCarlo", "authorids": "bashivan@mit.edu;mark.tensen@student.uva.nl;dicarlo@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nbashivan2019teacher,\ntitle={Teacher Guided Architecture Search},\nauthor={Pouya Bashivan and Mark Tensen and James J DiCarlo},\nyear={2019},\nurl={https://openreview.net/forum?id=ryEkcsActX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryEkcsActX", "pdf_size": 0, "rating": "5;6;6", "confidence": "4;4;4", "wc_review": "522;312;164", "wc_reply_reviewers": "133;234;0", "wc_reply_authors": "1365;1279;264", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 332.6666666666667, 146.88166060547593 ], "wc_reply_reviewers_avg": [ 122.33333333333333, 95.82739112012227 ], "wc_reply_authors_avg": [ 969.3333333333334, 499.98022183104626 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6116064380006255157&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 11 }, { "id": "ryEquiR9KX", "title": "Applications of Gaussian Processes in Finance", "track": "main", "status": "Withdraw", "tldr": "Covariance matrix estimation of financial assets with Gaussian Process Latent Variable Models", "abstract": "Estimating covariances between financial assets plays an important role in risk management. 
In practice, when the sample size is small compared to the number of variables, the empirical estimate is known to be very unstable. Here, we propose a novel covariance estimator based on the Gaussian Process Latent Variable Model (GP-LVM). Our estimator can be considered as a non-linear extension of standard factor models with readily interpretable parameters reminiscent of market betas. Furthermore, our Bayesian treatment naturally shrinks the sample covariance matrix towards a more structured matrix given by the prior and thereby systematically reduces estimation errors. Finally, we discuss some financial applications of the GP-LVM model.", "keywords": "Gaussian Processes;Latent Variable Model;Variational Bayes;Stan;Asset Pricing;Portfolio Allocation;Finance;CAPM", "primary_area": "", "supplementary_material": "", "author": "Rajbir S. Nirwan;Nils Bertschinger", "authorids": "nirwan@fias.uni-frankfurt.de;bertschinger@fias.uni-frankfurt.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryEquiR9KX", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;4", "wc_review": "655;579;385", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 539.6666666666666, 113.68181717212106 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1656791844602528920&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ryG2Cs09Y7", "title": "Feature prioritization and regularization improve standard accuracy and adversarial robustness", "track": "main", "status": "Reject", "tldr": "We propose a model that employs feature prioritization and regularization to improve the adversarial robustness and the standard accuracy.", "abstract": "Adversarial training has been successfully applied to build robust models at a certain cost. While the robustness of a model increases, the standard classification accuracy declines. This phenomenon is suggested to be an inherent trade-off. We propose a model that employs feature prioritization by a nonlinear attention module and $L_2$ feature regularization to improve the adversarial robustness and the standard accuracy relative to adversarial training. The attention module encourages the model to rely heavily on robust features by assigning larger weights to them while suppressing non-robust features. The regularizer encourages the model to extracts similar features for the natural and adversarial images, effectively ignoring the added perturbation. In addition to evaluating the robustness of our model, we provide justification for the attention module and propose a novel experimental strategy that quantitatively demonstrates that our model is almost ideally aligned with salient data characteristics. 
Additional experimental results illustrate the power of our model relative to the state of the art methods.", "keywords": "adversarial robustness;feature prioritization;regularization", "primary_area": "", "supplementary_material": "", "author": "Chihuang Liu;Joseph JaJa", "authorids": "chliu@umd.edu;joseph@umiacs.umd.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nliu2019feature,\ntitle={Feature prioritization and regularization improve standard accuracy and adversarial robustness},\nauthor={Chihuang Liu and Joseph JaJa},\nyear={2019},\nurl={https://openreview.net/forum?id=ryG2Cs09Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryG2Cs09Y7", "pdf_size": 0, "rating": "4;5;5", "confidence": "3;2;5", "wc_review": "335;97;277", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "405;8;804", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 236.33333333333334, 101.32894727349908 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 405.6666666666667, 324.96598112554625 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.18898223650461363, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8288729157938327670&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "ryG8UsR5t7", "title": "MERCI: A NEW METRIC TO EVALUATE THE CORRELATION BETWEEN PREDICTIVE UNCERTAINTY AND TRUE ERROR", "track": "main", "status": "Reject", "tldr": "We review existing metrics and propose a new one to evaluate predictive uncertainty in deep learning", "abstract": "As deep learning applications are becoming more and more pervasive, the question of evaluating the reliability of a prediction becomes a central question in the machine learning community. This domain, known as predictive uncertainty, has come under the scrutiny of research groups developing Bayesian approaches to deep learning such as Monte Carlo Dropout. Unfortunately, for the time being, the real goal of predictive uncertainty has been swept under the rug. Indeed, Bayesian approaches are solely evaluated in terms of raw performance of the prediction, while the quality of the estimated uncertainty is not assessed. One contribution of this article is to draw attention on existing metrics developed in the forecast community, designed to evaluate both the sharpness and the calibration of predictive uncertainty. Sharpness refers to the concentration of the predictive distributions and calibration to the consistency between the predicted uncertainty level and the actual errors. We further analyze the behavior of these metrics on regression problems when deep convolutional networks are involved and for several current predictive uncertainty approaches. A second contribution of this article is to propose an alternative metric that is more adapted to the evaluation of relative uncertainty assessment and directly applicable to regression with deep learning. This metric is evaluated and compared with existing ones on a toy dataset as well as on the problem of monocular depth estimation. 
", "keywords": "evaluation metric;predictive uncertainty;deep learning", "primary_area": "", "supplementary_material": "", "author": "michel moukari;lo\u00efc simon;sylvaine picard;fr\u00e9d\u00e9ric jurie", "authorids": "michel.moukari@unicaen.fr;loic.simon@ensicaen.fr;sylvaine.picard@safrangroup.com;frederic.jurie@unicaen.fr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nmoukari2019merci,\ntitle={{MERCI}: A {NEW} {METRIC} {TO} {EVALUATE} {THE} {CORRELATION} {BETWEEN} {PREDICTIVE} {UNCERTAINTY} {AND} {TRUE} {ERROR}},\nauthor={michel moukari and lo\u00efc simon and sylvaine picard and fr\u00e9d\u00e9ric jurie},\nyear={2019},\nurl={https://openreview.net/forum?id=ryG8UsR5t7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryG8UsR5t7", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;3", "wc_review": "299;398;343", "wc_reply_reviewers": "0;0;28", "wc_reply_authors": "184;254;195", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 346.6666666666667, 40.49965706301996 ], "wc_reply_reviewers_avg": [ 9.333333333333334, 13.199326582148887 ], "wc_reply_authors_avg": [ 211.0, 30.735430152621365 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2875337726317953354&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "ryGDEjCcK7", "title": "CONTROLLING COVARIATE SHIFT USING EQUILIBRIUM NORMALIZATION OF WEIGHTS", "track": "main", "status": "Reject", "tldr": "An alternative normalization technique to batch normalization", "abstract": "We introduce a new normalization technique that exhibits the fast convergence properties of batch normalization using a transformation of layer weights instead of layer outputs. The proposed technique keeps the contribution of positive and negative weights to the layer output in equilibrium. 
We validate our method on a set of standard benchmarks including CIFAR-10/100, SVHN and ILSVRC 2012 ImageNet.", "keywords": "normalization;optimization", "primary_area": "", "supplementary_material": "", "author": "Aaron Defazio", "authorids": "aaron.defazio@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\ndefazio2019controlling,\ntitle={{CONTROLLING} {COVARIATE} {SHIFT} {USING} {EQUILIBRIUM} {NORMALIZATION} {OF} {WEIGHTS}},\nauthor={Aaron Defazio},\nyear={2019},\nurl={https://openreview.net/forum?id=ryGDEjCcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryGDEjCcK7", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;1;4", "wc_review": "444;191;497", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "251;0;116", "reply_reviewers": "0;0;0", "reply_authors": "1;0;1", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "wc_review_avg": [ 377.3333333333333, 133.5223659999411 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 122.33333333333333, 102.56813453613273 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:M2bzrcwzZUkJ:scholar.google.com/&scioq=CONTROLLING+COVARIATE+SHIFT+USING+EQUILIBRIUM+NORMALIZATION+OF+WEIGHTS&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Kernel RNN Learning (KeRNL)", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1061", "id": "ryGfnoC5KQ", "author_site": "Christopher Roth, Ingmar Kanitscheider, Ila Fiete", "tldr": "A biologically plausible learning rule for training recurrent neural networks", "abstract": "We describe Kernel RNN Learning (KeRNL), a reduced-rank, temporal eligibility trace-based approximation to backpropagation through time (BPTT) for training recurrent neural networks (RNNs) that gives competitive performance to BPTT on long time-dependence tasks. The approximation replaces a rank-4 gradient learning tensor, which describes how past hidden unit activations affect the current state, by a simple reduced-rank product of a sensitivity weight and a temporal eligibility trace. In this structured approximation motivated by node perturbation, the sensitivity weights and eligibility kernel time scales are themselves learned by applying perturbations. The rule represents another step toward biologically plausible or neurally inspired ML, with lower complexity in terms of relaxed architectural requirements (no symmetric return weights), a smaller memory demand (no unfolding and storage of states over time), and a shorter feedback time. 
", "keywords": "RNNs;Biologically plausible learning rules;Algorithm;Neural Networks;Supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Christopher Roth;Ingmar Kanitscheider;Ila Fiete", "authorids": "christopher_roth@utexas.edu;ingmar@openai.com;fiete@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nroth2018kernel,\ntitle={Kernel {RNN} Learning (Ke{RNL})},\nauthor={Christopher Roth and Ingmar Kanitscheider and Ila Fiete},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryGfnoC5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "1;4;4", "wc_review": "122;320;590", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "450;103;709", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "wc_review_avg": [ 344.0, 191.81240835774938 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 420.6666666666667, 248.26643394188878 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844387, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7980550833653333135&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=ryGfnoC5KQ", "pdf": "https://openreview.net/pdf?id=ryGfnoC5KQ", "email": ";;", "author_num": 3 }, { "title": "Deep, Skinny Neural Networks are not Universal Approximators", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/905", "id": "ryGgSsAcFQ", "tldr": "This paper proves that skinny neural networks cannot approximate certain functions, no matter how deep they are.", "abstract": "In order to choose a neural network architecture that will be effective for a particular modeling problem, one must understand the limitations imposed by each of the potential options. These limitations are typically described in terms of information theoretic bounds, or by comparing the relative complexity needed to approximate example functions between different architectures. In this paper, we examine the topological constraints that the architecture of a neural network imposes on the level sets of all the functions that it is able to approximate. 
This approach is novel for both the nature of the limitations and the fact that they are independent of network depth for a broad family of activation functions.", "keywords": "neural network;universality;expressability", "primary_area": "", "supplementary_material": "", "author": "Jesse Johnson", "authorids": "jejo.math@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@inproceedings{\njohnson2018deep,\ntitle={Deep, Skinny Neural Networks are not Universal Approximators},\nauthor={Jesse Johnson},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryGgSsAcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;8", "confidence": "4;4;4", "wc_review": "263;250;431", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "49;0;0", "reply_reviewers": "0;0;0", "reply_authors": "1;0;0", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 314.6666666666667, 82.43111602355562 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 16.333333333333332, 23.098821518760555 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12423471448848584321&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ryGgSsAcFQ", "pdf": "https://openreview.net/pdf?id=ryGgSsAcFQ", "email": "", "author_num": 1 }, { "id": "ryGiYoAqt7", "title": "Learning agents with prioritization and parameter noise in continuous state and action space", "track": "main", "status": "Reject", "tldr": "Improving the performance of an RL agent in the continuous action and state space domain by using prioritised experience replay and parameter noise.", "abstract": "Reinforcement Learning (RL) problem can be solved in two different ways - the Value function-based approach and the policy optimization-based approach - to eventually arrive at an optimal policy for the given environment. One of the recent breakthroughs in reinforcement learning is the use of deep neural networks as function approximators to approximate the value function or q-function in a reinforcement learning scheme. This has led to results with agents automatically learning how to play games like alpha-go showing better-than-human performance. Deep Q-learning networks (DQN) and Deep Deterministic Policy Gradient (DDPG) are two such methods that have shown state-of-the-art results in recent times. Among the many variants of RL, an important class of problems is where the state and action spaces are continuous --- autonomous robots, autonomous vehicles, optimal control are all examples of such problems that can lend themselves naturally to reinforcement based algorithms, and have continuous state and action spaces. In this paper, we adapt and combine approaches such as DQN and DDPG in novel ways to outperform the earlier results for continuous state and action space problems. 
We believe these results are a valuable addition to the fast-growing body of results on Reinforcement Learning, more so for continuous state and action space problems.", "keywords": "reinforcement learning;continuous action space;prioritization;parameter;noise;policy gradients", "primary_area": "", "supplementary_material": "", "author": "Rajesh Devaraddi;G. Srinivasaraghavan", "authorids": "rajesh.dm@iiitb.ac.in;gsr@iiitb.ac.in", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\ndevaraddi2019learning,\ntitle={Learning agents with prioritization and parameter noise in continuous state and action space},\nauthor={Rajesh Devaraddi and G. Srinivasaraghavan},\nyear={2019},\nurl={https://openreview.net/forum?id=ryGiYoAqt7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryGiYoAqt7", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;3", "wc_review": "158;75;264", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 165.66666666666666, 77.34913631637323 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.49999999999999983, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3160669453335765938&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "title": "Large Scale Graph Learning From Smooth Signals", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/661", "id": "ryGkSo0qYm", "author_site": "Vassilis Kalofolias, Nathana\u00ebl Perraudin", "tldr": "", "abstract": "Graphs are a prevalent tool in data science, as they model the inherent structure of the data. Typically they are constructed either by connecting nearest samples, or by learning them from data, solving an optimization problem. While graph learning does achieve a better quality, it also comes with a higher computational cost. In particular, the current state-of-the-art model cost is O(n^2) for n samples.\nIn this paper, we show how to scale it, obtaining an approximation with leading cost of O(n log(n)), with quality that approaches the exact graph learning model. 
Our algorithm uses known approximate nearest neighbor techniques to reduce the number of variables, and automatically selects the correct parameters of the model, requiring a single intuitive input: the desired edge density.", "keywords": "Graph learning;Graph signal processing;Network inference", "primary_area": "", "supplementary_material": "", "author": "Vassilis Kalofolias;Nathana\u00ebl Perraudin", "authorids": "v.kalofolias@gmail.com;nathanael.perraudin@sdsc.ethz.ch", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nkalofolias2018large,\ntitle={Large Scale Graph Learning From Smooth Signals},\nauthor={Vassilis Kalofolias and Nathana\u00ebl Perraudin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryGkSo0qYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "5;7;7", "confidence": "3;4;5", "wc_review": "189;62;474", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1165;186;1492", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 241.66666666666666, 172.27174915103043 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 947.6666666666666, 554.8779645611777 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 107, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=300846958242643752&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=ryGkSo0qYm", "pdf": "https://openreview.net/pdf?id=ryGkSo0qYm", "email": ";", "author_num": 2 }, { "id": "ryGpEiAcFQ", "title": "A Synaptic Neural Network and Synapse Learning", "track": "main", "status": "Reject", "tldr": "A synaptic neural network with synapse graph and learning that has the feature of topological conjugation and Bose-Einstein distribution in surprisal space. ", "abstract": "A Synaptic Neural Network (SynaNN) consists of synapses and neurons. Inspired by the synapse research of neuroscience, we built a synapse model with a nonlinear synapse function of excitatory and inhibitory channel probabilities. Introduced the concept of surprisal space and constructed a commutative diagram, we proved that the inhibitory probability function -log(1-exp(-x)) in surprisal space is the topologically conjugate function of the inhibitory complementary probability 1-x in probability space. Furthermore, we found that the derivative of the synapse over the parameter in the surprisal space is equal to the negative Bose-Einstein distribution. In addition, we constructed a fully connected synapse graph (tensor) as a synapse block of a synaptic neural network. Moreover, we proved the gradient formula of a cross-entropy loss function over parameters, so synapse learning can work with the gradient descent and backpropagation algorithms. 
In the proof-of-concept experiment, we performed an MNIST training and testing on the MLP model with synapse network as hidden layers.", "keywords": "synaptic neural network;surprisal;synapse;probability;excitation;inhibition;synapse learning;bose-einstein distribution;tensor;gradient;loss function;mnist;topologically conjugate", "primary_area": "", "supplementary_material": "", "author": "Chang Li", "authorids": "changli@neatware.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nli2019a,\ntitle={A Synaptic Neural Network and Synapse Learning},\nauthor={Chang Li},\nyear={2019},\nurl={https://openreview.net/forum?id=ryGpEiAcFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=ryGpEiAcFQ", "pdf_size": 0, "rating": "2;2;2;3", "confidence": "3;3;4;3", "wc_review": "118;158;581;273", "wc_reply_reviewers": "0;0;0;0", "wc_reply_authors": "33;978;1488;726", "reply_reviewers": "0;0;0;0", "reply_authors": "1;2;4;2", "rating_avg": [ 2.25, 0.4330127018922193 ], "confidence_avg": [ 3.25, 0.4330127018922193 ], "wc_review_avg": [ 282.5, 181.48898038173007 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 806.25, 524.0793713742223 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.25, 1.0897247358851685 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.3333333333333333, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:a24bnwOC2nEJ:scholar.google.com/&scioq=A+Synaptic+Neural+Network+and+Synapse+Learning&hl=en&as_sdt=0,33", "gs_version_total": 2 }, { "title": "How Powerful are Graph Neural Networks?", "status": "Oral", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/791", "id": "ryGs6iA5Km", "author_site": "Keyulu Xu, Weihua Hu, Jure Leskovec, Stefanie Jegelka", "tldr": "We develop theoretical foundations for the expressive power of GNNs and design a provably most powerful GNN.", "abstract": "Graph Neural Networks (GNNs) are an effective framework for representation learning of graphs. GNNs follow a neighborhood aggregation scheme, where the representation vector of a node is computed by recursively aggregating and transforming representation vectors of its neighboring nodes. Many GNN variants have been proposed and have achieved state-of-the-art results on both node and graph classification tasks. However, despite GNNs revolutionizing graph representation learning, there is limited understanding of their representational properties and limitations. Here, we present a theoretical framework for analyzing the expressive power of GNNs to capture different graph structures. Our results characterize the discriminative power of popular GNN variants, such as Graph Convolutional Networks and GraphSAGE, and show that they cannot learn to distinguish certain simple graph structures. We then develop a simple architecture that is provably the most expressive among the class of GNNs and is as powerful as the Weisfeiler-Lehman graph isomorphism test. 
We empirically validate our theoretical findings on a number of graph classification benchmarks, and demonstrate that our model achieves state-of-the-art performance.", "keywords": "graph neural networks;theory;deep learning;representational power;graph isomorphism;deep multisets", "primary_area": "", "supplementary_material": "", "author": "Keyulu Xu*;Weihua Hu*;Jure Leskovec;Stefanie Jegelka", "authorids": "keyulu@mit.edu;weihuahu@stanford.edu;jure@cs.stanford.edu;stefje@mit.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nxu2018how,\ntitle={How Powerful are Graph Neural Networks?},\nauthor={Keyulu Xu and Weihua Hu and Jure Leskovec and Stefanie Jegelka},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryGs6iA5Km},\n}", "github": "[![github](/images/github_icon.svg) weihua916/powerful-gnns](https://github.com/weihua916/powerful-gnns) + [![Papers with Code](/images/pwc_icon.svg) 17 community implementations](https://paperswithcode.com/paper/?openreview=ryGs6iA5Km)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;5;5", "wc_review": "487;1094;732", "wc_reply_reviewers": "323;193;265", "wc_reply_authors": "688;1336;646", "reply_reviewers": "2;2;2", "reply_authors": "3;4;3", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 771.0, 249.33645274340986 ], "wc_reply_reviewers_avg": [ 260.3333333333333, 53.17476427362471 ], "wc_reply_authors_avg": [ 890.0, 315.8354001691387 ], "reply_reviewers_avg": [ 2.0, 0.0 ], "reply_authors_avg": [ 3.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 61, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 10619, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9955904491400591671&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 23, "openreview": "https://openreview.net/forum?id=ryGs6iA5Km", "pdf": "https://openreview.net/pdf?id=ryGs6iA5Km", "email": ";;;", "author_num": 4 }, { "title": "Overcoming Catastrophic Forgetting for Continual Learning via Model Adaptation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/914", "id": "ryGvcoA5YX", "author_site": "Wenpeng Hu, Zhou Lin, Bing Liu, Chongyang Tao, Jay Tao, Jinwen Ma, Dongyan Zhao, Rui Yan", "tldr": "", "abstract": "Learning multiple tasks sequentially is important for the development of AI and lifelong learning systems. However, standard neural network architectures suffer from catastrophic forgetting which makes it difficult for them to learn a sequence of tasks. Several continual learning methods have been proposed to address the problem. In this paper, we propose a very different approach, called Parameter Generation and Model Adaptation (PGMA), to dealing with the problem. The proposed approach learns to build a model, called the solver, with two sets of parameters. The first set is shared by all tasks learned so far and the second set is dynamically generated to adapt the solver to suit each test example in order to classify it. 
Extensive experiments have been carried out to demonstrate the effectiveness of the proposed approach.", "keywords": "overcoming forgetting;model adaptation;continual learning", "primary_area": "", "supplementary_material": "", "author": "Wenpeng Hu;Zhou Lin;Bing Liu;Chongyang Tao;Zhengwei Tao;Jinwen Ma;Dongyan Zhao;Rui Yan", "authorids": "wenpeng.hu@pku.edu.cn;scene@pku.edu.cn;liub@uic.edu;chongyangtao@pku.edu.cn;tttzw@pku.edu.cn;jwma@math.pku.edu.cn;zhaody@pku.edu.cn;ruiyan@pku.edu.cn", "gender": ";;;;;;;", "homepage": ";;;;;;;", "dblp": ";;;;;;;", "google_scholar": ";;;;;;;", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": ";;;;;;;", "aff": ";;;;;;;", "aff_domain": ";;;;;;;", "position": ";;;;;;;", "bibtex": "@inproceedings{\nhu2018overcoming,\ntitle={Overcoming Catastrophic Forgetting via Model Adaptation},\nauthor={Wenpeng Hu and Zhou Lin and Bing Liu and Chongyang Tao and Zhengwei Tao and Jinwen Ma and Dongyan Zhao and Rui Yan},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryGvcoA5YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;2;4", "wc_review": "688;246;393", "wc_reply_reviewers": "286;0;12", "wc_reply_authors": "1145;552;861", "reply_reviewers": "2;0;1", "reply_authors": "3;2;3", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 442.3333333333333, 183.7867121300002 ], "wc_reply_reviewers_avg": [ 99.33333333333333, 132.08414826247025 ], "wc_reply_authors_avg": [ 852.6666666666666, 242.16293871872483 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 17, 0 ], "authors#_avg": [ 8, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 211, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1382204332167785281&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ryGvcoA5YX", "pdf": "https://openreview.net/pdf?id=ryGvcoA5YX", "email": ";;;;;;;", "author_num": 8 }, { "id": "ryM07h0cYX", "title": "Reinforced Pipeline Optimization: Behaving Optimally with Non-Differentiabilities", "track": "main", "status": "Reject", "tldr": "By converting an originally non-differentiable pipeline into a stochastic counterpart, we can then train the converted pipeline completely end-to-end while optimizing any criterion attached to it.", "abstract": "Many machine learning systems are implemented as pipelines. A pipeline is essentially a chain/network of information processing units. As information flows in and out and gradients vice versa, ideally, a pipeline can be trained end-to-end via backpropagation provided with the right supervision and loss function. However, this is usually impossible in practice, because either the loss function itself may be non-differentiable, or there may exist some non-differentiable units. One popular way to superficially resolve this issue is to separate a pipeline into a set of differentiable sub-pipelines and train them with isolated loss functions. Yet, from a decision-theoretical point of view, this is equivalent to making myopic decisions using ad hoc heuristics along the pipeline while ignoring the real utility, which prevents the pipeline from behaving optimally. 
In this paper, we show that by converting a pipeline into a stochastic counterpart, it can then be trained end-to-end in the presence of non-differentiable parts. Thus, the resulting pipeline is optimal under certain conditions with respect to any criterion attached to it. In experiments, we apply the proposed approach - reinforced pipeline optimization - to Faster R-CNN, a state-of-the-art object detection pipeline, and obtain empirically near-optimal object detectors consistent with its base design in terms of mean average precision.", "keywords": "Pipeline Optimization;Reinforcement Learning;Stochastic Computation Graph;Faster R-CNN", "primary_area": "", "supplementary_material": "", "author": "Aijun Bai;Dongdong Chen;Gang Hua;Lu Yuan", "authorids": "aijunbai@gmail.com;cd722522@mail.ustc.edu.cn;ganghua@gmail.com;luyuan@microsoft.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nbai2019reinforced,\ntitle={Reinforced Pipeline Optimization: Behaving Optimally with Non-Differentiabilities},\nauthor={Aijun Bai and Dongdong Chen and Gang Hua and Lu Yuan},\nyear={2019},\nurl={https://openreview.net/forum?id=ryM07h0cYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryM07h0cYX", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;5;2", "wc_review": "1041;266;695", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 667.3333333333334, 316.9966701542603 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.6546536707079772, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Pa4f_SrKaSwJ:scholar.google.com/&scioq=Reinforced+Pipeline+Optimization:+Behaving+Optimally+with+Non-Differentiabilities&hl=en&as_sdt=0,5", "gs_version_total": 2 }, { "id": "ryMQ5sRqYX", "title": "Finding Mixed Nash Equilibria of Generative Adversarial Networks", "track": "main", "status": "Reject", "tldr": "", "abstract": "We reconsider the training objective of Generative Adversarial Networks (GANs) from the mixed Nash Equilibria (NE) perspective. Inspired by the classical prox methods, we develop a novel algorithmic framework for GANs via an infinite-dimensional two-player game and prove rigorous convergence rates to the mixed NE. We then propose a principled procedure to reduce our novel prox methods to simple sampling routines, leading to practically efficient algorithms. 
Finally, we provide experimental evidence that our approach outperforms methods that seek pure strategy equilibria, such as SGD, Adam, and RMSProp, both in speed and quality.", "keywords": "GANs;mixed Nash equilibrium;mirror descent;sampling", "primary_area": "", "supplementary_material": "", "author": "Ya-Ping Hsieh;Chen Liu;Volkan Cevher", "authorids": "ya-ping.hsieh@epfl.ch;chen.liu@epfl.ch;volkan.cevher@epfl.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nhsieh2019finding,\ntitle={Finding Mixed Nash Equilibria of Generative Adversarial Networks},\nauthor={Ya-Ping Hsieh and Chen Liu and Volkan Cevher},\nyear={2019},\nurl={https://openreview.net/forum?id=ryMQ5sRqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryMQ5sRqYX", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;5;4", "wc_review": "120;618;365", "wc_reply_reviewers": "0;545;80", "wc_reply_authors": "335;1486;317", "reply_reviewers": "0;1;1", "reply_authors": "1;2;2", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 367.6666666666667, 203.31639273692505 ], "wc_reply_reviewers_avg": [ 208.33333333333334, 240.28917763579963 ], "wc_reply_authors_avg": [ 712.6666666666666, 546.8786174483532 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 105, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14920056197615352388&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 16 }, { "title": "Analysis of Quantized Models", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/969", "id": "ryM_IoAqYX", "author_site": "LU HOU, Ruiliang Zhang, James Kwok", "tldr": "In this paper, we studied efficient training of loss-aware weight-quantized networks with quantized gradient in a distributed environment, both theoretically and empirically.", "abstract": "Deep neural networks are usually huge, which significantly limits the deployment on low-end devices. In recent years, many\nweight-quantized models have been proposed. They have small storage and fast inference, but training can still be time-consuming. This can be improved with distributed learning. To reduce the high communication cost due to worker-server synchronization, recently gradient quantization has also been proposed to train deep networks with full-precision weights. \nIn this paper, we theoretically study how the combination of both weight and gradient quantization affects convergence.\nWe show that (i) weight-quantized models converge to an error related to the weight quantization resolution and weight dimension; (ii) quantizing gradients slows convergence by a factor related to the gradient quantization resolution and dimension; and (iii) clipping the gradient before quantization renders this factor dimension-free, thus allowing the use of fewer bits for gradient quantization. 
Empirical experiments confirm the theoretical convergence results, and demonstrate that quantized networks can speed up training and have comparable performance as full-precision networks.", "keywords": "weight quantization;gradient quantization;distributed learning", "primary_area": "", "supplementary_material": "", "author": "Lu Hou;Ruiliang Zhang;James T. Kwok", "authorids": "lhouab@cse.ust.hk;ruiliang.zhang@tusimple.ai;jamesk@cse.ust.hk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhou2018analysis,\ntitle={Analysis of Quantized Models},\nauthor={Lu Hou and Ruiliang Zhang and James T. Kwok},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryM_IoAqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "651;430;596", "wc_reply_reviewers": "0;130;0", "wc_reply_authors": "1472;509;586", "reply_reviewers": "0;1;0", "reply_authors": "3;3;2", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 559.0, 93.93969696920821 ], "wc_reply_reviewers_avg": [ 43.333333333333336, 61.282587702834114 ], "wc_reply_authors_avg": [ 855.6666666666666, 436.9457123666 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5969195543045574314&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=ryM_IoAqYX", "pdf": "https://openreview.net/pdf?id=ryM_IoAqYX", "email": ";;", "author_num": 3 }, { "title": "Deep learning generalizes because the parameter-function map is biased towards simple functions", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/989", "id": "rye4g3AqFm", "author_site": "Guillermo Valle-Perez, Chico Q. Camargo, Ard Louis", "tldr": "The parameter-function map of deep networks is hugely biased; this can explain why they generalize. We use PAC-Bayes and Gaussian processes to obtain nonvacuous bounds.", "abstract": "Deep neural networks (DNNs) generalize remarkably well without explicit regularization even in the strongly over-parametrized regime where classical learning theory would instead predict that they would severely overfit. While many proposals for some kind of implicit regularization have been made to rationalise this success, there is no consensus for the fundamental reason why DNNs do not strongly overfit. In this paper, we provide a new explanation. By applying a very general probability-complexity bound recently derived from algorithmic information theory (AIT), we argue that the parameter-function map of many DNNs should be exponentially biased towards simple functions. 
We then provide clear evidence for this strong simplicity bias in a model DNN for Boolean functions, as well as in much larger fully connected and convolutional networks trained on CIFAR10 and MNIST.\nAs the target functions in many real problems are expected to be highly structured, this intrinsic simplicity bias helps explain why deep networks generalize well on real world problems.\nThis picture also facilitates a novel PAC-Bayes approach where the prior is taken over the DNN input-output function space, rather than the more conventional prior over parameter space. If we assume that the training algorithm samples parameters close to uniformly within the zero-error region then the PAC-Bayes theorem can be used to guarantee good expected generalization for target functions producing high-likelihood training sets. By exploiting recently discovered connections between DNNs and Gaussian processes to estimate the marginal likelihood, we produce relatively tight generalization PAC-Bayes error bounds which correlate well with the true error on realistic datasets such as MNIST and CIFAR10 and for architectures including convolutional and fully connected networks.", "keywords": "generalization;deep learning theory;PAC-Bayes;Gaussian processes;parameter-function map;simplicity bias", "primary_area": "", "supplementary_material": "", "author": "Guillermo Valle-Perez;Chico Q. Camargo;Ard A. Louis", "authorids": "guillermo.valle@dtc.ox.ac.uk;chico.camargo@gmail.com;ard.louis@physics.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nvalle-perez2018deep,\ntitle={Deep learning generalizes because the parameter-function map is biased towards simple functions},\nauthor={Guillermo Valle-Perez and Chico Q. Camargo and Ard A. 
Louis},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rye4g3AqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;3;4", "wc_review": "492;318;318", "wc_reply_reviewers": "19;0;0", "wc_reply_authors": "1090;1451;219", "reply_reviewers": "1;0;0", "reply_authors": "2;2;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 376.0, 82.02438661763951 ], "wc_reply_reviewers_avg": [ 6.333333333333333, 8.956685895029603 ], "wc_reply_authors_avg": [ 920.0, 517.1273215240775 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.18898223650461363, "gs_citation": 253, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15588770246928974204&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=rye4g3AqFm", "pdf": "https://openreview.net/pdf?id=rye4g3AqFm", "email": ";;", "author_num": 3 }, { "id": "rye7XnRqFm", "title": "Q-map: a Convolutional Approach for Goal-Oriented Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "Q-map is a reinforcement learning agent that uses a convolutional autoencoder-like architecture to efficiently learn to navigate its environment.", "abstract": "Goal-oriented learning has become a core concept in reinforcement learning (RL), extending the reward signal as a sole way to define tasks. However, as parameterizing value functions with goals increases the learning complexity, efficiently reusing past experience to update estimates towards several goals at once becomes desirable but usually requires independent updates per goal.\nConsidering that a significant number of RL environments can support spatial coordinates as goals, such as on-screen location of the character in ATARI or SNES games, we propose a novel goal-oriented agent called Q-map that utilizes an autoencoder-like neural network to predict the minimum number of steps towards each coordinate in a single forward pass. This architecture is similar to Horde with parameter sharing and allows the agent to discover correlations between visual patterns and navigation. For example learning how to use a ladder in a game could be transferred to other ladders later.\nWe show how this network can be efficiently trained with a 3D variant of Q-learning to update the estimates towards all goals at once. 
While the Q-map agent could be used for a wide range of applications, we propose a novel exploration mechanism in place of epsilon-greedy that relies on goal selection at a desired distance followed by several steps taken towards it, allowing long and coherent exploratory steps in the environment.\nWe demonstrate the accuracy and generalization qualities of the Q-map agent on a grid-world environment and then demonstrate the efficiency of the proposed exploration mechanism on the notoriously difficult Montezuma's Revenge and Super Mario All-Stars games.", "keywords": "reinforcement learning;goal-oriented;convolutions;off-policy", "primary_area": "", "supplementary_material": "", "author": "Fabio Pardo;Vitaly Levdik;Petar Kormushev", "authorids": "f.pardo@imperial.ac.uk;v.levdik@imperial.ac.uk;p.kormushev@imperial.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\npardo2019qmap,\ntitle={Q-map: a Convolutional Approach for Goal-Oriented Reinforcement Learning},\nauthor={Fabio Pardo and Vitaly Levdik and Petar Kormushev},\nyear={2019},\nurl={https://openreview.net/forum?id=rye7XnRqFm},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rye7XnRqFm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rye7XnRqFm", "pdf_size": 0, "rating": "4;4;5", "confidence": "3;5;4", "wc_review": "467;408;693", "wc_reply_reviewers": "154;205;25", "wc_reply_authors": "426;224;315", "reply_reviewers": "1;1;1", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 522.6666666666666, 122.82869733449463 ], "wc_reply_reviewers_avg": [ 128.0, 75.7495874576225 ], "wc_reply_authors_avg": [ 321.6666666666667, 82.60078019540062 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15375529606217400896&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Learning when to Communicate at Scale in Multiagent Cooperative and Competitive Tasks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/770", "id": "rye7knCqK7", "author_site": "Amanpreet Singh, Tushar Jain, Sainbayar Sukhbaatar", "tldr": "We introduce IC3Net, a single network which can be used to train agents in cooperative, competitive and mixed scenarios. We also show that agents can learn when to communicate using our model.", "abstract": "Learning when to communicate and doing that effectively is essential in multi-agent tasks. Recent works show that continuous communication allows efficient training with back-propagation in multi-agent scenarios, but have been restricted to fully-cooperative tasks. In this paper, we present Individualized Controlled Continuous Communication Model (IC3Net) which has better training efficiency than simple continuous communication model, and can be applied to semi-cooperative and competitive settings along with the cooperative settings. 
IC3Net controls continuous communication with a gating mechanism and uses individualized rewards foreach agent to gain better performance and scalability while fixing credit assignment issues. Using variety of tasks including StarCraft BroodWars explore and combat scenarios, we show that our network yields improved performance and convergence rates than the baselines as the scale increases. Our results convey that IC3Net agents learn when to communicate based on the scenario and profitability.", "keywords": "multiagent;communication;competitive;cooperative;continuous;emergent;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Amanpreet Singh;Tushar Jain;Sainbayar Sukhbaatar", "authorids": "amanpreet@nyu.edu;tushar@nyu.edu;sainbar@cs.nyu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nsingh2018individualized,\ntitle={Individualized Controlled Continuous Communication Model for Multiagent Cooperative and Competitive Tasks},\nauthor={Amanpreet Singh and Tushar Jain and Sainbayar Sukhbaatar},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rye7knCqK7},\n}", "github": "[![github](/images/github_icon.svg) IC3Net/IC3Net](https://github.com/IC3Net/IC3Net) + [![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rye7knCqK7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;3;3", "wc_review": "365;356;298", "wc_reply_reviewers": "0;0;143", "wc_reply_authors": "1071;990;627", "reply_reviewers": "0;0;1", "reply_authors": "3;3;2", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.0 ], "wc_review_avg": [ 339.6666666666667, 29.69100125552447 ], "wc_reply_reviewers_avg": [ 47.666666666666664, 67.41084647311753 ], "wc_reply_authors_avg": [ 896.0, 193.06475597581243 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.6666666666666665, 0.4714045207910317 ], "replies_avg": [ 16, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 391, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12298395236200633957&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "openreview": "https://openreview.net/forum?id=rye7knCqK7", "pdf": "https://openreview.net/pdf?id=rye7knCqK7", "email": ";;", "author_num": 3 }, { "id": "ryeAy3AqYm", "title": "Distilled Agent DQN for Provable Adversarial Robustness", "track": "main", "status": "Reject", "tldr": "We introduce a way of (provably) defending Deep-RL against adversarial perturbations, including a new poisoning attack.", "abstract": "As deep neural networks have become the state of the art for solving complex reinforcement learning tasks, susceptibility to perceptual adversarial examples have become a concern. The transferability of adversarial examples is known to enable attacks capable of tricking the agent into bad states. In this work we demonstrate a simple poisoning attack able to keep deep RL from learning, and into fooling it when trained with defense methods commonly used for classification tasks. 
We then propose an algorithm called DadQN, based on deep Q-networks, which enables the use of stronger defenses, including defenses enabling the first ever on-line robustness certification of a deep RL agent.", "keywords": "reinforcement learning;dqn;adversarial examples;robustness analysis;adversarial defense;robust learning;robust rl", "primary_area": "", "supplementary_material": "", "author": "Matthew Mirman;Marc Fischer;Martin Vechev", "authorids": "matthew.mirman@inf.ethz.ch;marcfisc@student.ethz.ch;martin.vechev@inf.ethz.ch", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nmirman2019distilled,\ntitle={Distilled Agent {DQN} for Provable Adversarial Robustness},\nauthor={Matthew Mirman and Marc Fischer and Martin Vechev},\nyear={2019},\nurl={https://openreview.net/forum?id=ryeAy3AqYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=ryeAy3AqYm", "pdf_size": 0, "rating": "3;4;5", "confidence": "2;2;4", "wc_review": "376;166;696", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "379;705;1165", "reply_reviewers": "0;0;0", "reply_authors": "1;1;2", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 2.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 412.6666666666667, 217.91945504908202 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 749.6666666666666, 322.4338002684513 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844387, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=522351347661628184&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ryeNPi0qKX", "title": "Language Modeling Teaches You More Syntax than Translation Does: Lessons Learned Through Auxiliary Task Analysis", "track": "main", "status": "Reject", "tldr": "We throughly compare several pretraining tasks on their ability to induce syntactic information and find that representations from language models consistently perform best, even when trained on relatively small amounts of data.", "abstract": "Recent work using auxiliary prediction task classifiers to investigate the properties of LSTM representations has begun to shed light on why pretrained representations, like ELMo (Peters et al., 2018) and CoVe (McCann et al., 2017), are so beneficial for neural language understanding models. We still, though, do not yet have a clear understanding of how the choice of pretraining objective affects the type of linguistic information that models learn. With this in mind, we compare four objectives - language modeling, translation, skip-thought, and autoencoding - on their ability to induce syntactic and part-of-speech information. We make a fair comparison between the tasks by holding constant the quantity and genre of the training data, as well as the LSTM architecture. We find that representations from language models consistently perform best on our syntactic auxiliary prediction tasks, even when trained on relatively small amounts of data. These results suggest that language modeling may be the best data-rich pretraining task for transfer learning applications requiring syntactic information. 
We also find that the representations from randomly-initialized, frozen LSTMs perform strikingly well on our syntactic auxiliary tasks, but this effect disappears when the amount of training data for the auxiliary tasks is reduced.", "keywords": "representation learning;recurrent neural networks;syntax;part-of-speech tagging", "primary_area": "", "supplementary_material": "", "author": "Kelly W. Zhang;Samuel R. Bowman", "authorids": "kellywzhang@seas.harvard.edu;bowman@nyu.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nzhang2019language,\ntitle={Language Modeling Teaches You More Syntax than Translation Does: Lessons Learned Through Auxiliary Task Analysis},\nauthor={Kelly W. Zhang and Samuel R. Bowman},\nyear={2019},\nurl={https://openreview.net/forum?id=ryeNPi0qKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryeNPi0qKX", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "wc_review": "364;358;66", "wc_reply_reviewers": "237;283;0", "wc_reply_authors": "525;494;54", "reply_reviewers": "1;1;0", "reply_authors": "2;2;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 262.6666666666667, 139.08590470960345 ], "wc_reply_reviewers_avg": [ 173.33333333333334, 123.99551963231934 ], "wc_reply_authors_avg": [ 357.6666666666667, 215.0973939611749 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 172, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10609019624648008720&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "Synthetic Datasets for Neural Program Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/832", "id": "ryeOSnAqYm", "author_site": "Richard Shin, Neel Kant, Kavi Gupta, Christopher Bender, Brandon Trabucco, Rishabh Singh, Dawn Song", "tldr": "", "abstract": "The goal of program synthesis is to automatically generate programs in a particular language from corresponding specifications, e.g. input-output behavior.\nMany current approaches achieve impressive results after training on randomly generated I/O examples in limited domain-specific languages (DSLs), as with string transformations in RobustFill.\nHowever, we empirically discover that applying test input generation techniques for languages with control flow and rich input space causes deep networks to generalize poorly to certain data distributions;\nto correct this, we propose a new methodology for controlling and evaluating the bias of synthetic data distributions over both programs and specifications.\nWe demonstrate, using the Karel DSL and a small Calculator DSL, that training deep networks on these distributions leads to improved cross-distribution generalization performance. 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Richard Shin;Neel Kant;Kavi Gupta;Chris Bender;Brandon Trabucco;Rishabh Singh;Dawn Song", "authorids": "ricshin@berkeley.edu;kantneel@berkeley.edu;kavi@berkeley.edu;chrisbender@berkeley.edu;btrabucco@berkeley.edu;rising@google.com;dawnsong@cs.berkeley.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\nshin2018synthetic,\ntitle={Synthetic Datasets for Neural Program Synthesis},\nauthor={Richard Shin and Neel Kant and Kavi Gupta and Chris Bender and Brandon Trabucco and Rishabh Singh and Dawn Song},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryeOSnAqYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "6;6;7", "confidence": "2;4;3", "wc_review": "253;962;549", "wc_reply_reviewers": "28;0;0", "wc_reply_authors": "773;812;579", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 588.0, 290.75877745420973 ], "wc_reply_reviewers_avg": [ 9.333333333333334, 13.199326582148887 ], "wc_reply_authors_avg": [ 721.3333333333334, 101.8964616112301 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7991169696227265299&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=ryeOSnAqYm", "pdf": "https://openreview.net/pdf?id=ryeOSnAqYm", "email": ";;;;;;", "author_num": 7 }, { "id": "ryeX-nC9YQ", "title": "Dimension-Free Bounds for Low-Precision Training", "track": "main", "status": "Reject", "tldr": "we proved dimension-independent bounds for low-precision training algorithms", "abstract": "Low-precision training is a promising way of decreasing the time and energy cost of training machine learning models.\nPrevious work has analyzed low-precision training algorithms, such as low-precision stochastic gradient descent, and derived theoretical bounds on their convergence rates.\nThese bounds tend to depend on the dimension of the model $d$ in that the number of bits needed to achieve a particular error bound increases as $d$ increases.\nThis is undesirable because a motivating application for low-precision training is large-scale models, such as deep learning, where $d$ can be huge.\nIn this paper, we prove dimension-independent bounds for low-precision training algorithms that use fixed-point arithmetic, which lets us better understand what affects the convergence of these algorithms as parameters scale.\nOur methods also generalize naturally to let us prove new convergence bounds on low-precision training with other quantization schemes, such as low-precision floating-point computation and logarithmic quantization.", "keywords": "low precision;stochastic gradient descent", "primary_area": "", "supplementary_material": "", "author": "Zheng Li;Christopher De Sa", "authorids": "lzlz19971997@gmail.com;cdesa@cs.cornell.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", 
"linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nli2019dimensionfree,\ntitle={Dimension-Free Bounds for Low-Precision Training},\nauthor={Zheng Li and Christopher De Sa},\nyear={2019},\nurl={https://openreview.net/forum?id=ryeX-nC9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer4", "site": "https://openreview.net/forum?id=ryeX-nC9YQ", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;3;3", "wc_review": "223;561;407", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "449;469;242", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 397.0, 138.16897866983987 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 386.6666666666667, 102.6201193182355 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15916062818541172055&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "DPSNet: End-to-end Deep Plane Sweep Stereo", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/682", "id": "ryeYHi0ctQ", "author_site": "Sunghoon Im, Hae-Gon Jeon, Stephen Lin, In Kweon", "tldr": "A convolution neural network for multi-view stereo matching whose design is inspired by best practices of traditional geometry-based approaches", "abstract": "Multiview stereo aims to reconstruct scene depth from images acquired by a camera under arbitrary motion. Recent methods address this problem through deep learning, which can utilize semantic cues to deal with challenges such as textureless and reflective regions. In this paper, we present a convolutional neural network called DPSNet (Deep Plane Sweep Network) whose design is inspired by best practices of traditional geometry-based approaches. Rather than directly estimating depth and/or optical flow correspondence from image pairs as done in many previous deep learning methods, DPSNet takes a plane sweep approach that involves building a cost volume from deep features using the plane sweep algorithm, regularizing the cost volume via a context-aware cost aggregation, and regressing the depth map from the cost volume. The cost volume is constructed using a differentiable warping process that allows for end-to-end training of the network. 
Through the effective incorporation of conventional multiview stereo concepts within a deep learning framework, DPSNet achieves state-of-the-art reconstruction results on a variety of challenging datasets.", "keywords": "Deep Learning;Stereo;Depth;Geometry", "primary_area": "", "supplementary_material": "", "author": "Sunghoon Im;Hae-Gon Jeon;Stephen Lin;In So Kweon", "authorids": "dlarl8927@kaist.ac.kr;haegonj@andrew.cmu.edu;stevelin@microsoft.com;iskweon77@kaist.ac.kr", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nim2018dpsnet,\ntitle={{DPSN}et: End-to-end Deep Plane Sweep Stereo},\nauthor={Sunghoon Im and Hae-Gon Jeon and Stephen Lin and In So Kweon},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryeYHi0ctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;7", "confidence": "4;5;4", "wc_review": "166;260;498", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "42;913;660", "reply_reviewers": "0;0;0", "reply_authors": "1;3;3", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 308.0, 139.72353655224543 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 538.3333333333334, 365.8436217232105 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.9428090415820634 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 288, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11110225942792064313&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=ryeYHi0ctQ", "pdf": "https://openreview.net/pdf?id=ryeYHi0ctQ", "email": ";;;", "author_num": 4 }, { "id": "ryeaZhRqFm", "title": "Link Prediction in Hypergraphs using Graph Convolutional Networks", "track": "main", "status": "Reject", "tldr": "We propose Neural Hyperlink Predictor (NHP). NHP adapts graph convolutional networks for link prediction in hypergraphs", "abstract": "Link prediction in simple graphs is a fundamental problem in which new links between nodes are predicted based on the observed structure of the graph. However, in many real-world applications, there is a need to model relationships among nodes which go beyond pairwise associations. For example, in a chemical reaction, relationship among the reactants and products is inherently higher-order. Additionally, there is need to represent the direction from reactants to products. Hypergraphs provide a natural way to represent such complex higher-order relationships. Even though Graph Convolutional Networks (GCN) have recently emerged as a powerful deep learning-based approach for link prediction over simple graphs, their suitability for link prediction in hypergraphs is unexplored -- we fill this gap in this paper and propose Neural Hyperlink Predictor (NHP). NHP adapts GCNs for link prediction in hypergraphs. We propose two variants of NHP --NHP-U and NHP-D -- for link prediction over undirected and directed hypergraphs, respectively. To the best of our knowledge, NHP-D is the first method for link prediction over directed hypergraphs. 
Through extensive experiments on multiple real-world datasets, we show NHP's effectiveness.", "keywords": "Graph convolution;hypergraph;hyperlink prediction", "primary_area": "", "supplementary_material": "", "author": "Naganand Yadati;Vikram Nitin;Madhav Nimishakavi;Prateek Yadav;Anand Louis;Partha Talukdar", "authorids": "y.naganand@gmail.com;vikramnitin9@gmail.com;madhav@iisc.ac.in;prateekyadav@iisc.ac.in;anandl@iisc.ac.in;ppt@iisc.ac.in", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nyadati2019link,\ntitle={Link Prediction in Hypergraphs using Graph Convolutional Networks},\nauthor={Naganand Yadati and Vikram Nitin and Madhav Nimishakavi and Prateek Yadav and Anand Louis and Partha Talukdar},\nyear={2019},\nurl={https://openreview.net/forum?id=ryeaZhRqFm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryeaZhRqFm", "pdf_size": 0, "rating": "4;5;6", "confidence": "5;4;2", "wc_review": "157;369;691", "wc_reply_reviewers": "0;111;65", "wc_reply_authors": "480;614;721", "reply_reviewers": "0;1;1", "reply_authors": "1;2;3", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 405.6666666666667, 219.5409351857239 ], "wc_reply_reviewers_avg": [ 58.666666666666664, 45.536310297997964 ], "wc_reply_authors_avg": [ 605.0, 98.59344129639996 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=188949741057813690&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ryeh4jA9F7", "title": "Playing the Game of Universal Adversarial Perturbations", "track": "main", "status": "Reject", "tldr": "We propose a robustification method under the presence of universal adversarial perturbations, by connecting a game theoretic method (fictitious play) with the problem of robustification, and making it more scalable.", "abstract": "We study the problem of learning classifiers robust to universal adversarial perturbations. While prior work approaches this problem via robust optimization, adversarial training, or input transformation, we instead phrase it as a two-player zero-sum game. In this new formulation, both players simultaneously play the same game, where one player chooses a classifier that minimizes a classification loss whilst the other player creates an adversarial perturbation that increases the same loss when applied to every sample in the training set.\nBy observing that performing a classification (respectively creating adversarial samples) is the best response to the other player, we propose a novel extension of a game-theoretic algorithm, namely fictitious play, to the domain of training robust classifiers. 
Finally, we empirically show the robustness and versatility of our approach in two defence scenarios where universal attacks are performed on several image classification datasets -- CIFAR10, CIFAR100 and ImageNet.", "keywords": "adversarial perturbations;universal adversarial perturbations;game theory;robust machine learning", "primary_area": "", "supplementary_material": "", "author": "Julien Perolet;Mateusz Malinowski;Bilal Piot;Olivier Pietquin", "authorids": "perolat@google.com;mateuszm@google.com;piot@google.com;pietquin@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nperolet2019playing,\ntitle={Playing the Game of Universal Adversarial Perturbations},\nauthor={Julien Perolet and Mateusz Malinowski and Bilal Piot and Olivier Pietquin},\nyear={2019},\nurl={https://openreview.net/forum?id=ryeh4jA9F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryeh4jA9F7", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;4;1", "wc_review": "239;295;443", "wc_reply_reviewers": "134;194;0", "wc_reply_authors": "358;353;192", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 325.6666666666667, 86.05941100322626 ], "wc_reply_reviewers_avg": [ 109.33333333333333, 81.09802025258625 ], "wc_reply_authors_avg": [ 301.0, 77.10166448700487 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3389077351760816259&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4 }, { "id": "ryekdoCqF7", "title": "Incremental training of multi-generative adversarial networks", "track": "main", "status": "Reject", "tldr": "We propose a new method to incrementally train a mixture generative model to approximate the information projection of the real data distribution.", "abstract": "Generative neural networks map a standard, possibly distribution to a complex high-dimensional distribution, which represents the real world data set. However, a determinate input distribution as well as a specific architecture of neural networks may impose limitations on capturing the diversity in the high dimensional target space. To resolve this difficulty, we propose a training framework that greedily produce a series of generative adversarial networks that incrementally capture the diversity of the target space. 
We show theoretically and empirically that our training algorithm converges to the theoretically optimal distribution, the projection of the real distribution onto the convex hull of the network's distribution space.", "keywords": "GAN;Incremental training;Information projection;Mixture distribution", "primary_area": "", "supplementary_material": "", "author": "Qi Tan;Pingzhong Tang;Ke Xu;Weiran Shen;Song Zuo", "authorids": "thunderingtan@gmail.com;kenshinping@gmail.com;xuke@tsinghua.edu.cn;emersonswr@gmail.com;songzuo.z@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\ntan2019incremental,\ntitle={Incremental training of multi-generative adversarial networks},\nauthor={Qi Tan and Pingzhong Tang and Ke Xu and Weiran Shen and Song Zuo},\nyear={2019},\nurl={https://openreview.net/forum?id=ryekdoCqF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryekdoCqF7", "pdf_size": 0, "rating": "5;6;6", "confidence": "3;3;4", "wc_review": "303;244;137", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "624;313;394", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 228.0, 68.70710783220807 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 443.6666666666667, 131.73289473611197 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:9AdVzOUDSkMJ:scholar.google.com/&scioq=Incremental+training+of+multi-generative+adversarial+networks&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "ryemosC9tm", "title": "Representation-Constrained Autoencoders and an Application to Wireless Positioning", "track": "main", "status": "Reject", "tldr": "We propose to impose representation constraints to autoencoders in order to localize wireless transmitters in space from their channel state information. ", "abstract": "In a number of practical applications that rely on dimensionality reduction, the dataset or measurement process provides valuable side information that can be incorporated when learning low-dimensional embeddings. We propose the inclusion of pairwise representation constraints into autoencoders (AEs) with the goal of promoting application-specific structure. We use synthetic results to show that only a small amount of AE representation constraints are required to substantially improve the local and global neighborhood preserving properties of the learned embeddings. To demonstrate the efficacy of our approach and to illustrate a practical application that naturally provides such representation constraints, we focus on wireless positioning using a recently proposed channel charting framework. We show that representation-constrained AEs recover the global geometry of the learned low-dimensional representations, which enables channel charting to perform approximate positioning without access to global navigation satellite systems or supervised learning methods that rely on extensive measurement campaigns. 
", "keywords": "Autoencoder;dimensionality reduction;wireless positioning;channel charting;localization", "primary_area": "", "supplementary_material": "", "author": "Pengzhi Huang;Emre Gonultas;Said Medjkouh;Oscar Castaneda;Olav Tirkkonen;Tom Goldstein;Christoph Studer", "authorids": "ph448@cornell.edu;eg566@cornell.edu;sm2685@cornell.edu;oc66@cornell.edu;olav.tirkkonen@aalto.fi;tomg@cs.umd.edu;studer@cornell.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nhuang2019representationconstrained,\ntitle={Representation-Constrained Autoencoders and an Application to Wireless Positioning},\nauthor={Pengzhi Huang and Emre Gonultas and Said Medjkouh and Oscar Castaneda and Olav Tirkkonen and Tom Goldstein and Christoph Studer},\nyear={2019},\nurl={https://openreview.net/forum?id=ryemosC9tm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryemosC9tm", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;2", "wc_review": "569;139;191", "wc_reply_reviewers": "0;81;0", "wc_reply_authors": "754;385;197", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 299.6666666666667, 191.6269524072459 ], "wc_reply_reviewers_avg": [ 27.0, 38.18376618407357 ], "wc_reply_authors_avg": [ 445.3333333333333, 231.36166973425443 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12974964158215899385&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ryeoxnRqKQ", "title": "NATTACK: A STRONG AND UNIVERSAL GAUSSIAN BLACK-BOX ADVERSARIAL ATTACK", "track": "main", "status": "Reject", "tldr": "", "abstract": "Recent works find that DNNs are vulnerable to adversarial examples, whose changes from the benign ones are imperceptible and yet lead DNNs to make wrong predictions. One can find various adversarial examples for the same input to a DNN using different attack methods. In other words, there is a population of adversarial examples, instead of only one, for any input to a DNN. By explicitly modeling this adversarial population with a Gaussian distribution, we propose a new black-box attack called NATTACK. The adversarial attack is hence formalized as an optimization problem, which searches the mean of the Gaussian under the guidance of increasing the target DNN's prediction error. NATTACK achieves 100% attack success rate on six out of eleven recently published defense methods (and greater than 90% for four), all using the same algorithm. Such results are on par with or better than powerful state-of-the-art white-box attacks. While the white-box attacks are often model-specific or defense-specific, the proposed black-box NATTACK is universally applicable to different defenses. 
", "keywords": "adversarial attack;black-box;evolutional strategy;policy gradient", "primary_area": "", "supplementary_material": "", "author": "Yandong Li;Lijun Li;Liqiang Wang;Tong Zhang;Boqing Gong", "authorids": "lyndon.leeseu@outlook.com;lilijun1990@buaa.edu.cn;lwang@cs.ucf.edu;bradymzhang@tencent.com;boqinggo@outlook.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019nattack,\ntitle={{NATTACK}: A {STRONG} {AND} {UNIVERSAL} {GAUSSIAN} {BLACK}-{BOX} {ADVERSARIAL} {ATTACK}},\nauthor={Yandong Li and Lijun Li and Liqiang Wang and Tong Zhang and Boqing Gong},\nyear={2019},\nurl={https://openreview.net/forum?id=ryeoxnRqKQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryeoxnRqKQ", "pdf_size": 0, "rating": "4;4;7", "confidence": "5;3;3", "wc_review": "456;329;120", "wc_reply_reviewers": "605;0;0", "wc_reply_authors": "1098;458;38", "reply_reviewers": "3;0;0", "reply_authors": "5;1;1", "rating_avg": [ 5.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 301.6666666666667, 138.52637133613064 ], "wc_reply_reviewers_avg": [ 201.66666666666666, 285.1997350785742 ], "wc_reply_authors_avg": [ 531.3333333333334, 435.83891009816415 ], "reply_reviewers_avg": [ 1.0, 1.4142135623730951 ], "reply_authors_avg": [ 2.3333333333333335, 1.8856180831641267 ], "replies_avg": [ 33, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.5000000000000001, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9676090578408989696&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Conditional Network Embeddings", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/812", "id": "ryepUj0qtX", "author_site": "Bo Kang, Jefrey Lijffijt, Tijl De Bie", "tldr": "We introduce a network embedding method that accounts for prior information about the network, yielding superior empirical performance.", "abstract": "Network Embeddings (NEs) map the nodes of a given network into $d$-dimensional Euclidean space $\\mathbb{R}^d$. Ideally, this mapping is such that 'similar' nodes are mapped onto nearby points, such that the NE can be used for purposes such as link prediction (if 'similar' means being 'more likely to be connected') or classification (if 'similar' means 'being more likely to have the same label'). In recent years various methods for NE have been introduced, all following a similar strategy: defining a notion of similarity between nodes (typically some distance measure within the network), a distance measure in the embedding space, and a loss function that penalizes large distances for similar nodes and small distances for dissimilar nodes.\n\nA difficulty faced by existing methods is that certain networks are fundamentally hard to embed due to their structural properties: (approximate) multipartiteness, certain degree distributions, assortativity, etc. To overcome this, we introduce a conceptual innovation to the NE literature and propose to create \\emph{Conditional Network Embeddings} (CNEs); embeddings that maximally add information with respect to given structural properties (e.g. node degrees, block densities, etc.). 
We use a simple Bayesian approach to achieve this, and propose a block stochastic gradient descent algorithm for fitting it efficiently.\n\nWe demonstrate that CNEs are superior for link prediction and multi-label classification when compared to state-of-the-art methods, and this without adding significant mathematical or computational complexity. Finally, we illustrate the potential of CNE for network visualization.", "keywords": "Network embedding;graph embedding;learning node representations;link prediction;multi-label classification of nodes", "primary_area": "", "supplementary_material": "", "author": "Bo Kang;Jefrey Lijffijt;Tijl De Bie", "authorids": "bo.kang@ugent.be;jefrey.lijffijt@ugent.be;tijl.debie@ugent.be", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nkang2018conditional,\ntitle={Conditional Network Embeddings},\nauthor={Bo Kang and Jefrey Lijffijt and Tijl De Bie},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryepUj0qtX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;3", "wc_review": "41;178;326", "wc_reply_reviewers": "0;0;32", "wc_reply_authors": "288;788;448", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 181.66666666666666, 116.37964694147436 ], "wc_reply_reviewers_avg": [ 10.666666666666666, 15.084944665313014 ], "wc_reply_authors_avg": [ 508.0, 208.48661028149186 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 60, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11198262175221248724&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=ryepUj0qtX", "pdf": "https://openreview.net/pdf?id=ryepUj0qtX", "email": ";;", "author_num": 3 }, { "title": "Defensive Quantization: When Efficiency Meets Robustness", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/863", "id": "ryetZ20ctX", "author_site": "Ji Lin, Chuang Gan, Song Han", "tldr": "We designed a novel quantization methodology to jointly optimize the efficiency and robustness of deep learning models.", "abstract": "Neural network quantization is becoming an industry standard to efficiently deploy deep learning models on hardware platforms, such as CPU, GPU, TPU, and FPGAs. However, we observe that the conventional quantization approaches are vulnerable to adversarial attacks. This paper aims to raise people's awareness about the security of the quantized models, and we designed a novel quantization methodology to jointly optimize the efficiency and robustness of deep learning models. We first conduct an empirical study to show that vanilla quantization suffers more from adversarial attacks. We observe that the inferior robustness comes from the error amplification effect, where the quantization operation further enlarges the distance caused by amplified noise. 
Then we propose a novel Defensive Quantization (DQ) method by controlling the Lipschitz constant of the network during quantization, such that the magnitude of the adversarial noise remains non-expansive during inference. Extensive experiments on CIFAR-10 and SVHN datasets demonstrate that our new quantization method can defend neural networks against adversarial examples, and even achieves superior robustness than their full-precision counterparts, while maintaining the same hardware efficiency as vanilla quantization approaches. As a by-product, DQ can also improve the accuracy of quantized models without adversarial attack. ", "keywords": "defensive quantization;model quantization;adversarial attack;efficiency;robustness", "primary_area": "", "supplementary_material": "", "author": "Ji Lin;Chuang Gan;Song Han", "authorids": "jilin@mit.edu;ganchuang1990@gmail.com;songhan@mit.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nlin2018defensive,\ntitle={Defensive Quantization: When Efficiency Meets Robustness},\nauthor={Ji Lin and Chuang Gan and Song Han},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryetZ20ctX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer4", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;2;4", "wc_review": "327;43;429", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "642;0;239", "reply_reviewers": "0;0;0", "reply_authors": "1;0;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 266.3333333333333, 163.31836666122058 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 293.6666666666667, 264.9305988786917 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "openreview": "https://openreview.net/forum?id=ryetZ20ctX", "pdf": "https://openreview.net/pdf?id=ryetZ20ctX", "email": ";;", "author_num": 3 }, { "id": "ryewE3R5YX", "title": "Characterizing Attacks on Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Deep Reinforcement learning (DRL) has achieved great success in various applications, such as playing computer games and controlling robotic manipulation. However, recent studies show that machine learning models are vulnerable to adversarial examples, which are carefully crafted instances that aim to mislead learning models to make arbitrarily incorrect prediction, and raised severe security concerns. DRL has been attacked by adding perturbation to each observed frame. However, such observation based attacks are not quite realistic considering that it would be hard for adversaries to directly manipulate pixel values in practice. Therefore, we propose to understand the vulnerabilities of DRL from various perspectives and provide a throughout taxonomy of adversarial perturbation against DRL, and we conduct the first experiments on unexplored parts of this taxonomy. In addition to current observation based attacks against DRL, we propose attacks based on the actions and environment dynamics. 
Among these experiments, we introduce a novel sequence-based attack to attack a sequence of frames for real-time scenarios such as autonomous driving, and the first targeted attack that perturbs environment dynamics to let the agent fail in a specific way. We show empirically that our sequence-based attack can generate effective perturbations in a blackbox setting in real time with a small number of queries, independent of episode length. We conduct extensive experiments to compare the effectiveness of different attacks with several baseline attack methods in several game playing, robotics control, and autonomous driving environments.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Chaowei Xiao;Xinlei Pan;Warren He;Bo Li;Jian Peng;Mingjie Sun;Jinfeng Yi;Mingyan Liu;Dawn Song.", "authorids": "xiaocw@umich.edu;xinleipan@berkeley.edu;_w@eecs.berkeley.edu;lxbosky@gmail.com;jianpeng@illinois.edu;sunmj15@mails.tsinghua.com;jinfengyi.ustc@gmail.com;mingyan@umich.edu;dawnsong@gmail.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "@misc{\nxiao2019characterizing,\ntitle={Characterizing Attacks on Deep Reinforcement Learning},\nauthor={Chaowei Xiao and Xinlei Pan and Warren He and Bo Li and Jian Peng and Mingjie Sun and Jinfeng Yi and Mingyan Liu and Dawn Song.},\nyear={2019},\nurl={https://openreview.net/forum?id=ryewE3R5YX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryewE3R5YX", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;3", "wc_review": "471;248;672", "wc_reply_reviewers": "155;98;91", "wc_reply_authors": "953;743;882", "reply_reviewers": "1;1;1", "reply_authors": "2;2;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 463.6666666666667, 173.17492761816172 ], "wc_reply_reviewers_avg": [ 114.66666666666667, 28.662790435607548 ], "wc_reply_authors_avg": [ 859.3333333333334, 87.217480408969 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 87, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2082865532339280385&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "ryeyti0qKX", "title": "On the Statistical and Information Theoretical Characteristics of DNN Representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "It has been common to argue or imply that a regularizer can be used to alter a statistical property of a hidden layer's representation and thus improve generalization or performance of deep networks. For instance, dropout has been known to improve performance by reducing co-adaptation, and representational sparsity has been argued as a good characteristic because many data-generation processes have only a small number of factors that are independent. In this work, we analytically and empirically investigate the popular characteristics of learned representations, including correlation, sparsity, dead unit, rank, and mutual information, and disprove many of the \\textit{conventional wisdom}. 
We first show that infinitely many Identical Output Networks (IONs) can be constructed for any deep network with a linear layer, where any invertible affine transformation can be applied to alter the layer's representation characteristics. The existence of ION proves that the correlation characteristics of representation can be either low or high for a well-performing network. Extensions to ReLU layers are provided, too. Then, we consider sparsity, dead unit, and rank to show that only loose relationships exist among the three characteristics. It is shown that a higher sparsity or additional dead units do not imply a better or worse performance when the rank of representation is fixed. We also develop a rank regularizer and show that neither representation sparsity nor lower rank is helpful for improving performance even when the data-generation process has only a small number of independent factors. Mutual information $I(\\z_l;\\x)$ and $I(\\z_l;\\y)$ are investigated as well, and we show that regularizers can affect $I(\\z_l;\\x)$ and thus indirectly influence the performance. Finally, we explain how a rich set of regularizers can be used as a powerful tool for performance tuning. ", "keywords": "learned representation;statistical characteristics;information theoretical characteristics;deep network", "primary_area": "", "supplementary_material": "", "author": "Daeyoung Choi;Wonjong Rhee;Kyungeun Lee;Changho Shin", "authorids": "choid@snu.ac.kr;wrhee@snu.ac.kr;ruddms0415@snu.ac.kr;chshin@encoredtech.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchoi2019on,\ntitle={On the Statistical and Information Theoretical Characteristics of {DNN} Representations},\nauthor={Daeyoung Choi and Wonjong Rhee and Kyungeun Lee and Changho Shin},\nyear={2019},\nurl={https://openreview.net/forum?id=ryeyti0qKX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryeyti0qKX", "pdf_size": 0, "rating": "3;4;5", "confidence": "3;4;3", "wc_review": "791;161;341", "wc_reply_reviewers": "699;3;38", "wc_reply_authors": "840;88;166", "reply_reviewers": "1;1;1", "reply_authors": "3;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 431.0, 264.9528259898354 ], "wc_reply_reviewers_avg": [ 246.66666666666666, 320.1669703278102 ], "wc_reply_authors_avg": [ 364.6666666666667, 337.61648195680385 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:C_x2AWU_ZuMJ:scholar.google.com/&scioq=On+the+Statistical+and+Information+Theoretical+Characteristics+of+DNN+Representations&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "title": "GO Gradient for Expectation-Based Objectives", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/741", "id": "ryf6Fs09YX", "author_site": "Yulai Cong, Miaoyun Zhao, Ke Bai, Lawrence Carin", "tldr": "a Rep-like gradient for non-reparameterizable continuous/discrete distributions; further generalized to deep probabilistic models, yielding statistical back-propagation", "abstract": "Within many machine learning algorithms, a fundamental 
problem concerns efficient calculation of an unbiased gradient wrt parameters $\\boldsymbol{\\gamma}$ for expectation-based objectives $\\mathbb{E}_{q_{\\boldsymbol{\\gamma}} (\\boldsymbol{y})} [f (\\boldsymbol{y}) ]$. Most existing methods either ($i$) suffer from high variance, seeking help from (often) complicated variance-reduction techniques; or ($ii$) they only apply to reparameterizable continuous random variables and employ a reparameterization trick. To address these limitations, we propose a General and One-sample (GO) gradient that ($i$) applies to many distributions associated with non-reparameterizable continuous {\\em or} discrete random variables, and ($ii$) has the same low-variance as the reparameterization trick. We find that the GO gradient often works well in practice based on only one Monte Carlo sample (although one can of course use more samples if desired). Alongside the GO gradient, we develop a means of propagating the chain rule through distributions, yielding statistical back-propagation, coupling neural networks to common random variables.", "keywords": "generalized reparameterization gradient;variance reduction;non-reparameterizable;discrete random variable;GO gradient;general and one-sample gradient;expectation-based objective;variable nabla;statistical back-propagation;hierarchical;graphical model", "primary_area": "", "supplementary_material": "", "author": "Yulai Cong;Miaoyun Zhao;Ke Bai;Lawrence Carin", "authorids": "yulaicong@gmail.com;miaoyun9zhao@gmail.com;ke.bai@duke.edu;lcarin@duke.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\ncong2018go,\ntitle={{GO} Gradient for Expectation-Based Objectives},\nauthor={Yulai Cong and Miaoyun Zhao and Ke Bai and Lawrence Carin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryf6Fs09YX},\n}", "github": "[![github](/images/github_icon.svg) YulaiCong/GOgradient](https://github.com/YulaiCong/GOgradient)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "277;727;458", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "725;763;600", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 487.3333333333333, 184.87893936904285 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 696.0, 69.6323679524592 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13295613950307692271&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ryf6Fs09YX", "pdf": "https://openreview.net/pdf?id=ryf6Fs09YX", "email": ";;;", "author_num": 4 }, { "title": "h-detach: Modifying the LSTM Gradient Towards Better Optimization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/683", "id": "ryf7ioRqFX", "author_site": "Bhargav Kanuparthi, Devansh Arpit, Giancarlo Kerg, Nan Rosemary Ke, Ioannis Mitliagkas, Yoshua Bengio", "tldr": "A simple algorithm to improve optimization and handling of long term dependencies in LSTM", "abstract": 
"Recurrent neural networks are known for their notorious exploding and vanishing gradient problem (EVGP). This problem becomes more evident in tasks where the information needed to correctly solve them exist over long time scales, because EVGP prevents important gradient components from being back-propagated adequately over a large number of steps. We introduce a simple stochastic algorithm (\\textit{h}-detach) that is specific to LSTM optimization and targeted towards addressing this problem. Specifically, we show that when the LSTM weights are large, the gradient components through the linear path (cell state) in the LSTM computational graph get suppressed. Based on the hypothesis that these components carry information about long term dependencies (which we show empirically), their suppression can prevent LSTMs from capturing them. Our algorithm\\footnote{Our code is available at https://github.com/bhargav104/h-detach.} prevents gradients flowing through this path from getting suppressed, thus allowing the LSTM to capture such dependencies better. We show significant improvements over vanilla LSTM gradient based training in terms of convergence speed, robustness to seed and learning rate, and generalization using our modification of LSTM gradient on various benchmark datasets.", "keywords": "LSTM;Optimization;Long term dependencies;Back-propagation through time", "primary_area": "", "supplementary_material": "", "author": "Bhargav Kanuparthi;Devansh Arpit;Giancarlo Kerg;Nan Rosemary Ke;Ioannis Mitliagkas;Yoshua Bengio", "authorids": "bhargavkanuparthi25@gmail.com;devansharpit@gmail.com;giancarlo.kerg@gmail.com;rosemary.nan.ke@gmail.com;ioannis@iro.umontreal.ca;yoshua.umontreal@gmail.com", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@inproceedings{\nkanuparthi2018hdetach,\ntitle={h-detach: Modifying the {LSTM} Gradient Towards Better Optimization},\nauthor={Bhargav Kanuparthi and Devansh Arpit and Giancarlo Kerg and Nan Rosemary Ke and Ioannis Mitliagkas and Yoshua Bengio},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryf7ioRqFX},\n}", "github": "[![github](/images/github_icon.svg) bhargav104/h-detach](https://github.com/bhargav104/h-detach)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;3;5", "wc_review": "208;383;775", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "288;490;426", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 455.3333333333333, 237.06023613325698 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 401.3333333333333, 84.2905029578593 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 13, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": 0.5, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=762520068872474914&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ryf7ioRqFX", "pdf": "https://openreview.net/pdf?id=ryf7ioRqFX", "email": ";;;;;", "author_num": 6 }, { "id": "ryfDoiR5Ym", "title": "Fatty and Skinny: A Joint Training Method of Watermark Encoder and Decoder", "track": "main", "status": 
"Withdraw", "tldr": "We propose a novel watermark encoder-decoder neural networks. They perform a cooperative game to define their own watermarking scheme. People do not need to design watermarking methods any more.", "abstract": "Watermarks have been used for various purposes. Recently, researchers started to look into using them for deep neural networks. Some works try to hide attack triggers on their adversarial samples when attacking neural networks and others want to watermark neural networks to prove their ownership against plagiarism. Implanting a backdoor watermark module into a neural network is getting more attention from the community. In this paper, we present a general purpose encoder-decoder joint training method, inspired by generative adversarial networks (GANs). Unlike GANs, however, our encoder and decoder neural networks cooperate to find the best watermarking scheme given data samples. In other words, we do not design any new watermarking strategy but our proposed two neural networks will find the best suited method on their own. After being trained, the decoder can be implanted into other neural networks to attack or protect them (see Appendix for their use cases and real implementations). To this end, the decoder should be very tiny in order not to incur any overhead when attached to other neural networks but at the same time provide very high decoding success rates, which is very challenging. Our joint training method successfully solves the problem and in our experiments maintain almost 100\\% encoding-decoding success rates for multiple datasets with very little modifications on data samples to hide watermarks. We also present several real-world use cases in Appendix.", "keywords": "Adversarial Machine Learning;Watermarking;Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Sanghyun Hong;Mahmoud Mohammadi;Noseong Park", "authorids": "shhong@cs.umd.edu;mmoham12@uncc.edu;npark9@gmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryfDoiR5Ym", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "wc_review": "965;504;266", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 578.3333333333334, 290.16585295692914 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EJdggsAQjkQJ:scholar.google.com/&scioq=Fatty+and+Skinny:+A+Joint+Training+Method+of+Watermark+Encoder+and+Decoder&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "An analytic theory of generalization dynamics and transfer learning in deep linear networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/798", "id": "ryfMLoCqtQ", "author_site": "Andrew Lampinen, Surya Ganguli", "tldr": "We provide many insights into neural network generalization from the theoretically tractable linear case.", "abstract": "Much attention has been devoted recently to the generalization 
puzzle in deep learning: large, deep networks can generalize well, but existing theories bounding generalization error are exceedingly loose, and thus cannot explain this striking performance. Furthermore, a major hope is that knowledge may transfer across tasks, so that multi-task learning can improve generalization on individual tasks. However we lack analytic theories that can quantitatively predict how the degree of knowledge transfer depends on the relationship between the tasks. We develop an analytic theory of the nonlinear dynamics of generalization in deep linear networks, both within and across tasks. In particular, our theory provides analytic solutions to the training and testing error of deep networks as a function of training time, number of examples, network size and initialization, and the task structure and SNR. Our theory reveals that deep networks progressively learn the most important task structure first, so that generalization error at the early stopping time primarily depends on task structure and is independent of network size. This suggests any tight bound on generalization error must take into account task structure, and explains observations about real data being learned faster than random data. Intriguingly our theory also reveals the existence of a learning algorithm that proveably out-performs neural network training through gradient descent. Finally, for transfer learning, our theory reveals that knowledge transfer depends sensitively, but computably, on the SNRs and input feature alignments of pairs of tasks.", "keywords": "Generalization;Theory;Transfer;Multi-task;Linear", "primary_area": "", "supplementary_material": "", "author": "Andrew K. Lampinen;Surya Ganguli", "authorids": "lampinen@stanford.edu;sganguli@stanford.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nlampinen2018an,\ntitle={An analytic theory of generalization dynamics and transfer learning in deep linear networks},\nauthor={Andrew K. 
Lampinen and Surya Ganguli},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryfMLoCqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;4;4", "wc_review": "774;283;123", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "560;154;394", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 393.3333333333333, 276.98415518260646 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 369.3333333333333, 166.66399997866634 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 136, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=358108946105258258&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=ryfMLoCqtQ", "pdf": "https://openreview.net/pdf?id=ryfMLoCqtQ", "email": ";", "author_num": 2 }, { "id": "ryfaViR9YX", "title": "Variation Network: Learning High-level Attributes for Controlled Input Manipulation", "track": "main", "status": "Reject", "tldr": "The Variation Network is a generative model able to learn high-level attributes without supervision that can then be used for controlled input manipulation.", "abstract": "This paper presents the Variation Network (VarNet), a generative model providing means to manipulate the high-level attributes of a given input. The originality of our approach is that VarNet is not only capable of handling pre-defined attributes but can also learn the relevant attributes of the dataset by itself. These two settings can be easily combined which makes VarNet applicable for a wide variety of tasks. Further, VarNet has a sound probabilistic interpretation which grants us with a novel way to navigate in the latent spaces as well as means to control how the attributes are learned. 
We demonstrate experimentally that this model is capable of performing interesting input manipulation and that the learned attributes are relevant and interpretable.", "keywords": "Generative models;Input manipulation;Unsupervised feature learning;Variations", "primary_area": "", "supplementary_material": "", "author": "Ga\u00ebtan Hadjeres", "authorids": "hadjeres.g@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nhadjeres2019variation,\ntitle={Variation Network: Learning High-level Attributes for Controlled Input Manipulation},\nauthor={Ga\u00ebtan Hadjeres},\nyear={2019},\nurl={https://openreview.net/forum?id=ryfaViR9YX},\n}", "github": "[![github](/images/github_icon.svg) Ghadjeres/VarNet](https://github.com/Ghadjeres/VarNet)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryfaViR9YX", "pdf_size": 0, "rating": "3;4;6", "confidence": "4;3;2", "wc_review": "293;186;172", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 217.0, 54.043192602460735 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": -0.9819805060619659, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12303004176078603456&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "ryfcCo0ctQ", "title": "Convergent Reinforcement Learning with Function Approximation: A Bilevel Optimization Perspective", "track": "main", "status": "Reject", "tldr": "", "abstract": " We study reinforcement learning algorithms with nonlinear function approximation in the online setting. By formulating both the problems of value function estimation and policy learning as bilevel optimization problems, we propose online Q-learning and actor-critic algorithms for these two problems respectively. Our algorithms are gradient-based methods and thus are computationally efficient. Moreover, by approximating the iterates using differential equations, we establish convergence guarantees for the proposed algorithms. 
Thorough numerical experiments are conducted to back up our theory.", "keywords": "reinforcement learning;Deep Q-networks;actor-critic algorithm;ODE approximation", "primary_area": "", "supplementary_material": "", "author": "Zhuoran Yang;Zuyue Fu;Kaiqing Zhang;Zhaoran Wang", "authorids": "zy6@princeton.edu;zuyuefu2022@u.northwestern.edu;kzhang66@illinois.edu;zhaoranwang@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyang2019convergent,\ntitle={Convergent Reinforcement Learning with Function Approximation: A Bilevel Optimization Perspective},\nauthor={Zhuoran Yang and Zuyue Fu and Kaiqing Zhang and Zhaoran Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=ryfcCo0ctQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer4;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryfcCo0ctQ", "pdf_size": 0, "rating": "5;5;6;6", "confidence": "4;3;4;4", "wc_review": "827;226;393;253", "wc_reply_reviewers": "0;176;83;0", "wc_reply_authors": "1040;1092;388;336", "reply_reviewers": "0;2;1;0", "reply_authors": "2;2;1;1", "rating_avg": [ 5.5, 0.5 ], "confidence_avg": [ 3.75, 0.4330127018922193 ], "wc_review_avg": [ 424.75, 240.73468279414996 ], "wc_reply_reviewers_avg": [ 64.75, 72.62015904692029 ], "wc_reply_authors_avg": [ 714.0, 352.95892112255785 ], "reply_reviewers_avg": [ 0.75, 0.82915619758885 ], "reply_authors_avg": [ 1.5, 0.5 ], "replies_avg": [ 15, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.5773502691896257, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11487241735741608529&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ryfz73C9KQ", "title": "Neural Predictive Belief Representations", "track": "main", "status": "Reject", "tldr": "We investigate the quality of belief state representations of partially observable dynamic environments learned with modern neural architectures.", "abstract": "Unsupervised representation learning has succeeded with excellent results in many applications. It is an especially powerful tool to learn a good representation of environments with partial or noisy observations. In partially observable domains it is important for the representation to encode a belief state---a sufficient statistic of the observations seen so far. In this paper, we investigate whether it is possible to learn such a belief representation using modern neural architectures. Specifically, we focus on one-step frame prediction and two variants of contrastive predictive coding (CPC) as the objective functions to learn the representations. To evaluate these learned representations, we test how well they can predict various pieces of information about the underlying state of the environment, e.g., position of the agent in a 3D maze. We show that all three methods are able to learn belief representations of the environment---they encode not only the state information, but also its uncertainty, a crucial aspect of belief states. We also find that for CPC multi-step predictions and action-conditioning are critical for accurate belief representations in visually complex environments. 
The ability of neural representations to capture the belief information has the potential to spur new advances for learning and planning in partially observable domains, where leveraging uncertainty is essential for optimal decision making.", "keywords": "belief states;representation learning;contrastive predictive coding;reinforcement learning;predictive state representations;deep reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Zhaohan Daniel Guo;Mohammad Gheshlaghi Azar;Bilal Piot;Bernardo Avila Pires;R\u00e9mi Munos", "authorids": "z.daniel.guo@gmail.com;mazar@google.com;piot@google.com;bavilapires@google.com;munos@google.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nguo2019neural,\ntitle={Neural Predictive Belief Representations},\nauthor={Zhaohan Daniel Guo and Mohammad Gheshlaghi Azar and Bilal Piot and Bernardo Avila Pires and R\u00e9mi Munos},\nyear={2019},\nurl={https://openreview.net/forum?id=ryfz73C9KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryfz73C9KQ", "pdf_size": 0, "rating": "4;5;7", "confidence": "3;3;4", "wc_review": "469;363;479", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "727;489;351", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 437.0, 52.48491846870553 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 522.3333333333334, 155.3004256987798 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.944911182523068, "gs_citation": 101, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4227486234570163658&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "id": "rygFmh0cKm", "title": "On Difficulties of Probability Distillation", "track": "main", "status": "Reject", "tldr": "We point out an optimization issue of distillation with KL divergence, and explore different alternatives", "abstract": "Probability distillation has recently been of interest to deep learning practitioners as it presents a practical solution for sampling from autoregressive models for deployment in real-time applications. We identify a pathological optimization issue with the commonly adopted stochastic minimization of the (reverse) KL divergence, owing to sparse gradient signal from the teacher model due to curse of dimensionality. We also explore alternative principles for distillation, and show that one can achieve qualitatively better results than with KL minimization. 
\n", "keywords": "Probability distillation;Autoregressive models;normalizing flows;wavenet;pixelcnn", "primary_area": "", "supplementary_material": "", "author": "Chin-Wei Huang;Faruk Ahmed;Kundan Kumar;Alexandre Lacoste;Aaron Courville", "authorids": "chin-wei.huang@umontreal.ca;faruk.ahmed.91@gmail.com;kundankumar2510@gmail.com;allac@elementai.com;aaron.courville@gmail.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nhuang2019on,\ntitle={On Difficulties of Probability Distillation},\nauthor={Chin-Wei Huang and Faruk Ahmed and Kundan Kumar and Alexandre Lacoste and Aaron Courville},\nyear={2019},\nurl={https://openreview.net/forum?id=rygFmh0cKm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=rygFmh0cKm", "pdf_size": 0, "rating": "5;5;7", "confidence": "4;5;2", "wc_review": "238;433;269", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "379;504;166", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 313.3333333333333, 85.55829721436854 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 349.6666666666667, 139.53812700795754 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.9449111825230679, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:mwBbyWLHB3YJ:scholar.google.com/&scioq=On+Difficulties+of+Probability+Distillation&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rygPUoR9YQ", "title": "Compositional GAN: Learning Conditional Image Composition", "track": "main", "status": "Withdraw", "tldr": "We develop a novel approach to model object compositionality in images in a GAN framework.", "abstract": "Generative Adversarial Networks (GANs) can produce images of surprising complexity and realism, but are generally structured to sample from a single latent source ignoring the explicit spatial interaction between multiple entities that could be present in a scene. Capturing such complex interactions between different objects in the world, including their relative scaling, spatial layout, occlusion, or viewpoint transformation is a challenging problem. In this work, we propose to model object composition in a GAN framework as a self-consistent composition-decomposition network. Our model is conditioned on the object images from their marginal distributions and can generate a realistic image from their joint distribution. We evaluate our model through qualitative experiments and user evaluations in scenarios when either paired or unpaired examples for the individual object images and the joint scenes are given during training. 
Our results reveal that the learned model captures potential interactions between the two object domains given as input to output new instances of composed scene at test time in a reasonable fashion.", "keywords": "Image Composition;GAN;Conditional Image generation", "primary_area": "", "supplementary_material": "", "author": "Samaneh Azadi;Deepak Pathak;Sayna Ebrahimi;Trevor Darrell", "authorids": "sazadi@berkeley.edu;pathak@berkeley.edu;sayna@berkeley.edu;trevor@eecs.berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rygPUoR9YQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "wc_review": "540;604;308", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "553;1132;540", "reply_reviewers": "0;0;0", "reply_authors": "1;2;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 484.0, 127.16393618737455 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 741.6666666666666, 276.05836741932353 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13315257918545481662&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "rygVV205KQ", "title": "Visual Imitation with a Minimal Adversary", "track": "main", "status": "Reject", "tldr": "Imitation from pixels, with sparse or no reward, using off-policy RL and a tiny adversarially-learned reward function.", "abstract": "High-dimensional sparse reward tasks present major challenges for reinforcement learning agents. In this work we use imitation learning to address two of these challenges: how to learn a useful representation of the world e.g. from pixels, and how to explore efficiently given the rarity of a reward signal? We show that adversarial imitation can work well even in this high dimensional observation space. Surprisingly the adversary itself, acting as the learned reward function, can be tiny, comprising as few as 128 parameters, and can be easily trained using the most basic GAN formulation. Our approach removes limitations present in most contemporary imitation approaches: requiring no demonstrator actions (only video), no special initial conditions or warm starts, and no explicit tracking of any single demo. The proposed agent can solve a challenging robot manipulation task of block stacking from only video demonstrations and sparse reward, in which the non-imitating agents fail to learn completely. Furthermore, our agent learns much faster than competing approaches that depend on hand-crafted, staged dense reward functions, and also better compared to standard GAIL baselines. 
Finally, we develop a new adversarial goal recognizer that in some cases allows the agent to learn stacking without any task reward, purely from imitation.", "keywords": "imitation;from pixels;adversarial", "primary_area": "", "supplementary_material": "", "author": "Scott Reed;Yusuf Aytar;Ziyu Wang;Tom Paine;A\u00e4ron van den Oord;Tobias Pfaff;Sergio Gomez;Alexander Novikov;David Budden;Oriol Vinyals", "authorids": "reedscot@google.com;yusufaytar@google.com;ziyu@google.com;tpaine@google.com;avdnoord@google.com;tpfaff@google.com;sergomez@google.com;anovikov@google.com;budden@google.com;vinyals@google.com", "gender": ";;;;;;;;;", "homepage": ";;;;;;;;;", "dblp": ";;;;;;;;;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": ";;;;;;;;;", "aff": ";;;;;;;;;", "aff_domain": ";;;;;;;;;", "position": ";;;;;;;;;", "bibtex": "@misc{\nreed2019visual,\ntitle={Visual Imitation with a Minimal Adversary},\nauthor={Scott Reed and Yusuf Aytar and Ziyu Wang and Tom Paine and A\u00e4ron van den Oord and Tobias Pfaff and Sergio Gomez and Alexander Novikov and David Budden and Oriol Vinyals},\nyear={2019},\nurl={https://openreview.net/forum?id=rygVV205KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rygVV205KQ", "pdf_size": 0, "rating": "3;5;6", "confidence": "3;4;4", "wc_review": "150;378;918", "wc_reply_reviewers": "0;0;263", "wc_reply_authors": "334;343;597", "reply_reviewers": "0;0;1", "reply_authors": "1;1;2", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 482.0, 322.0434753259255 ], "wc_reply_reviewers_avg": [ 87.66666666666667, 123.97938896804132 ], "wc_reply_authors_avg": [ 424.6666666666667, 121.91344835123354 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 10, 0 ], "corr_rating_confidence": 0.9449111825230683, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2907692414667961786&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rygZJ2RcF7", "title": "Out-of-Sample Extrapolation with Neuron Editing", "track": "main", "status": "Reject", "tldr": "We reframe the generation problem as one of editing existing points, and as a result extrapolate better than traditional GANs.", "abstract": "While neural networks can be trained to map from one specific dataset to another, they usually do not learn a generalized transformation that can extrapolate accurately outside the space of training. For instance, a generative adversarial network (GAN) exclusively trained to transform images of cars from light to dark might not have the same effect on images of horses. This is because neural networks are good at generation within the manifold of the data that they are trained on. However, generating new samples outside of the manifold or extrapolating \"out-of-sample\" is a much harder problem that has been less well studied. To address this, we introduce a technique called neuron editing that learns how neurons encode an edit for a particular transformation in a latent space. We use an autoencoder to decompose the variation within the dataset into activations of different neurons and generate transformed data by defining an editing transformation on those neurons. 
By performing the transformation in a latent trained space, we encode fairly complex and non-linear transformations to the data with much simpler distribution shifts to the neuron's activations. We showcase our technique on image domain/style transfer and two biological applications: removal of batch artifacts representing unwanted noise and modeling the effect of drug treatments to predict synergy between drugs.", "keywords": "generative adversarial networks;computational biology;generating;generation;extrapolation;out-of-sample;neural network inference", "primary_area": "", "supplementary_material": "", "author": "Matthew Amodio;David van Dijk;Ruth Montgomery;Guy Wolf;Smita Krishnaswamy", "authorids": "matthew.amodio@yale.edu;davidvandijk@gmail.com;ruth.montgomery@yale.edu;guy.wolf@yale.edu;smita.krishnaswamy@yale.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\namodio2019outofsample,\ntitle={Out-of-Sample Extrapolation with Neuron Editing},\nauthor={Matthew Amodio and David van Dijk and Ruth Montgomery and Guy Wolf and Smita Krishnaswamy},\nyear={2019},\nurl={https://openreview.net/forum?id=rygZJ2RcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rygZJ2RcF7", "pdf_size": 0, "rating": "5;5;6", "confidence": "3;3;4", "wc_review": "276;464;302", "wc_reply_reviewers": "106;0;0", "wc_reply_authors": "718;792;584", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 347.3333333333333, 83.17585119625902 ], "wc_reply_reviewers_avg": [ 35.333333333333336, 49.968879203849355 ], "wc_reply_authors_avg": [ 698.0, 86.08522908528889 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=600732414069935156&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3 }, { "title": "Differentiable Learning-to-Normalize via Switchable Normalization", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1116", "id": "ryggIs0cYQ", "author_site": "Ping Luo, jiamin ren, zhanglin peng, Ruimao Zhang, Jingyu Li", "tldr": "", "abstract": "We address a learning-to-normalize problem by proposing Switchable Normalization (SN), which learns to select different normalizers for different normalization layers of a deep neural network. SN employs three distinct scopes to compute statistics (means and variances) including a channel, a layer, and a minibatch. SN switches between them by learning their importance weights in an end-to-end manner. It has several good properties. First, it adapts to various network architectures and tasks (see Fig.1). Second, it is robust to a wide range of batch sizes, maintaining high performance even when small minibatch is presented (e.g. 2 images/GPU). Third, SN does not have sensitive hyper-parameter, unlike group normalization that searches the number of groups as a hyper-parameter. Without bells and whistles, SN outperforms its counterparts on various challenging benchmarks, such as ImageNet, COCO, CityScapes, ADE20K, and Kinetics. 
Analyses of SN are also presented. We hope SN will help ease the usage and understand the normalization techniques in deep learning. The code of SN will be released.", "keywords": "normalization;deep learning;CNN;computer vision", "primary_area": "", "supplementary_material": "", "author": "Ping Luo;Jiamin Ren;Zhanglin Peng;Ruimao Zhang;Jingyu Li", "authorids": "pluo@ie.cuhk.edu.hk;renjiamin@sensetime.com;pengzhanglin@sensetime.com;zhangruimao@sensetime.com;lijingyu@sensetime.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nluo2018differentiable,\ntitle={Differentiable Learning-to-Normalize via Switchable Normalization},\nauthor={Ping Luo and Jiamin Ren and Zhanglin Peng and Ruimao Zhang and Jingyu Li},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryggIs0cYQ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=ryggIs0cYQ)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "pdf_size": 0, "rating": "7;7;7", "confidence": "5;4;3", "wc_review": "398;242;112", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "142;354;0", "reply_reviewers": "0;0;0", "reply_authors": "1;1;0", "rating_avg": [ 7.0, 0.0 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 250.66666666666666, 116.91972554801102 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 165.33333333333334, 145.45866155792243 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.6666666666666666, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 262, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16272948606490934413&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ryggIs0cYQ", "pdf": "https://openreview.net/pdf?id=ryggIs0cYQ", "email": ";;;;", "author_num": 5 }, { "id": "rygjN3C9F7", "title": "The Variational Deficiency Bottleneck", "track": "main", "status": "Reject", "tldr": "We develop a new bottleneck method based on channel deficiency.", "abstract": "We introduce a bottleneck method for learning data representations based on channel deficiency, rather than the more traditional information sufficiency. A variational upper bound allows us to implement this method efficiently. The bound itself is bounded above by the variational information bottleneck objective, and the two methods coincide in the regime of single-shot Monte Carlo approximations. The notion of deficiency provides a principled way of approximating complicated channels by relatively simpler ones. The deficiency of one channel w.r.t. another has an operational interpretation in terms of the optimal risk gap of decision problems, capturing classification as a special case. Unsupervised generalizations are possible, such as the deficiency autoencoder, which can also be formulated in a variational form. Experiments demonstrate that the deficiency bottleneck can provide advantages in terms of minimal sufficiency as measured by information bottleneck curves, while retaining a good test performance in classification and reconstruction tasks. 
", "keywords": "Variational Information Bottleneck;Blackwell Sufficiency;Le Cam Deficiency;Information Channel", "primary_area": "", "supplementary_material": "", "author": "Pradeep Kr. Banerjee;Guido Montufar", "authorids": "pradeep@mis.mpg.de;montufar@math.ucla.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nbanerjee2019the,\ntitle={The Variational Deficiency Bottleneck},\nauthor={Pradeep Kr. Banerjee and Guido Montufar},\nyear={2019},\nurl={https://openreview.net/forum?id=rygjN3C9F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rygjN3C9F7", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;2;2", "wc_review": "300;154;184", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "634;160;192", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 3.0, 1.4142135623730951 ], "wc_review_avg": [ 212.66666666666666, 62.95677529508286 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 328.6666666666667, 216.29814814022077 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3988396671472606932&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "title": "SOM-VAE: Interpretable Discrete Representation Learning on Time Series", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/729", "id": "rygjcsR9Y7", "author_site": "Vincent Fortuin, Matthias H\u00fcser, Francesco Locatello, Heiko Strathmann, Gunnar R\u00e4tsch", "tldr": "We present a method to learn interpretable representations on time series using ideas from variational autoencoders, self-organizing maps and probabilistic models.", "abstract": "High-dimensional time series are common in many domains. Since human cognition is not optimized to work well in high-dimensional spaces, these areas could benefit from interpretable low-dimensional representations. However, most representation learning algorithms for time series data are difficult to interpret. This is due to non-intuitive mappings from data features to salient properties of the representation and non-smoothness over time.\nTo address this problem, we propose a new representation learning framework building on ideas from interpretable discrete dimensionality reduction and deep generative modeling. This framework allows us to learn discrete representations of time series, which give rise to smooth and interpretable embeddings with superior clustering performance. We introduce a new way to overcome the non-differentiability in discrete representation learning and present a gradient-based version of the traditional self-organizing map algorithm that is more performant than the original. 
Furthermore, to allow for a probabilistic interpretation of our method, we integrate a Markov model in the representation space.\nThis model uncovers the temporal transition structure, improves clustering performance even further and provides additional explanatory insights as well as a natural representation of uncertainty.\nWe evaluate our model in terms of clustering performance and interpretability on static (Fashion-)MNIST data, a time series of linearly interpolated (Fashion-)MNIST images, a chaotic Lorenz attractor system with two macro states, as well as on a challenging real world medical time series application on the eICU data set. Our learned representations compare favorably with competitor methods and facilitate downstream tasks on the real world data.", "keywords": "deep learning;self-organizing map;variational autoencoder;representation learning;time series;machine learning;interpretability", "primary_area": "", "supplementary_material": "", "author": "Vincent Fortuin;Matthias H\u00fcser;Francesco Locatello;Heiko Strathmann;Gunnar R\u00e4tsch", "authorids": "fortuin@inf.ethz.ch;mhueser@inf.ethz.ch;locatelf@inf.ethz.ch;heiko.strathmann@gmail.com;raetsch@inf.ethz.ch", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nfortuin2018deep,\ntitle={Deep Self-Organization: Interpretable Discrete Representation Learning on Time Series},\nauthor={Vincent Fortuin and Matthias H\u00fcser and Francesco Locatello and Heiko Strathmann and Gunnar R\u00e4tsch},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rygjcsR9Y7},\n}", "github": "[![github](/images/github_icon.svg) ratschlab/SOM-VAE](https://github.com/ratschlab/SOM-VAE) + [![Papers with Code](/images/pwc_icon.svg) 5 community implementations](https://paperswithcode.com/paper/?openreview=rygjcsR9Y7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;6;9", "confidence": "4;2;4", "wc_review": "645;210;232", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 362.3333333333333, 200.07720731979663 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.5000000000000001, "gs_citation": 205, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9836294528958312436&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=rygjcsR9Y7", "pdf": "https://openreview.net/pdf?id=rygjcsR9Y7", "email": ";;;;", "author_num": 5 }, { "id": "rygk9oA9Ym", "title": "3D-RelNet: Joint Object and Relational Network for 3D Prediction", "track": "main", "status": "Reject", "tldr": "We reason about relative spatial relationships between the objects in a scene to produce better 3D predictions", "abstract": "We propose an approach to predict the 3D shape and pose for the objects present in a scene. Existing learning based methods that pursue this goal make independent predictions per object, and do not leverage the relationships amongst them. 
We argue that reasoning about these relationships is crucial, and present an approach to incorporate these in a 3D prediction framework. In addition to independent per-object predictions, we predict pairwise relations in the form of relative 3D pose, and demonstrate that these can be easily incorporated to improve object level estimates. We report performance across different datasets (SUNCG, NYUv2), and show that our approach significantly improves over independent prediction approaches while also outperforming alternate implicit reasoning methods.", "keywords": "3D Reconstruction;3D Scene Understanding;Relative Prediction", "primary_area": "", "supplementary_material": "", "author": "Nilesh Kulkarni;Ishan Misra;Shubham Tulsiani;Abhinav Gupta", "authorids": "nileshk@cs.cmu.edu;ishan@cmu.edu;shubhtuls@fb.com;abhinavg@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nkulkarni2019drelnet,\ntitle={3D-RelNet: Joint Object and Relational Network for 3D Prediction},\nauthor={Nilesh Kulkarni and Ishan Misra and Shubham Tulsiani and Abhinav Gupta},\nyear={2019},\nurl={https://openreview.net/forum?id=rygk9oA9Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rygk9oA9Ym", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;5;4", "wc_review": "1005;377;103", "wc_reply_reviewers": "0;135;0", "wc_reply_authors": "765;166;130", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 495.0, 377.5747166676639 ], "wc_reply_reviewers_avg": [ 45.0, 63.63961030678928 ], "wc_reply_authors_avg": [ 353.6666666666667, 291.22766962559643 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6734839568478501575&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10 }, { "title": "Hierarchical Generative Modeling for Controllable Speech Synthesis", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/754", "id": "rygkk305YQ", "author_site": "Wei-Ning Hsu, Yu Zhang, Ron Weiss, Heiga Zen, Yonghui Wu, Yuxuan Wang, Yuan Cao, Ye Jia, Zhifeng Chen, Jonathan Shen, Patrick Nguyen, Ruoming Pang", "tldr": "Building a TTS model with Gaussian Mixture VAEs enables fine-grained control of speaking style, noise condition, and more.", "abstract": "This paper proposes a neural end-to-end text-to-speech (TTS) model which can control latent attributes in the generated speech that are rarely annotated in the training data, such as speaking style, accent, background noise, and recording conditions. The model is formulated as a conditional generative model with two levels of hierarchical latent variables. The first level is a categorical variable, which represents attribute groups (e.g. clean/noisy) and provides interpretability. The second level, conditioned on the first, is a multivariate Gaussian variable, which characterizes specific attribute configurations (e.g. noise level, speaking rate) and enables disentangled fine-grained control over these attributes. 
This amounts to using a Gaussian mixture model (GMM) for the latent distribution. Extensive evaluation demonstrates its ability to control the aforementioned attributes. In particular, it is capable of consistently synthesizing high-quality clean speech regardless of the quality of the training data for the target speaker.", "keywords": "speech synthesis;representation learning;deep generative model;sequence-to-sequence model", "primary_area": "", "supplementary_material": "", "author": "Wei-Ning Hsu;Yu Zhang;Ron J. Weiss;Heiga Zen;Yonghui Wu;Yuxuan Wang;Yuan Cao;Ye Jia;Zhifeng Chen;Jonathan Shen;Patrick Nguyen;Ruoming Pang", "authorids": "wnhsu@mit.edu;ngyuzh@google.com;ronw@google.com;heigazen@google.com;yonghui@google.com;logpie@gmail.com;yuancao@google.com;jiaye@google.com;zhifengc@google.com;jonathanasdf@google.com;drpng@google.com;rpang@google.com", "gender": ";;;;;;;;;;;", "homepage": ";;;;;;;;;;;", "dblp": ";;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;", "orcid": ";;;;;;;;;;;", "linkedin": ";;;;;;;;;;;", "or_profile": ";;;;;;;;;;;", "aff": ";;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;", "position": ";;;;;;;;;;;", "bibtex": "@inproceedings{\nhsu2018hierarchical,\ntitle={Hierarchical Generative Modeling for Controllable Speech Synthesis},\nauthor={Wei-Ning Hsu and Yu Zhang and Ron Weiss and Heiga Zen and Yonghui Wu and Yuan Cao and Yuxuan Wang},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rygkk305YQ},\n}", "github": "[![Papers with Code](/images/pwc_icon.svg) 2 community implementations](https://paperswithcode.com/paper/?openreview=rygkk305YQ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;6;8", "confidence": "4;5;4", "wc_review": "137;199;252", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "2063;1541;876", "reply_reviewers": "0;0;0", "reply_authors": "4;3;2", "rating_avg": [ 6.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 196.0, 46.99645376692444 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 1493.3333333333333, 485.76148696888504 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 3.0, 0.816496580927726 ], "replies_avg": [ 29, 0 ], "authors#_avg": [ 12, 0 ], "corr_rating_confidence": -0.18898223650461357, "gs_citation": 297, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7736857481159574881&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=rygkk305YQ", "pdf": "https://openreview.net/pdf?id=rygkk305YQ", "email": ";;;;;;;;;;;", "author_num": 12 }, { "id": "rygnfn0qF7", "title": "Language Model Pre-training for Hierarchical Document Representations", "track": "main", "status": "Reject", "tldr": "", "abstract": "Hierarchical neural architectures can efficiently capture long-distance dependencies and have been used for many document-level tasks such as summarization, document segmentation, and fine-grained sentiment analysis. However, effective usage of such a large context can difficult to learn, especially in the case where there is limited labeled data available.\nBuilding on the recent success of language model pretraining methods for learning flat representations of text, we propose algorithms for pre-training hierarchical document representations from unlabeled data. 
Unlike prior work, which has focused on pre-training contextual token representations or context-independent sentence/paragraph representations, our hierarchical document representations include fixed-length sentence/paragraph representations which integrate contextual information from the entire documents. Experiments on document segmentation, document-level question answering, and extractive document summarization demonstrate the effectiveness of the proposed pre-training algorithms.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Ming-Wei Chang;Kristina Toutanova;Kenton Lee;Jacob Devlin", "authorids": "mingweichang@google.com;kristout@google.com;kentonl@google.com;jacobdevlin@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nchang2019language,\ntitle={Language Model Pre-training for Hierarchical Document Representations},\nauthor={Ming-Wei Chang and Kristina Toutanova and Kenton Lee and Jacob Devlin},\nyear={2019},\nurl={https://openreview.net/forum?id=rygnfn0qF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rygnfn0qF7", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;4", "wc_review": "262;162;255", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "467;221;406", "reply_reviewers": "0;0;0", "reply_authors": "2;2;2", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 226.33333333333334, 45.580210130664774 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 364.6666666666667, 104.5955172185798 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6642822181177704358&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "id": "rygo9iR9F7", "title": "Progressive Weight Pruning Of Deep Neural Networks Using ADMM", "track": "main", "status": "Reject", "tldr": "We implement a DNN weight pruning approach that achieves the highest pruning rates.", "abstract": "Deep neural networks (DNNs) although achieving human-level performance in many domains, have very large model size that hinders their broader applications on edge computing devices. Extensive research work have been conducted on DNN model compression or pruning. However, most of the previous work took heuristic approaches. This work proposes a progressive weight pruning approach based on ADMM (Alternating Direction Method of Multipliers), a powerful technique to deal with non-convex optimization problems with potentially combinatorial constraints. Motivated by dynamic programming, the proposed method reaches extremely high pruning rate by using partial prunings with moderate pruning rates. Therefore, it resolves the accuracy degradation and long convergence time problems when pursuing extremely high pruning ratios. It achieves up to 34\u00d7 pruning rate for ImageNet dataset and 167\u00d7 pruning rate for MNIST dataset, significantly higher than those reached by the literature work. Under the same number of epochs, the proposed method also achieves faster convergence and higher compression rates. 
The codes and pruned DNN models are released in the anonymous link bit.ly/2zxdlss.", "keywords": "deep learning;model compression;optimization;ADMM;weight pruning", "primary_area": "", "supplementary_material": "", "author": "Shaokai Ye;Tianyun Zhang;Kaiqi Zhang;Jiayu Li;Kaidi Xu;Yunfei Yang;Fuxun Yu;Jian Tang;Makan Fardad;Sijia Liu;Xiang Chen;Xue Lin;Yanzhi Wang", "authorids": "sye106@syr.edu;tzhan120@syr.edu;kzhang17@syr.edu;jli221@syr.edu;xu.kaid@husky.neu.edu;yunfei.yang717@gmail.com;fyu@gmu.edu;jtang02@syr.edu;makan@syr.edu;sijia.liu@ibm.com;xchen26@gmu.edu;xue.lin@northeastern.edu;yanz.wang@northeastern.edu", "gender": ";;;;;;;;;;;;", "homepage": ";;;;;;;;;;;;", "dblp": ";;;;;;;;;;;;", "google_scholar": ";;;;;;;;;;;;", "orcid": ";;;;;;;;;;;;", "linkedin": ";;;;;;;;;;;;", "or_profile": ";;;;;;;;;;;;", "aff": ";;;;;;;;;;;;", "aff_domain": ";;;;;;;;;;;;", "position": ";;;;;;;;;;;;", "bibtex": "@misc{\nye2019progressive,\ntitle={Progressive Weight Pruning Of Deep Neural Networks Using {ADMM}},\nauthor={Shaokai Ye and Tianyun Zhang and Kaiqi Zhang and Jiayu Li and Kaidi Xu and Yunfei Yang and Fuxun Yu and Jian Tang and Makan Fardad and Sijia Liu and Xiang Chen and Xue Lin and Yanzhi Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=rygo9iR9F7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rygo9iR9F7", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "wc_review": "268;602;125", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "443;646;312", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 331.6666666666667, 199.87051363876117 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 467.0, 137.40693820425034 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 13, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=440498708647542751&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6 }, { "id": "rygp3iRcF7", "title": "Area Attention", "track": "main", "status": "Reject", "tldr": "The paper presents a novel approach for attentional mechanisms that can benefit a range of tasks such as machine translation and image captioning.", "abstract": "Existing attention mechanisms, are mostly item-based in that a model is trained to attend to individual items in a collection (the memory) where each item has a predefined, fixed granularity, e.g., a character or a word. Intuitively, an area in the memory consisting of multiple items can be worth attending to as a whole. We propose area attention: a way to attend to an area of the memory, where each area contains a group of items that are either spatially adjacent when the memory has a 2-dimensional structure, such as images, or temporally adjacent for 1-dimensional memory, such as natural language sentences. Importantly, the size of an area, i.e., the number of items in an area or the level of aggregation, is dynamically determined via learning, which can vary depending on the learned coherence of the adjacent items. By giving the model the option to attend to an area of items, instead of only individual items, a model can attend to information with varying granularity. 
Area attention can work along multi-head attention for attending to multiple areas in the memory. We evaluate area attention on two tasks: neural machine translation (both character and token-level) and image captioning, and improve upon strong (state-of-the-art) baselines in all the cases. These improvements are obtainable with a basic form of area attention that is parameter free. In addition to proposing the novel concept of area attention, we contribute an efficient way for computing it by leveraging the technique of summed area tables.", "keywords": "Deep Learning;attentional mechanisms;neural machine translation;image captioning", "primary_area": "", "supplementary_material": "", "author": "Yang Li;Lukasz Kaiser;Samy Bengio;Si Si", "authorids": "liyang@google.com;lukaszkaiser@google.com;bengio@google.com;sisidaisy@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nli2019area,\ntitle={Area Attention},\nauthor={Yang Li and Lukasz Kaiser and Samy Bengio and Si Si},\nyear={2019},\nurl={https://openreview.net/forum?id=rygp3iRcF7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rygp3iRcF7", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;5;4", "wc_review": "243;417;394", "wc_reply_reviewers": "59;197;53", "wc_reply_authors": "390;947;322", "reply_reviewers": "1;1;1", "reply_authors": "2;2;2", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 351.3333333333333, 77.17656523985906 ], "wc_reply_reviewers_avg": [ 103.0, 66.51315659326356 ], "wc_reply_authors_avg": [ 553.0, 279.9797611733153 ], "reply_reviewers_avg": [ 1.0, 0.0 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 31, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8914327285913451682&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15 }, { "title": "Learning Factorized Multimodal Representations", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/925", "id": "rygqqsA9KX", "author_site": "Yao-Hung Hubert Tsai, Paul Pu Liang, Amir Ali Bagherzade, Louis-Philippe Morency, Ruslan Salakhutdinov", "tldr": "We propose a model to learn factorized multimodal representations that are discriminative, generative, and interpretable.", "abstract": "Learning multimodal representations is a fundamentally complex research problem due to the presence of multiple heterogeneous sources of information. Although the presence of multiple modalities provides additional valuable information, there are two key challenges to address when learning from multimodal data: 1) models must learn the complex intra-modal and cross-modal interactions for prediction and 2) models must be robust to unexpected missing or noisy modalities during testing. In this paper, we propose to optimize for a joint generative-discriminative objective across multimodal data and labels. We introduce a model that factorizes representations into two sets of independent factors: multimodal discriminative and modality-specific generative factors. 
Multimodal discriminative factors are shared across all modalities and contain joint multimodal features required for discriminative tasks such as sentiment prediction. Modality-specific generative factors are unique for each modality and contain the information required for generating data. Experimental results show that our model is able to learn meaningful multimodal representations that achieve state-of-the-art or competitive performance on six multimodal datasets. Our model demonstrates flexible generative capabilities by conditioning on independent factors and can reconstruct missing modalities without significantly impacting performance. Lastly, we interpret our factorized representations to understand the interactions that influence multimodal learning.", "keywords": "multimodal learning;representation learning", "primary_area": "", "supplementary_material": "", "author": "Yao-Hung Hubert Tsai;Paul Pu Liang;Amir Zadeh;Louis-Philippe Morency;Ruslan Salakhutdinov", "authorids": "yaohungt@cs.cmu.edu;pliang@cs.cmu.edu;abagherz@cs.cmu.edu;morency@cs.cmu.edu;rsalakhu@cs.cmu.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\ntsai2018learning,\ntitle={Learning Factorized Multimodal Representations},\nauthor={Yao-Hung Hubert Tsai and Paul Pu Liang and Amir Zadeh and Louis-Philippe Morency and Ruslan Salakhutdinov},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rygqqsA9KX},\n}", "github": "[![github](/images/github_icon.svg) pliang279/factorized](https://github.com/pliang279/factorized) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rygqqsA9KX)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "3;2;3", "wc_review": "179;167;289", "wc_reply_reviewers": "5;0;7", "wc_reply_authors": "478;226;359", "reply_reviewers": "1;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 2.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 211.66666666666666, 54.901932773102104 ], "wc_reply_reviewers_avg": [ 4.0, 2.943920288775949 ], "wc_reply_authors_avg": [ 354.3333333333333, 102.93147666719295 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 561, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2626823666054989533&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rygqqsA9KX", "pdf": "https://openreview.net/pdf?id=rygqqsA9KX", "email": ";;;;", "author_num": 5 }, { "title": "Composing Complex Skills by Learning Transition Policies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/792", "id": "rygrBhC5tQ", "author_site": "Youngwoon Lee, Shao-Hua Sun, Sriram Somasundaram, Edward S Hu, Joseph Lim", "tldr": "Transition policies enable agents to compose complex skills by smoothly connecting previously acquired primitive skills.", "abstract": "Humans acquire complex skills by exploiting previously learned skills and making transitions between them. 
To empower machines with this ability, we propose a method that can learn transition policies which effectively connect primitive skills to perform sequential tasks without handcrafted rewards. To efficiently train our transition policies, we introduce proximity predictors which induce rewards gauging proximity to suitable initial states for the next skill. The proposed method is evaluated on a set of complex continuous control tasks in bipedal locomotion and robotic arm manipulation which traditional policy gradient methods struggle at. We demonstrate that transition policies enable us to effectively compose complex skills with existing primitive skills. The proposed induced rewards computed using the proximity predictor further improve training efficiency by providing more dense information than the sparse rewards from the environments. We make our environments, primitive skills, and code public for further research at https://youngwoon.github.io/transition .", "keywords": "reinforcement learning;hierarchical reinforcement learning;continuous control;modular framework", "primary_area": "", "supplementary_material": "", "author": "Youngwoon Lee*;Shao-Hua Sun*;Sriram Somasundaram;Edward S. Hu;Joseph J. Lim", "authorids": "lee504@usc.edu;shaohuas@usc.edu;sriramso@usc.edu;hues@usc.edu;limjj@usc.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nlee2018composing,\ntitle={Composing Complex Skills by Learning Transition Policies with Proximity Reward Induction},\nauthor={Youngwoon Lee and Shao-Hua Sun and Sriram Somasundaram and Edward Hu and Joseph J. Lim},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rygrBhC5tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;9", "confidence": "4;4;4", "wc_review": "152;173;170", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "42;192;536", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 7.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 165.0, 9.273618495495704 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 256.6666666666667, 206.79350946186767 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 110, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17120904652529141450&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=rygrBhC5tQ", "pdf": "https://openreview.net/pdf?id=rygrBhC5tQ", "email": ";;;;", "author_num": 5 }, { "id": "rygunsAqYQ", "title": "Implicit Maximum Likelihood Estimation", "track": "main", "status": "Reject", "tldr": "We develop a new likelihood-free parameter estimation method that is equivalent to maximum likelihood under some conditions", "abstract": "Implicit probabilistic models are models defined naturally in terms of a sampling procedure and often induces a likelihood function that cannot be expressed explicitly. 
We develop a simple method for estimating parameters in implicit models that does not require knowledge of the form of the likelihood function or any derived quantities, but can be shown to be equivalent to maximizing likelihood under some conditions. Our result holds in the non-asymptotic parametric setting, where both the capacity of the model and the number of data examples are finite. We also demonstrate encouraging experimental results. ", "keywords": "likelihood-free inference;implicit probabilistic models", "primary_area": "", "supplementary_material": "", "author": "Ke Li;Jitendra Malik", "authorids": "ke.li@eecs.berkeley.edu;malik@eecs.berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nli2019implicit,\ntitle={Implicit Maximum Likelihood Estimation},\nauthor={Ke Li and Jitendra Malik},\nyear={2019},\nurl={https://openreview.net/forum?id=rygunsAqYQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rygunsAqYQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;4;4", "wc_review": "658;766;511", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 645.0, 104.5083728703112 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3982368896707485286&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 9 }, { "id": "rygvZ2RcYm", "title": "Knowledge Representation for Reinforcement Learning using General Value Functions", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Reinforcement learning (RL) is a very powerful approach for learning good control strategies from data. Value functions are a key concept for reinforcement learning, as they guide the search for good policies. A lot of effort has been devoted to designing and improving algorithms for learning value functions. In this paper, we argue that value functions are also a very natural way of providing a framework for knowledge representation for reinforcement learning agents. We show that generalized value functions provide a unifying lens for many algorithms, including policy gradient, successor features, option models and policies, and other forms of hierarchical reinforcement learning. 
We also demonstrate the potential of this representation to provide new, useful algorithms.", "keywords": "Reinforcement Learning;General Value Functions;Policy Gradient;Hierarchical Reinforcement Learning;Successor Features", "primary_area": "", "supplementary_material": "", "author": "Gheorghe Comanici;Doina Precup;Andre Barreto;Daniel Kenji Toyama;Eser Ayg\u00fcn;Philippe Hamel;Sasha Vezhnevets;Shaobo Hou;Shibl Mourad", "authorids": "gcomanici@google.com;doinap@google.com;andrebarreto@google.com;kenjitoyama@google.com;eser@google.com;hamelphi@google.com;vezhnick@google.com;shaobohou@google.com;shibl@google.com", "gender": ";;;;;;;;", "homepage": ";;;;;;;;", "dblp": ";;;;;;;;", "google_scholar": ";;;;;;;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": ";;;;;;;;", "aff": ";;;;;;;;", "aff_domain": ";;;;;;;;", "position": ";;;;;;;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rygvZ2RcYm", "pdf_size": 0, "rating": "4;6;7", "confidence": "4;3;3", "wc_review": "1116;299;127", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 5.666666666666667, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 514.0, 431.43095237438246 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 9, 0 ], "corr_rating_confidence": -0.9449111825230683, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10894207029370142817&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rygypo0qtm", "title": "Show, Attend and Translate: Unsupervised Image Translation with Self-Regularization and Attention", "track": "main", "status": "Withdraw", "tldr": "We propose a simple generative model for unsupervised image translation and saliency detection.", "abstract": "Image translation between two domains is a class of problems aiming to learn mapping from an input image in the source domain to an output image in the target domain. It has been applied to numerous applications, such as data augmentation, domain adaptation, and unsupervised training. When paired training data is not accessible, image translation becomes an ill-posed problem. We constrain the problem with the assumption that the translated image needs to be perceptually similar to the original image and also appears to be drawn from the new domain, and propose a simple yet effective image translation model consisting of a single generator trained with a self-regularization term and an adversarial term. We further notice that existing image translation techniques are agnostic to the subjects of interest and often introduce unwanted changes or artifacts to the input. Thus we propose to add an attention module to predict an attention map to guide the image translation process. The module learns to attend to key parts of the image while keeping everything else unaltered, essentially avoiding undesired artifacts or changes. The predicted attention map also opens door to applications such as unsupervised segmentation and saliency detection. 
Extensive experiments and evaluations show that our model while being simpler, achieves significantly better performance than existing image translation methods.", "keywords": "image translation;domain adaptation;saliency detection", "primary_area": "", "supplementary_material": "", "author": "Chao Yang", "authorids": "harryyang.hk@gmail.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "", "github": "", "project": "", "reviewers": "", "site": "https://openreview.net/forum?id=rygypo0qtm", "pdf_size": 0, "rating": "", "confidence": "", "wc_review": "", "wc_reply_reviewers": "", "wc_reply_authors": "", "reply_reviewers": "", "reply_authors": "", "rating_avg": [ 0, 0 ], "confidence_avg": [ 0, 0 ], "wc_review_avg": [ 0, 0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 1, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8779342005524101434&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "title": "Human-level Protein Localization with Convolutional Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1108", "id": "ryl5khRcKm", "author_site": "Elisabeth Rumetshofer, Markus Hofmarcher, Clemens R\u00f6hrl, Sepp Hochreiter, G\u00fcnter Klambauer", "tldr": "", "abstract": "Localizing a specific protein in a human cell is essential for understanding cellular functions and biological processes of underlying diseases. A promising, low-cost,and time-efficient biotechnology for localizing proteins is high-throughput fluorescence microscopy imaging (HTI). This imaging technique stains the protein of interest in a cell with fluorescent antibodies and subsequently takes a microscopic image. Together with images of other stained proteins or cell organelles and the annotation by the Human Protein Atlas project, these images provide a rich source of information on the protein location which can be utilized by computational methods. It is yet unclear how precise such methods are and whether they can compete with human experts. We here focus on deep learning image analysis methods and, in particular, on Convolutional Neural Networks (CNNs)since they showed overwhelming success across different imaging tasks. We pro-pose a novel CNN architecture \u201cGapNet-PL\u201d that has been designed to tackle the characteristics of HTI data and uses global averages of filters at different abstraction levels. We present the largest comparison of CNN architectures including GapNet-PL for protein localization in HTI images of human cells. GapNet-PL outperforms all other competing methods and reaches close to perfect localization in all 13 tasks with an average AUC of 98% and F1 score of 78%. On a separate test set the performance of GapNet-PL was compared with three human experts and 25 scholars. 
GapNet-PL achieved an accuracy of 91%, significantly (p-value 1.1e\u22126) outperforming the best human expert with an accuracy of 72%.", "keywords": "Convolutional Neural Networks;High-resolution images;Multiple-Instance Learning;Microscopy Imaging;Protein Localization", "primary_area": "", "supplementary_material": "", "author": "Elisabeth Rumetshofer;Markus Hofmarcher;Clemens R\u00f6hrl;Sepp Hochreiter;G\u00fcnter Klambauer", "authorids": "rumetshofer@ml.jku.at;hofmarcher@ml.jku.at;clemens.roehrl@meduniwien.ac.at;hochreit@ml.jku.at;klambauer@ml.jku.at", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@inproceedings{\nrumetshofer2018humanlevel,\ntitle={Human-level Protein Localization with Convolutional Neural Networks},\nauthor={Elisabeth Rumetshofer and Markus Hofmarcher and Clemens R\u00f6hrl and Sepp Hochreiter and G\u00fcnter Klambauer},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryl5khRcKm},\n}", "github": "[![github](/images/github_icon.svg) ml-jku/gapnet-pl](https://github.com/ml-jku/gapnet-pl)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "4;5;8", "confidence": "4;3;4", "wc_review": "168;507;549", "wc_reply_reviewers": "225;0;0", "wc_reply_authors": "584;553;483", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.666666666666667, 1.699673171197595 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 408.0, 170.56963387426262 ], "wc_reply_reviewers_avg": [ 75.0, 106.06601717798213 ], "wc_reply_authors_avg": [ 540.0, 42.245315322135625 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.2773500981126145, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9993156504734443423&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "openreview": "https://openreview.net/forum?id=ryl5khRcKm", "pdf": "https://openreview.net/pdf?id=ryl5khRcKm", "email": ";;;;", "author_num": 5 }, { "title": "Environment Probing Interaction Policies", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/915", "id": "ryl8-3AcFX", "author_site": "Wenxuan Zhou, Lerrel Pinto, Abhinav Gupta", "tldr": "", "abstract": "A key challenge in reinforcement learning (RL) is environment generalization: a policy trained to solve a task in one environment often fails to solve the same task in a slightly different test environment. A common approach to improve inter-environment transfer is to learn policies that are invariant to the distribution of testing environments. However, we argue that instead of being invariant, the policy should identify the specific nuances of an environment and exploit them to achieve better performance. In this work, we propose the \u201cEnvironment-Probing\u201d Interaction (EPI) policy, a policy that probes a new environment to extract an implicit understanding of that environment\u2019s behavior. Once this environment-specific information is obtained, it is used as an additional input to a task-specific policy that can now perform environment-conditioned actions to solve a task. 
To learn these EPI-policies, we present a reward function based on transition predictability. Specifically, a higher reward is given if the trajectory generated by the EPI-policy can be used to better predict transitions. We experimentally show that EPI-conditioned task-specific policies significantly outperform commonly used policy generalization methods on novel testing environments.", "keywords": "Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Wenxuan Zhou;Lerrel Pinto;Abhinav Gupta", "authorids": "wenxuanz@andrew.cmu.edu;lerrelp@andrew.cmu.edu;abhinavg@cs.cmu.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nzhou2018environment,\ntitle={Environment Probing Interaction Policies},\nauthor={Wenxuan Zhou and Lerrel Pinto and Abhinav Gupta},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryl8-3AcFX},\n}", "github": "[![github](/images/github_icon.svg) Wenxuan-Zhou/EPI](https://github.com/Wenxuan-Zhou/EPI)", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;2;3", "wc_review": "364;77;108", "wc_reply_reviewers": "45;0;0", "wc_reply_authors": "546;26;122", "reply_reviewers": "1;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.0, 0.0 ], "confidence_avg": [ 3.0, 0.816496580927726 ], "wc_review_avg": [ 183.0, 128.61052315680342 ], "wc_reply_reviewers_avg": [ 15.0, 21.213203435596427 ], "wc_reply_authors_avg": [ 231.33333333333334, 225.9282088530681 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2903789960714905866&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=ryl8-3AcFX", "pdf": "https://openreview.net/pdf?id=ryl8-3AcFX", "email": ";;", "author_num": 3 }, { "id": "rylBZ305KQ", "title": "Modeling Evolution of Language Through Time with Neural Networks", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "Language evolves over time with trends and shifts in technological, political, or cultural contexts. Capturing these variations is important to develop better language models. While recent works tackle temporal drifts by learning diachronic embeddings, we instead propose to integrate a temporal component into a recurrent language model. It takes the form of global latent variables, which are structured in time by a learned non-linear transition function. We perform experiments on three time annotated corpora. Experimental results on language modeling and classification tasks show that our model performs consistently better than temporal word embedding methods in two temporal evaluation settings: prediction and modeling. 
Moreover, we empirically show that the system is able to predict informative latent states in the future.", "keywords": "language modeling;variational inference;dynamic model;temporal data;deep learning", "primary_area": "", "supplementary_material": "", "author": "Edouard Delasalles;Sylvain Lamprier;Ludovic Denoyer", "authorids": "edouard.delasalles@lip6.fr;sylvain.lamprier@lip6.fr;ludovic.denoyer@lip6.fr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=rylBZ305KQ", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;5", "wc_review": "340;572;385", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 432.3333333333333, 100.45341651177867 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Wft6QWcm73MJ:scholar.google.com/&scioq=Modeling+Evolution+of+Language+Through+Time+with+Neural+Networks&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Lagging Inference Networks and Posterior Collapse in Variational Autoencoders", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/640", "id": "rylDfnCqF7", "author_site": "Junxian He, Daniel Spokoyny, Graham Neubig, Taylor Berg-Kirkpatrick", "tldr": "To address posterior collapse in VAEs, we propose a novel yet simple training procedure that aggressively optimizes inference network with more updates. This new training procedure mitigates posterior collapse and leads to a better VAE model. ", "abstract": "The variational autoencoder (VAE) is a popular combination of deep latent variable model and accompanying variational learning technique. By using a neural inference network to approximate the model's posterior on latent variables, VAEs efficiently parameterize a lower bound on marginal data likelihood that can be optimized directly via gradient methods. In practice, however, VAE training often results in a degenerate local optimum known as \"posterior collapse\" where the model learns to ignore the latent variable and the approximate posterior mimics the prior. In this paper, we investigate posterior collapse from the perspective of training dynamics. We find that during the initial stages of training the inference network fails to approximate the model's true posterior, which is a moving target. As a result, the model is encouraged to ignore the latent encoding and posterior collapse occurs. Based on this observation, we propose an extremely simple modification to VAE training to reduce inference lag: depending on the model's current mutual information between latent variable and observation, we aggressively optimize the inference network before performing each model update. Despite introducing neither new model components nor significant complexity over basic VAE, our approach is able to avoid the problem of collapse that has plagued a large amount of previous work. 
Empirically, our approach outperforms strong autoregressive baselines on text and image benchmarks in terms of held-out likelihood, and is competitive with more complex techniques for avoiding collapse while being substantially faster.", "keywords": "variational autoencoders;posterior collapse;generative models", "primary_area": "", "supplementary_material": "", "author": "Junxian He;Daniel Spokoyny;Graham Neubig;Taylor Berg-Kirkpatrick", "authorids": "junxianh@cs.cmu.edu;dspokoyn@cs.cmu.edu;gneubig@cs.cmu.edu;tberg@cs.cmu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nhe2018lagging,\ntitle={Lagging Inference Networks and Posterior Collapse in Variational Autoencoders},\nauthor={Junxian He and Daniel Spokoyny and Graham Neubig and Taylor Berg-Kirkpatrick},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rylDfnCqF7},\n}", "github": "[![github](/images/github_icon.svg) jxhe/vae-lagging-encoder](https://github.com/jxhe/vae-lagging-encoder) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rylDfnCqF7)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "pdf_size": 0, "rating": "7;8;8", "confidence": "4;4;4", "wc_review": "1857;473;393", "wc_reply_reviewers": "267;0;0", "wc_reply_authors": "1167;566;924", "reply_reviewers": "2;0;0", "reply_authors": "4;1;2", "rating_avg": [ 7.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 907.6666666666666, 672.0740699919583 ], "wc_reply_reviewers_avg": [ 89.0, 125.86500705120545 ], "wc_reply_authors_avg": [ 885.6666666666666, 246.84993191996 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 21, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 367, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5286759698670808442&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rylDfnCqF7", "pdf": "https://openreview.net/pdf?id=rylDfnCqF7", "email": ";;;", "author_num": 4 }, { "title": "A2BCD: Asynchronous Acceleration with Optimal Complexity", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/1125", "id": "rylIAsCqYm", "author_site": "Robert Hannah, Fei Feng, Wotao Yin", "tldr": "We prove the first-ever convergence proof of an asynchronous accelerated algorithm that attains a speedup.", "abstract": "\tIn this paper, we propose the Asynchronous Accelerated Nonuniform Randomized Block Coordinate Descent algorithm (A2BCD). We prove A2BCD converges linearly to a solution of the convex minimization problem at the same rate as NU_ACDM, so long as the maximum delay is not too large. This is the first asynchronous Nesterov-accelerated algorithm that attains any provable speedup. Moreover, we then prove that these algorithms both have optimal complexity. Asynchronous algorithms complete much faster iterations, and A2BCD has optimal complexity. Hence we observe in experiments that A2BCD is the top-performing coordinate descent algorithm, converging up to 4-5x faster than NU_ACDM on some data sets in terms of wall-clock time. 
To motivate our theory and proof techniques, we also derive and analyze a continuous-time analog of our algorithm and prove it converges at the same rate.", "keywords": "asynchronous;optimization;parallel;accelerated;complexity", "primary_area": "", "supplementary_material": "", "author": "Robert Hannah;Fei Feng;Wotao Yin", "authorids": "roberthannah89@gmail.com;fei.feng@math.ucla.edu;wotaoyin@math.ucla.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nhannah2018abcd,\ntitle={A2{BCD}: Asynchronous Acceleration with Optimal Complexity},\nauthor={Robert Hannah and Fei Feng and Wotao Yin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rylIAsCqYm},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "7;7;9", "confidence": "5;4;5", "wc_review": "212;296;119", "wc_reply_reviewers": "0;67;0", "wc_reply_authors": "383;983;189", "reply_reviewers": "0;1;0", "reply_authors": "1;2;1", "rating_avg": [ 7.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 209.0, 72.29107828771127 ], "wc_reply_reviewers_avg": [ 22.333333333333332, 31.584102892999123 ], "wc_reply_authors_avg": [ 518.3333333333334, 337.9796180573944 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14946343324761347269&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "openreview": "https://openreview.net/forum?id=rylIAsCqYm", "pdf": "https://openreview.net/pdf?id=rylIAsCqYm", "email": ";;", "author_num": 3 }, { "id": "rylIy3R9K7", "title": "Understand the dynamics of GANs via Primal-Dual Optimization", "track": "main", "status": "Reject", "tldr": "We show that, with a proper stepsize choice, the widely used first-order iterative algorithm in training GANs would in fact converge to a stationary solution with a sublinear rate.", "abstract": "Generative adversarial network (GAN) is one of the best known unsupervised learning techniques these days due to its superior ability to learn data distributions. In spite of its great success in applications, GAN is known to be notoriously hard to train. The tremendous amount of time it takes to run the training algorithm and its sensitivity to hyper-parameter tuning have been haunting researchers in this area. To resolve these issues, we need to first understand how GANs work. Herein, we take a step toward this direction by examining the dynamics of GANs. We relate a large class of GANs including the Wasserstein GANs to max-min optimization problems with the coupling term being linear over the discriminator. By developing new primal-dual optimization tools, we show that, with a proper stepsize choice, the widely used first-order iterative algorithm in training GANs would in fact converge to a stationary solution with a sublinear rate. The same framework also applies to multi-task learning and distributional robust learning problems. We verify our analysis on numerical examples with both synthetic and real data sets. 
We hope our analysis shed light on future studies on the theoretical properties of relevant machine learning problems.", "keywords": "non-convex optimization;generative adversarial network;primal dual algorithm", "primary_area": "", "supplementary_material": "", "author": "Songtao Lu;Rahul Singh;Xiangyi Chen;Yongxin Chen;Mingyi Hong", "authorids": "lus@umn.edu;rasingh@gatech.edu;chen5719@umn.edu;yongchen@gatech.edu;mhong@umn.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nlu2019understand,\ntitle={Understand the dynamics of {GAN}s via Primal-Dual Optimization},\nauthor={Songtao Lu and Rahul Singh and Xiangyi Chen and Yongxin Chen and Mingyi Hong},\nyear={2019},\nurl={https://openreview.net/forum?id=rylIy3R9K7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rylIy3R9K7", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;3;3", "wc_review": "123;263;231", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "387;636;357", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 205.66666666666666, 59.89620652057654 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 460.0, 125.05198918849712 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17973376070849267234&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "rylKB3A9Fm", "title": "Assessing Generalization in Deep Reinforcement Learning", "track": "main", "status": "Reject", "tldr": "We provide the first benchmark and common experimental protocol for investigating generalization in RL, and conduct a systematic evaluation of state-of-the-art deep RL algorithms.", "abstract": "Deep reinforcement learning (RL) has achieved breakthrough results on many tasks, but has been shown to be sensitive to system changes at test time. As a result, building deep RL agents that generalize has become an active research area. Our aim is to catalyze and streamline community-wide progress on this problem by providing the first benchmark and a common experimental protocol for investigating generalization in RL. Our benchmark contains a diverse set of environments and our evaluation methodology covers both in-distribution and out-of-distribution generalization. To provide a set of baselines for future research, we conduct a systematic evaluation of state-of-the-art algorithms, including those that specifically tackle the problem of generalization. 
The experimental results indicate that in-distribution generalization may be within the capacity of current algorithms, while out-of-distribution generalization is an exciting challenge for future work.", "keywords": "reinforcement learning;generalization;benchmark", "primary_area": "", "supplementary_material": "", "author": "Charles Packer*;Katelyn Gao*;Jernej Kos;Philipp Krahenbuhl;Vladlen Koltun;Dawn Song", "authorids": "cpacker@berkeley.edu;katelyn.gao@intel.com;jernej@kos.mx;philkr@cs.utexas.edu;vladlen.koltun@intel.com;dawnsong@berkeley.edu", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\npacker*2019assessing,\ntitle={Assessing Generalization in Deep Reinforcement Learning},\nauthor={Charles Packer* and Katelyn Gao* and Jernej Kos and Philipp Krahenbuhl and Vladlen Koltun and Dawn Song},\nyear={2019},\nurl={https://openreview.net/forum?id=rylKB3A9Fm},\n}", "github": "[![github](/images/github_icon.svg) sunblaze-ucb/rl-generalization](https://github.com/sunblaze-ucb/rl-generalization)", "project": "", "reviewers": "AnonReviewer4;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=rylKB3A9Fm", "pdf_size": 0, "rating": "3;5;5", "confidence": "5;2;3", "wc_review": "505;154;165", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "881;238;300", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 4.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 3.3333333333333335, 1.247219128924647 ], "wc_review_avg": [ 274.6666666666667, 162.93216038857673 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 473.0, 289.6077807426221 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -0.9449111825230678, "gs_citation": 273, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6884255163861492539&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3 }, { "title": "Learning to Infer and Execute 3D Shape Programs", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/639", "id": "rylNH20qFQ", "author_site": "Yonglong Tian, Andrew Luo, Xingyuan Sun, Kevin Ellis, William Freeman, Joshua B Tenenbaum, Jiajun Wu", "tldr": "We propose 3D shape programs, a structured, compositional shape representation. Our model learns to infer and execute shape programs to explain 3D shapes.", "abstract": "Human perception of 3D shapes goes beyond reconstructing them as a set of points or a composition of geometric primitives: we also effortlessly understand higher-level shape structure such as the repetition and reflective symmetry of object parts. In contrast, recent advances in 3D shape sensing focus more on low-level geometry but less on these higher-level relationships. In this paper, we propose 3D shape programs, integrating bottom-up recognition systems with top-down, symbolic program structure to capture both low-level geometry and high-level structural priors for 3D shapes. Because there are no annotations of shape programs for real shapes, we develop neural modules that not only learn to infer 3D shape programs from raw, unannotated shapes, but also to execute these programs for shape reconstruction. 
After initial bootstrapping, our end-to-end differentiable model learns 3D shape programs by reconstructing shapes in a self-supervised manner. Experiments demonstrate that our model accurately infers and executes 3D shape programs for highly complex shapes from various categories. It can also be integrated with an image-to-shape module to infer 3D shape programs directly from an RGB image, leading to 3D shape reconstructions that are both more accurate and more physically plausible.", "keywords": "Program Synthesis;3D Shape Modeling;Self-supervised Learning", "primary_area": "", "supplementary_material": "", "author": "Yonglong Tian;Andrew Luo;Xingyuan Sun;Kevin Ellis;William T. Freeman;Joshua B. Tenenbaum;Jiajun Wu", "authorids": "yonglong@mit.edu;aluo@mit.edu;xs5@princeton.edu;ellisk@mit.edu;billf@mit.edu;jbt@mit.edu;jiajunwu@mit.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@inproceedings{\ntian2018learning,\ntitle={Learning to Infer and Execute 3D Shape Programs},\nauthor={Yonglong Tian and Andrew Luo and Xingyuan Sun and Kevin Ellis and William T. Freeman and Joshua B. Tenenbaum and Jiajun Wu},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rylNH20qFQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;5", "wc_review": "672;580;351", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "892;573;306", "reply_reviewers": "0;0;0", "reply_authors": "2;2;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 534.3333333333334, 134.967485796477 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 590.3333333333334, 239.54725815356122 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 20, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 169, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12000176727118199358&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 15, "openreview": "https://openreview.net/forum?id=rylNH20qFQ", "pdf": "https://openreview.net/pdf?id=rylNH20qFQ", "email": ";;;;;;", "author_num": 7 }, { "id": "rylRgh0qK7", "title": "NA", "track": "main", "status": "Withdraw", "tldr": "", "abstract": "NA", "keywords": "NA", "primary_area": "", "supplementary_material": "", "author": "Qingpeng Cai;Ling Pan;Pingzhong Tang", "authorids": "cqpcurry@gmail.com;penny.ling.pan@gmail.com;kenshinping@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rylRgh0qK7", "pdf_size": 0, "rating": "1;4;5", "confidence": "4;4;3", "wc_review": "631;499;299", "wc_reply_reviewers": "278;0;0", "wc_reply_authors": "125;0;0", "reply_reviewers": "1;0;0", "reply_authors": "1;0;0", "rating_avg": [ 3.3333333333333335, 1.699673171197595 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 476.3333333333333, 136.48280314953806 ], 
"wc_reply_reviewers_avg": [ 92.66666666666667, 131.0504567799068 ], "wc_reply_authors_avg": [ 41.666666666666664, 58.92556509887896 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 6, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.6933752452815362, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "rylU8oRctX", "title": "Learning with Little Data: Evaluation of Deep Learning Algorithms", "track": "main", "status": "Withdraw", "tldr": "Comparison of siamese neural networks, GANs, and VAT for few shot learning. ", "abstract": "Deep learning has become a widely used tool in many computational and classification problems. \nNevertheless obtaining and labeling data, which is needed for strong results, is often expensive or even not possible. \nIn this paper three different algorithmic approaches to deal with limited access to data are evaluated and compared to each other. \nWe show the drawbacks and benefits of each method. \nOne successful approach, especially in one- or few-shot learning tasks, is the use of external data during the classification task. \nAnother successful approach, which achieves state of the art results in semi-supervised learning (SSL) benchmarks, is consistency regularization.\nEspecially virtual adversarial training (VAT) has shown strong results and will be investigated in this paper. \nThe aim of consistency regularization is to force the network not to change the output, when the input or the network itself is perturbed.\nGenerative adversarial networks (GANs) have also shown strong empirical results. \nIn many approaches the GAN architecture is used in order to create additional data and therefor to increase the generalization capability of the classification network.\nFurthermore we consider the use of unlabeled data for further performance improvement. \nThe use of unlabeled data is investigated both for GANs and VAT. 
\n", "keywords": "semi-supervised learning;generative models;few shot learning", "primary_area": "", "supplementary_material": "", "author": "Andreas Look;Stefan Riedelbauch", "authorids": "andreas.look@ihs.uni-stuttgart.de;stefan.riedelbauch@ihs.uni-stuttgart.de", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=rylU8oRctX", "pdf_size": 0, "rating": "4;4;6", "confidence": "5;3;4", "wc_review": "1025;64;239", "wc_reply_reviewers": "73;0;0", "wc_reply_authors": "61;0;0", "reply_reviewers": "1;0;0", "reply_authors": "1;0;0", "rating_avg": [ 4.666666666666667, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 442.6666666666667, 417.9237038290868 ], "wc_reply_reviewers_avg": [ 24.333333333333332, 34.41253001774532 ], "wc_reply_authors_avg": [ 20.333333333333332, 28.755675768252935 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17636624619749178267&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Deep Decoder: Concise Image Representations from Untrained Non-convolutional Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/973", "id": "rylV-2C9KQ", "author_site": "Reinhard Heckel, Paul Hand", "tldr": "We introduce an underparameterized, nonconvolutional, and simple deep neural network that can, without training, effectively represent natural images and solve image processing tasks like compression and denoising competitively.", "abstract": "Deep neural networks, in particular convolutional neural networks, have become highly effective tools for compressing images and solving inverse problems including denoising, inpainting, and reconstruction from few and noisy measurements. This success can be attributed in part to their ability to represent and generate natural images well. Contrary to classical tools such as wavelets, image-generating deep neural networks have a large number of parameters---typically a multiple of their output dimension---and need to be trained on large datasets. \nIn this paper, we propose an untrained simple image model, called the deep decoder, which is a deep neural network that can generate natural images from very few weight parameters.\nThe deep decoder has a simple architecture with no convolutions and fewer weight parameters than the output dimensionality. This underparameterization enables the deep decoder to compress images into a concise set of network weights, which we show is on par with wavelet-based thresholding. Further, underparameterization provides a barrier to overfitting, allowing the deep decoder to have state-of-the-art performance for denoising. The deep decoder is simple in the sense that each layer has an identical structure that consists of only one upsampling unit, pixel-wise linear combination of channels, ReLU activation, and channelwise normalization. 
This simplicity makes the network amenable to theoretical analysis, and it sheds light on the aspects of neural networks that enable them to form effective signal representations.", "keywords": "natural image model;image prior;under-determined neural networks;untrained network;non-convolutional network;denoising;inverse problem", "primary_area": "", "supplementary_material": "", "author": "Reinhard Heckel;Paul Hand", "authorids": "rh43@rice.edu;p.hand@northeastern.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\nheckel2018deep,\ntitle={Deep Decoder: Concise Image Representations from Untrained Non-convolutional Networks},\nauthor={Reinhard Heckel and Paul Hand},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rylV-2C9KQ},\n}", "github": "[![github](/images/github_icon.svg) reinhardh/supplement_deep_decoder](https://github.com/reinhardh/supplement_deep_decoder) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=rylV-2C9KQ)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "7;8;8", "confidence": "3;4;4", "wc_review": "910;459;334", "wc_reply_reviewers": "233;0;0", "wc_reply_authors": "1045;379;79", "reply_reviewers": "2;0;0", "reply_authors": "2;1;1", "rating_avg": [ 7.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 567.6666666666666, 247.38678667669828 ], "wc_reply_reviewers_avg": [ 77.66666666666667, 109.83725334431037 ], "wc_reply_authors_avg": [ 501.0, 403.6929526261265 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.9999999999999997, "gs_citation": 374, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5031846359818705791&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rylV-2C9KQ", "pdf": "https://openreview.net/pdf?id=rylV-2C9KQ", "email": ";", "author_num": 2 }, { "id": "rylV6i09tX", "title": "Interpreting Adversarial Robustness: A View from Decision Surface in Input Space", "track": "main", "status": "Reject", "tldr": "", "abstract": "One popular hypothesis of neural network generalization is that the flat local minima of loss surface in parameter space leads to good generalization. However, we demonstrate that loss surface in parameter space has no obvious relationship with generalization, especially under adversarial settings. Through visualizing decision surfaces in both parameter space and input space, we instead show that the geometry property of decision surface in input space correlates well with the adversarial robustness. We then propose an adversarial robustness indicator, which can evaluate a neural network's intrinsic robustness property without testing its accuracy under adversarial attacks. Guided by it, we further propose our robust training method. 
Without involving adversarial training, our method could enhance network's intrinsic adversarial robustness against various adversarial attacks.", "keywords": "Adversarial examples;Robustness", "primary_area": "", "supplementary_material": "", "author": "Fuxun Yu;Chenchen Liu;Yanzhi Wang;Xiang Chen", "authorids": "fyu2@gmu.edu;chliu@clarkson.edu;yanz.wang@northeastern.edu;xchen26@gmu.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nyu2019interpreting,\ntitle={Interpreting Adversarial Robustness: A View from Decision Surface in Input Space},\nauthor={Fuxun Yu and Chenchen Liu and Yanzhi Wang and Xiang Chen},\nyear={2019},\nurl={https://openreview.net/forum?id=rylV6i09tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rylV6i09tX", "pdf_size": 0, "rating": "3;5;6", "confidence": "5;5;4", "wc_review": "950;326;386", "wc_reply_reviewers": "0;375;178", "wc_reply_authors": "293;1180;629", "reply_reviewers": "0;2;1", "reply_authors": "1;3;2", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 554.0, 281.0836174521738 ], "wc_reply_reviewers_avg": [ 184.33333333333334, 153.15859608770106 ], "wc_reply_authors_avg": [ 700.6666666666666, 365.64494010203697 ], "reply_reviewers_avg": [ 1.0, 0.816496580927726 ], "reply_authors_avg": [ 2.0, 0.816496580927726 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 36, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7342764919929139827&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5 }, { "id": "rylWVnR5YQ", "title": "Context Dependent Modulation of Activation Function", "track": "main", "status": "Reject", "tldr": "We propose a modification to traditional Artificial Neural Networks motivated by the biology of neurons to enable the shape of the activation function to be context dependent.", "abstract": "We propose a modification to traditional Artificial Neural Networks (ANNs), which provides the ANNs with new aptitudes motivated by biological neurons. Biological neurons work far beyond linearly summing up synaptic inputs and then transforming the integrated information. A biological neuron change firing modes accordingly to peripheral factors (e.g., neuromodulators) as well as intrinsic ones. Our modification connects a new type of ANN nodes, which mimic the function of biological neuromodulators and are termed modulators, to enable other traditional ANN nodes to adjust their activation sensitivities in run-time based on their input patterns. In this manner, we enable the slope of the activation function to be context dependent. This modification produces statistically significant improvements in comparison with traditional ANN nodes in the context of Convolutional Neural Networks and Long Short-Term Memory networks. 
", "keywords": "Artificial Neural Network;Convolution Neural Network;Long Short-Term Memory;Activation Function;Neuromodulation", "primary_area": "", "supplementary_material": "", "author": "Long Sha;Jonathan Schwarcz;Pengyu Hong", "authorids": "longsha@brandeis.edu;johnschwarcz@brandeis.edu;hongpeng@brandeis.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nsha2019context,\ntitle={Context Dependent Modulation of Activation Function},\nauthor={Long Sha and Jonathan Schwarcz and Pengyu Hong},\nyear={2019},\nurl={https://openreview.net/forum?id=rylWVnR5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer5;AnonReviewer4;AnonReviewer2", "site": "https://openreview.net/forum?id=rylWVnR5YQ", "pdf_size": 0, "rating": "4;4;4;4;6", "confidence": "5;4;3;5;4", "wc_review": "382;628;473;493;366", "wc_reply_reviewers": "0;0;0;0;0", "wc_reply_authors": "126;198;138;166;118", "reply_reviewers": "0;0;0;0;0", "reply_authors": "1;1;1;1;1", "rating_avg": [ 4.4, 0.7999999999999999 ], "confidence_avg": [ 4.2, 0.7483314773547882 ], "wc_review_avg": [ 468.4, 93.8607479194578 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 149.2, 29.328484447717372 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.13363062095621223, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9078955103173238799&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "rylbWhC5Ym", "title": "HR-TD: A Regularized TD Method to Avoid Over-Generalization", "track": "main", "status": "Reject", "tldr": "A regularization technique for TD learning that avoids temporal over-generalization, especially in Deep Networks", "abstract": "Temporal Difference learning with function approximation has been widely used recently and has led to several successful results. However, compared with the original tabular-based methods, one major drawback of temporal difference learning with neural networks and other function approximators is that they tend to over-generalize across temporally successive states, resulting in slow convergence and even instability. In this work, we propose a novel TD learning method, Hadamard product Regularized TD (HR-TD), that reduces over-generalization and thus leads to faster convergence. This approach can be easily applied to both linear and nonlinear function approximators. 
\nHR-TD is evaluated on several linear and nonlinear benchmark domains, where we show improvement in learning behavior and performance.", "keywords": "Reinforcement Learning;TD Learning;Deep Learning", "primary_area": "", "supplementary_material": "", "author": "Ishan Durugkar;Bo Liu;Peter Stone", "authorids": "ishand@cs.utexas.edu;liubo19831214@gmail.com;pstone@cs.utexas.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\ndurugkar2019hrtd,\ntitle={{HR}-{TD}: A Regularized {TD} Method to Avoid Over-Generalization},\nauthor={Ishan Durugkar and Bo Liu and Peter Stone},\nyear={2019},\nurl={https://openreview.net/forum?id=rylbWhC5Ym},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=rylbWhC5Ym", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;4", "wc_review": "602;1060;296", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 652.6666666666666, 313.95257957567765 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:EqyyoIWTfkUJ:scholar.google.com/&scioq=HR-TD:+A+Regularized+TD+Method+to+Avoid+Over-Generalization&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "rylhToC5YQ", "title": "Unsupervised Neural Multi-Document Abstractive Summarization of Reviews", "track": "main", "status": "Reject", "tldr": "We propose an end-to-end neural model for unsupervised multi-document abstractive summarization, applying it to business and product reviews.", "abstract": "Abstractive summarization has been studied using neural sequence transduction methods with datasets of large, paired document-summary examples. However, such datasets are rare and the models trained from them do not generalize to other domains. Recently, some progress has been made in learning sequence-to-sequence mappings with only unpaired examples. In our work, we consider the setting where there are only documents (product or business reviews) with no summaries provided, and propose an end-to-end, neural model architecture to perform unsupervised abstractive summarization. Our proposed model consists of an auto-encoder trained so that the mean of the representations of the input reviews decodes to a reasonable summary-review. We consider variants of the proposed architecture and perform an ablation study to show the importance of specific components. We show through metrics and human evaluation that the generated summaries are highly abstractive, fluent, relevant, and representative of the average sentiment of the input reviews.", "keywords": "unsupervised learning;abstractive summarization;reviews;text generation", "primary_area": "", "supplementary_material": "", "author": "Eric Chu;Peter J. 
Liu", "authorids": "echu@mit.edu;peterjliu@google.com", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nchu2019unsupervised,\ntitle={Unsupervised Neural Multi-Document Abstractive Summarization of Reviews},\nauthor={Eric Chu and Peter J. Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=rylhToC5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=rylhToC5YQ", "pdf_size": 0, "rating": "4;5;9", "confidence": "4;4;4", "wc_review": "554;771;225", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "553;661;104", "reply_reviewers": "0;0;0", "reply_authors": "2;1;1", "rating_avg": [ 6.0, 2.160246899469287 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 516.6666666666666, 224.46133049790313 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 439.3333333333333, 241.1808910801646 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7270218515250082134&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ryljV2A5KX", "title": "IB-GAN: Disentangled Representation Learning with Information Bottleneck GAN", "track": "main", "status": "Reject", "tldr": "Inspired by Information Bottleneck theory, we propose a new architecture of GAN for a disentangled representation learning", "abstract": "We present a novel architecture of GAN for a disentangled representation learning. The new model architecture is inspired by Information Bottleneck (IB) theory thereby named IB-GAN. IB-GAN objective is similar to that of InfoGAN but has a crucial difference; a capacity regularization for mutual information is adopted, thanks to which the generator of IB-GAN can harness a latent representation in disentangled and interpretable manner. To facilitate the optimization of IB-GAN in practice, a new variational upper-bound is derived. With experiments on CelebA, 3DChairs, and dSprites datasets, we demonstrate that the visual quality of samples generated by IB-GAN is often better than those by \u03b2-VAEs. 
Moreover, IB-GAN achieves much higher disentanglement metrics score than \u03b2-VAEs or InfoGAN on the dSprites dataset.", "keywords": "Unsupervised disentangled representation learning;GAN;Information Bottleneck;Variational Inference", "primary_area": "", "supplementary_material": "", "author": "Insu Jeon;Wonkwang Lee;Gunhee Kim", "authorids": "isjeon@vision.snu.ac.kr;wonkwang.lee.94@gmail.com;gunhee@snu.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\njeon2019ibgan,\ntitle={{IB}-{GAN}: Disentangled Representation Learning with Information Bottleneck {GAN}},\nauthor={Insu Jeon and Wonkwang Lee and Gunhee Kim},\nyear={2019},\nurl={https://openreview.net/forum?id=ryljV2A5KX},\n}", "github": "[![github](/images/github_icon.svg) insuj3on/IB-GAN](https://github.com/insuj3on/IB-GAN) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=ryljV2A5KX)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryljV2A5KX", "pdf_size": 0, "rating": "4;7;7", "confidence": "4;3;4", "wc_review": "510;383;788", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1139;569;899", "reply_reviewers": "0;0;0", "reply_authors": "2;1;2", "rating_avg": [ 6.0, 1.4142135623730951 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 560.3333333333334, 169.1278280538783 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 869.0, 233.66642891095844 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.5, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "title": "SNAS: stochastic neural architecture search", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/700", "id": "rylqooRqK7", "author_site": "Sirui Xie, Hehui Zheng, Chunxiao Liu, Liang Lin", "tldr": "", "abstract": "We propose Stochastic Neural Architecture Search (SNAS), an economical end-to-end solution to Neural Architecture Search (NAS) that trains neural operation parameters and architecture distribution parameters in same round of back-propagation, while maintaining the completeness and differentiability of the NAS pipeline. In this work, NAS is reformulated as an optimization problem on parameters of a joint distribution for the search space in a cell. To leverage the gradient information in generic differentiable loss for architecture search, a novel search gradient is proposed. We prove that this search gradient optimizes the same objective as reinforcement-learning-based NAS, but assigns credits to structural decisions more efficiently. This credit assignment is further augmented with locally decomposable reward to enforce a resource-efficient constraint. In experiments on CIFAR-10, SNAS takes less epochs to find a cell architecture with state-of-the-art accuracy than non-differentiable evolution-based and reinforcement-learning-based NAS, which is also transferable to ImageNet. 
It is also shown that child networks of SNAS can maintain the validation accuracy in searching, with which attention-based NAS requires parameter retraining to compete, exhibiting potentials to stride towards efficient NAS on big datasets.", "keywords": "Neural Architecture Search", "primary_area": "", "supplementary_material": "", "author": "Sirui Xie;Hehui Zheng;Chunxiao Liu;Liang Lin", "authorids": "xiesirui@sensetime.com;zhenghehui@sensetime.com;liuchunxiao@sensetime.com;linliang@ieee.org", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nxie2018snas,\ntitle={{SNAS}: stochastic neural architecture search},\nauthor={Sirui Xie and Hehui Zheng and Chunxiao Liu and Liang Lin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=rylqooRqK7},\n}", "github": "[![github](/images/github_icon.svg) SNAS-Series/SNAS-Series](https://github.com/SNAS-Series/SNAS-Series) + [![Papers with Code](/images/pwc_icon.svg) 1 community implementation](https://paperswithcode.com/paper/?openreview=rylqooRqK7)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "4;4;4", "wc_review": "176;204;429", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1161;237;32", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 269.6666666666667, 113.2440825042184 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 476.6666666666667, 491.0806677341537 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 26, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1190, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13328811299154907405&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "openreview": "https://openreview.net/forum?id=rylqooRqK7", "pdf": "https://openreview.net/pdf?id=rylqooRqK7", "email": ";;;", "author_num": 4 }, { "id": "rylxrsR9Fm", "title": "Neuron Hierarchical Networks", "track": "main", "status": "Withdraw", "tldr": "By breaking the layer hierarchy, we propose a 3-step approach to the construction of neuron-hierarchy networks that outperform NAS, SMASH and hierarchical representation with fewer parameters and shorter searching time.", "abstract": "In this paper, we propose a neural network framework called neuron hierarchical network (NHN), that evolves beyond the hierarchy in layers, and concentrates on the hierarchy of neurons. We observe mass redundancy in the weights of both handcrafted and randomly searched architectures. Inspired by the development of human brains, we prune low-sensitivity neurons in the model and add new neurons to the graph, and the relation between individual neurons are emphasized and the existence of layers weakened. We propose a process to discover the best base model by random architecture search, and discover the best locations and connections of the added neurons by evolutionary search. 
Experiment results show that the NHN achieves higher test accuracy on Cifar-10 than state-of-the-art handcrafted and randomly searched architectures, while requiring much fewer parameters and less searching time.", "keywords": "neural network;architecture search;evolution strategy", "primary_area": "", "supplementary_material": "", "author": "Han Yue;De-An Wu;Lei Wu;Ji Xie", "authorids": "johnhany@163.com;wudean.cn@uestc.edu.cn;wulei@uestc.edu.cn;zonghengxs@163.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=rylxrsR9Fm", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "wc_review": "288;602;240", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 376.6666666666667, 160.53521593580507 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "rylxxhRctX", "title": "Coverage and Quality Driven Training of Generative Image Models", "track": "main", "status": "Reject", "tldr": "Generative models that yield Gan-like samples and achieve competitive likelihood on held-out data. ", "abstract": "Generative modeling of natural images has been extensively studied in recent years, yielding remarkable progress. Current state-of-the-art methods are either based on maximum likelihood estimation or adversarial training. Both methods have their own drawbacks, which are complementary in nature. The first leads to over-generalization as the maximum likelihood criterion encourages models to cover the support of the training data by heavily penalizing small masses assigned to training data. Simplifying assumptions in such models limits their capacity and makes them spill mass on unrealistic samples. The second leads to mode-dropping since adversarial training encourages high quality samples from the model, but only indirectly enforces diversity among the samples. To overcome these drawbacks we make two contributions. First, we propose a model that extends variational autoencoders by using deterministic invertible transformation layers to map samples from the decoder to the image space. This induces correlations among the pixels given the latent variables, improving over factorial decoders commonly used in variational autoencoders. Second, we propose a unified training approach that leverages coverage and quality based criteria. Our models obtain likelihood scores competitive with state-of-the-art likelihood-based models, while achieving sample quality typical of adversarially trained networks. 
", "keywords": "deep learning;generative modeling;unsupervised learning;maximum likelihood;adversarial learning;gan;vae", "primary_area": "", "supplementary_material": "", "author": "Thomas LUCAS;Konstantin SHMELKOV;Karteek ALAHARI;Cordelia SCHMID;Jakob VERBEEK", "authorids": "thomas.lucas@inria.fr;konstantin.shmelkov@inria.fr;karteek.alahari@inria.fr;cordelia.schmid@inria.fr;", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nlucas2019coverage,\ntitle={Coverage and Quality Driven Training of Generative Image Models},\nauthor={Thomas LUCAS and Konstantin SHMELKOV and Karteek ALAHARI and Cordelia SCHMID and Jakob VERBEEK},\nyear={2019},\nurl={https://openreview.net/forum?id=rylxxhRctX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer4;AnonReviewer1", "site": "https://openreview.net/forum?id=rylxxhRctX", "pdf_size": 0, "rating": "4;5;7", "confidence": "5;4;4", "wc_review": "282;705;113", "wc_reply_reviewers": "832;611;0", "wc_reply_authors": "3227;2099;25", "reply_reviewers": "3;3;0", "reply_authors": "6;5;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 366.6666666666667, 248.9877283901536 ], "wc_reply_reviewers_avg": [ 481.0, 351.8816088781377 ], "wc_reply_authors_avg": [ 1783.6666666666667, 1326.0913325341592 ], "reply_reviewers_avg": [ 2.0, 1.4142135623730951 ], "reply_authors_avg": [ 4.0, 2.160246899469287 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": -0.7559289460184544, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5030699934186415750&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ryx3_iAcY7", "title": "Contextualized Role Interaction for Neural Machine Translation", "track": "main", "status": "Reject", "tldr": "We propose a role interaction layer that explicitly models the modulation of token representations by contextualized roles.", "abstract": "Word inputs tend to be represented as single continuous vectors in deep neural networks. It is left to the subsequent layers of the network to extract relevant aspects of a word's meaning based on the context in which it appears. In this paper, we investigate whether word representations can be improved by explicitly incorporating the idea of latent roles. That is, we propose a role interaction layer (RIL) that consists of context-dependent (latent) role assignments and role-specific transformations. We evaluate the RIL on machine translation using two language pairs (En-De and En-Fi) and three datasets of varying size. We find that the proposed mechanism improves translation quality over strong baselines with limited amounts of data, but that the improvement diminishes as the size of data grows, indicating that powerful neural MT systems are capable of implicitly modeling role-word interaction by themselves. 
Our qualitative analysis reveals that the RIL extracts meaningful context-dependent roles and that it allows us to inspect more deeply the internal mechanisms of state-of-the-art neural machine translation systems.", "keywords": "Neural Machine Translation;Natural Language Processing", "primary_area": "", "supplementary_material": "", "author": "Dirk Weissenborn;Douwe Kiela;Jason Weston;Kyunghyun Cho", "authorids": "dirk.weissenborn@gmail.com;dkiela@fb.com;jase@fb.com;kyunghyun.cho@nyu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nweissenborn2019contextualized,\ntitle={Contextualized Role Interaction for Neural Machine Translation},\nauthor={Dirk Weissenborn and Douwe Kiela and Jason Weston and Kyunghyun Cho},\nyear={2019},\nurl={https://openreview.net/forum?id=ryx3_iAcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryx3_iAcY7", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;5;4", "wc_review": "95;162;215", "wc_reply_reviewers": "0;84;169", "wc_reply_authors": "124;236;147", "reply_reviewers": "0;1;1", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 157.33333333333334, 49.100803342602134 ], "wc_reply_reviewers_avg": [ 84.33333333333333, 68.99436369894444 ], "wc_reply_authors_avg": [ 169.0, 48.29768800539697 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2926208306175889158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2 }, { "id": "ryxDUs05KQ", "title": "Difference-Seeking Generative Adversarial Network", "track": "main", "status": "Reject", "tldr": "We proposed \"Difference-Seeking Generative Adversarial Network\" (DSGAN) model to learn the target distribution which is hard to collect training data.", "abstract": "We propose a novel algorithm, Difference-Seeking Generative Adversarial Network (DSGAN), developed from traditional GAN. DSGAN considers the scenario that the training samples of target distribution, $p_{t}$, are difficult to collect.\n\nSuppose there are two distributions $p_{\\bar{d}}$ and $p_{d}$ such that the density of the target distribution can be the differences between the densities of $p_{\\bar{d}}$ and $p_{d}$. We show how to learn the target distribution $p_{t}$ only via samples from $p_{d}$ and $p_{\\bar{d}}$ (relatively easy to obtain).\n\nDSGAN has the flexibility to produce samples from various target distributions (e.g. the out-of-distribution). Two key applications, semi-supervised learning and adversarial training, are taken as examples to validate the effectiveness of DSGAN. 
We also provide theoretical analyses about the convergence of DSGAN.", "keywords": "Generative Adversarial Network;Semi-Supervised Learning;Adversarial Training", "primary_area": "", "supplementary_material": "", "author": "Yi-Lin Sung;Sung-Hsien Hsieh;Soo-Chang Pei;Chun-Shien Lu", "authorids": "r06942076@ntu.edu.tw;parvaty316@hotmail.com;peisc@ntu.edu.tw;lcs@iis.sinica.edu.tw", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nsung2019differenceseeking,\ntitle={Difference-Seeking Generative Adversarial Network},\nauthor={Yi-Lin Sung and Sung-Hsien Hsieh and Soo-Chang Pei and Chun-Shien Lu},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxDUs05KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryxDUs05KQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;4", "wc_review": "542;378;373", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "847;478;389", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 431.0, 78.51539127245478 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 571.3333333333334, 198.28318693782947 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:oIM3pkUhCt4J:scholar.google.com/&scioq=Difference-Seeking+Generative+Adversarial+Network&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "ryxDjjCqtQ", "title": "Deconfounding Reinforcement Learning in Observational Settings", "track": "main", "status": "Reject", "tldr": "This is the first attempt to build a bridge between confounding and the full reinforcement learning problem.", "abstract": "In this paper, we propose a general formulation to cope with a family of reinforcement learning tasks in observational settings, that is, learning good policies solely from the historical data produced by real environments with confounders (i.e., the factors affecting both actions and rewards). Based on the proposed approach, we extend one representative of reinforcement learning algorithms: the Actor-Critic method, to its deconfounding variant, which is also straightforward to be applied to other algorithms. In addition, due to lack of datasets in this direction, a benchmark is developed for deconfounding reinforcement learning algorithms by revising OpenAI Gym and MNIST. We demonstrate that the proposed algorithms are superior to traditional reinforcement learning algorithms in confounded environments. 
To the best of our knowledge, this is the first time that confounders are taken into consideration for addressing full reinforcement learning problems.", "keywords": "confounder;causal inference;reinforcement learning", "primary_area": "", "supplementary_material": "", "author": "Chaochao Lu;Jos\u00e9 Miguel Hern\u00e1ndez Lobato", "authorids": "cl641@cam.ac.uk;jmh233@cam.ac.uk", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nlu2019deconfounding,\ntitle={Deconfounding Reinforcement Learning in Observational Settings},\nauthor={Chaochao Lu and Jos\u00e9 Miguel Hern\u00e1ndez Lobato},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxDjjCqtQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryxDjjCqtQ", "pdf_size": 0, "rating": "2;4;4", "confidence": "4;4;3", "wc_review": "707;877;163", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "981;1581;101", "reply_reviewers": "0;0;0", "reply_authors": "2;3;2", "rating_avg": [ 3.3333333333333335, 0.9428090415820634 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 582.3333333333334, 304.52732043100644 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 887.6666666666666, 607.801137068879 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 0.4714045207910317 ], "replies_avg": [ 41, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.5, "gs_citation": 80, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9271066941467023390&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5 }, { "id": "ryxHii09KQ", "title": "In Your Pace: Learning the Right Example at the Right Time", "track": "main", "status": "Reject", "tldr": "We provide a formal definition of curriculum learning for deep neural networks, empirically showing how it can improve learning performance without additional human supervision and in a problem-free manner.", "abstract": "Training neural networks is traditionally done by sequentially providing random mini-batches sampled uniformly from the entire dataset. In our work, we show that sampling mini-batches non-uniformly can both enhance the speed of learning and improve the final accuracy of the trained network. Specifically, we decompose the problem using the principles of curriculum learning: first, we sort the data by some difficulty measure; second, we sample mini-batches with a gradually increasing level of difficulty. We focus on CNNs trained on image recognition. Initially, we define the difficulty of a training image using transfer learning from some competitive \"teacher\" network trained on the Imagenet database, showing improvement in learning speed and final performance for both small and competitive networks, using the CIFAR-10 and the CIFAR-100 datasets. We then suggest a bootstrap alternative to evaluate the difficulty of points using the same network without relying on a \"teacher\" network, thus increasing the applicability of our suggested method. 
We compare this approach to a related version of Self-Paced Learning, showing that our method benefits learning while SPL impairs it.", "keywords": "Curriculum Learning;Transfer Learning;Self-Paced Learning;Image Recognition", "primary_area": "", "supplementary_material": "", "author": "Guy Hacohen;Daphna Weinshall", "authorids": "guy.hacohen@mail.huji.ac.il;daphna@cs.huji.ac.il", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nhacohen2019in,\ntitle={In Your Pace: Learning the Right Example at the Right Time},\nauthor={Guy Hacohen and Daphna Weinshall},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxHii09KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryxHii09KQ", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;4;4", "wc_review": "1119;366;278", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 587.6666666666666, 377.4231341905557 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FJcaDZF3Um4J:scholar.google.com/&scioq=In+Your+Pace:+Learning+the+Right+Example+at+the+Right+Time&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "ryxLG2RcYX", "title": "Learning Abstract Models for Long-Horizon Exploration", "track": "main", "status": "Reject", "tldr": "We automatically construct and explore a small abstract Markov Decision Process, enabling us to achieve state-of-the-art results on Montezuma's Revenge, Pitfall!, and Private Eye by a significant margin.", "abstract": "In high-dimensional reinforcement learning settings with sparse rewards, performing\neffective exploration to even obtain any reward signal is an open challenge.\nWhile model-based approaches hold promise of better exploration via planning, it\nis extremely difficult to learn a reliable enough Markov Decision Process (MDP)\nin high dimensions (e.g., over 10^100 states). In this paper, we propose learning\nan abstract MDP over a much smaller number of states (e.g., 10^5), which we can\nplan over for effective exploration. We assume we have an abstraction function\nthat maps concrete states (e.g., raw pixels) to abstract states (e.g., agent position,\nignoring other objects). In our approach, a manager maintains an abstract\nMDP over a subset of the abstract states, which grows monotonically through targeted\nexploration (possible due to the abstract MDP). Concurrently, we learn a\nworker policy to travel between abstract states; the worker deals with the messiness\nof concrete states and presents a clean abstraction to the manager. On three of\nthe hardest games from the Arcade Learning Environment (Montezuma's,\nPitfall!, and Private Eye), our approach outperforms the previous\nstate-of-the-art by over a factor of 2 in each game. 
In Pitfall!, our approach is\nthe first to achieve superhuman performance without demonstrations.", "keywords": "Reinforcement Learning;Hierarchical Reinforcement Learning;Model-based Reinforcement Learning;Exploration", "primary_area": "", "supplementary_material": "", "author": "Evan Zheran Liu;Ramtin Keramati;Sudarshan Seshadri;Kelvin Guu;Panupong Pasupat;Emma Brunskill;Percy Liang", "authorids": "evanliu@cs.stanford.edu;keramati@stanford.edu;ssesha@stanford.edu;kguu@stanford.edu;ppasupat@cs.stanford.edu;ebrun@cs.stanford.edu;pliang@cs.stanford.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nliu2019learning,\ntitle={Learning Abstract Models for Long-Horizon Exploration},\nauthor={Evan Zheran Liu and Ramtin Keramati and Sudarshan Seshadri and Kelvin Guu and Panupong Pasupat and Emma Brunskill and Percy Liang},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxLG2RcYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryxLG2RcYX", "pdf_size": 0, "rating": "4;5;6", "confidence": "4;4;2", "wc_review": "458;1091;209", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1490;880;76", "reply_reviewers": "0;0;0", "reply_authors": "4;2;1", "rating_avg": [ 5.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 586.0, 371.27617752826535 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 815.3333333333334, 579.0712871102793 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 2.3333333333333335, 1.247219128924647 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.8660254037844387, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18286489431539955973&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ryxMX2R9YQ", "title": "CGNF: Conditional Graph Neural Fields", "track": "main", "status": "Reject", "tldr": "", "abstract": "Graph convolutional networks have achieved tremendous success in the tasks of graph node classification. These models could learn a better node representation through encoding the graph structure and node features. However, the correlation between the node labels are not considered. In this paper, we propose a novel architecture for graph node classification, named conditional graph neural fields (CGNF). By integrating the conditional random fields (CRF) in the graph convolutional networks, we explicitly model a joint probability of the entire set of node labels, thus taking advantage of neighborhood label information in the node label prediction task. \nOur model could have both the representation capacity of graph neural networks and the prediction power of CRFs. 
Experiments on several graph datasets demonstrate effectiveness of CGNF.", "keywords": "graph neural networks;energy models;conditional random fields;label correlation", "primary_area": "", "supplementary_material": "", "author": "Tengfei Ma;Cao Xiao;Junyuan Shang;Jimeng Sun", "authorids": "tengfei.ma1@ibm.com;cxiao@us.ibm.com;sjy1203@pku.edu.cn;jsun@cc.gatech.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@misc{\nma2019cgnf,\ntitle={{CGNF}: Conditional Graph Neural Fields},\nauthor={Tengfei Ma and Cao Xiao and Junyuan Shang and Jimeng Sun},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxMX2R9YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryxMX2R9YQ", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;5;4", "wc_review": "355;557;158", "wc_reply_reviewers": "0;200;0", "wc_reply_authors": "248;354;194", "reply_reviewers": "0;1;0", "reply_authors": "1;1;1", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 356.6666666666667, 162.89533108376338 ], "wc_reply_reviewers_avg": [ 66.66666666666667, 94.28090415820634 ], "wc_reply_authors_avg": [ 265.3333333333333, 66.45967866976855 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2394644672571989690&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ryxMjoRcYm", "title": "Logically-Constrained Neural Fitted Q-iteration", "track": "main", "status": "Withdraw", "tldr": "As safety is becoming a critical notion in machine learning we believe that this work can act as a foundation for a number of research directions such as safety-aware learning algorithms.", "abstract": "This paper proposes a method for efficient training of Q-function for continuous-state Markov Decision Processes (MDP), such that the traces of the resulting policies satisfy a Linear Temporal Logic (LTL) property. LTL, a modal logic, can express a wide range of time-dependent logical properties including safety and liveness. We convert the LTL property into a limit deterministic Buchi automaton with which a synchronized product MDP is constructed. The control policy is then synthesised by a reinforcement learning algorithm assuming that no prior knowledge is available from the MDP. The proposed method is evaluated in a numerical study to test the quality of the generated control policy and is compared against conventional methods for policy synthesis such as MDP abstraction (Voronoi quantizer) and approximate dynamic programming (fitted value iteration). 
", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Mohammadhosein Hasanbeig;Alessandro Abate;Daniel Kroening", "authorids": "hosein.hasanbeig@cs.ox.ac.uk;aabate@cs.ox.ac.uk;kroening@cs.ox.ac.uk", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryxMjoRcYm", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;2", "wc_review": "419;482;239", "wc_reply_reviewers": "0;195;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;1;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 1.247219128924647 ], "wc_review_avg": [ 380.0, 102.96601381038309 ], "wc_reply_reviewers_avg": [ 65.0, 91.92388155425118 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 9, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.7559289460184545, "gs_citation": 51, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2887283739030398048&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 12 }, { "id": "ryxOIsA5FQ", "title": "Stacking for Transfer Learning", "track": "main", "status": "Reject", "tldr": "How to use stacked generalization to improve the performance of existing transfer learning algorithms when limited labeled data is available.", "abstract": "In machine learning tasks, overtting frequently crops up when the number of samples of target domain is insuf\ufb01cient, for the generalization ability of the classi\ufb01er is poor in this circumstance. To solve this problem, transfer learning utilizes the knowledge of similar domains to improve the robustness of the learner. The main idea of existing transfer learning algorithms is to reduce the dierence between domains by sample selection or domain adaptation. However, no matter what transfer learning algorithm we use, the difference always exists and the hybrid training of source and target data leads to reducing \ufb01tting capability of the learner on target domain. Moreover, when the relatedness between domains is too low, negative transfer is more likely to occur. To tackle the problem, we proposed a two-phase transfer learning architecture based on ensemble learning, which uses the existing transfer learning algorithms to train the weak learners in the \ufb01rst stage, and uses the predictions of target data to train the \ufb01nal learner in the second stage. Under this architecture, the \ufb01tting capability and generalization capability can be guaranteed at the same time. 
We evaluated the proposed method on public datasets, which demonstrates the effectiveness and robustness of our proposed method.", "keywords": "data diversi\ufb01cation;domain adaptation;transfer learning;stacked generalization", "primary_area": "", "supplementary_material": "", "author": "Peng Yuankai", "authorids": "pyk3350266@163.com", "gender": "", "homepage": "", "dblp": "", "google_scholar": "", "orcid": "", "linkedin": "", "or_profile": "", "aff": "", "aff_domain": "", "position": "", "bibtex": "@misc{\nyuankai2019stacking,\ntitle={Stacking for Transfer Learning},\nauthor={Peng Yuankai},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxOIsA5FQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryxOIsA5FQ", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;5;5", "wc_review": "197;414;413", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.0, 0.816496580927726 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 341.3333333333333, 102.05989526852466 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 1, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5903591779960345812&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "Revealing interpretable object representations from human behavior", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/712", "id": "ryxSrhC9KX", "author_site": "Charles Zheng, Francisco Pereira, Chris I Baker, Martin N Hebart", "tldr": "Human behavioral judgments are used to obtain sparse and interpretable representations of objects that generalize to other tasks", "abstract": "To study how mental object representations are related to behavior, we estimated sparse, non-negative representations of objects using human behavioral judgments on images representative of 1,854 object categories. These representations predicted a latent similarity structure between objects, which captured most of the explainable variance in human behavioral judgments. Individual dimensions in the low-dimensional embedding were found to be highly reproducible and interpretable as conveying degrees of taxonomic membership, functionality, and perceptual attributes. We further demonstrated the predictive power of the embeddings for explaining other forms of human behavior, including categorization, typicality judgments, and feature ratings, suggesting that the dimensions reflect human conceptual representations of objects beyond the specific task.", "keywords": "category representation;sparse coding;representation learning;interpretable representations", "primary_area": "", "supplementary_material": "", "author": "Charles Y. Zheng;Francisco Pereira;Chris I. Baker;Martin N. Hebart", "authorids": "charles.zheng@nih.gov;francisco.pereira@nih.gov;bakerchris@mail.nih.gov;martin.hebart@nih.gov", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nzheng2018revealing,\ntitle={Revealing interpretable object representations from human behavior},\nauthor={Charles Y. Zheng and Francisco Pereira and Chris I. Baker and Martin N. 
Hebart},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxSrhC9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;7;7", "confidence": "4;4;4", "wc_review": "323;253;304", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "1610;297;133", "reply_reviewers": "0;0;0", "reply_authors": "3;1;1", "rating_avg": [ 6.333333333333333, 0.9428090415820634 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 293.3333333333333, 29.555973263547855 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 680.0, 661.0088249536966 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.6666666666666667, 0.9428090415820634 ], "replies_avg": [ 10, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16835293067180738408&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 10, "openreview": "https://openreview.net/forum?id=ryxSrhC9KX", "pdf": "https://openreview.net/pdf?id=ryxSrhC9KX", "email": ";;;", "author_num": 4 }, { "id": "ryxY73AcK7", "title": "Sorting out Lipschitz function approximation", "track": "main", "status": "Reject", "tldr": "We identify pathologies in existing activation functions when learning neural networks with Lipschitz constraints and use these insights to design neural networks which are universal Lipschitz function approximators.", "abstract": "Training neural networks subject to a Lipschitz constraint is useful for generalization bounds, provable adversarial robustness, interpretable gradients, and Wasserstein distance estimation. By the composition property of Lipschitz functions, it suffices to ensure that each individual affine transformation or nonlinear activation function is 1-Lipschitz. The challenge is to do this while maintaining the expressive power. We identify a necessary property for such an architecture: each of the layers must preserve the gradient norm during backpropagation. Based on this, we propose to combine a gradient norm preserving activation function, GroupSort, with norm-constrained weight matrices. We show that norm-constrained GroupSort architectures are universal Lipschitz function approximators. Empirically, we show that norm-constrained GroupSort networks achieve tighter estimates of Wasserstein distance than their ReLU counterparts and can achieve provable adversarial robustness guarantees with little cost to accuracy.", "keywords": "deep learning;lipschitz neural networks;generalization;universal approximation;adversarial examples;generative models;optimal transport;adversarial robustness", "primary_area": "", "supplementary_material": "", "author": "Cem Anil;James Lucas;Roger B. Grosse", "authorids": "cem.anil@mail.utoronto.ca;jlucas@cs.toronto.edu;rgrosse@cs.toronto.edu", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nanil2019sorting,\ntitle={Sorting out Lipschitz function approximation},\nauthor={Cem Anil and James Lucas and Roger B. 
Grosse},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxY73AcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryxY73AcK7", "pdf_size": 0, "rating": "4;5;7", "confidence": "4;4;3", "wc_review": "139;400;219", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "99;605;145", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 5.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 252.66666666666666, 109.17977020594164 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 283.0, 228.46152119485387 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.944911182523068, "gs_citation": 406, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6756522330495028804&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6 }, { "id": "ryxaSsActQ", "title": "Dual Skew Divergence Loss for Neural Machine Translation", "track": "main", "status": "Reject", "tldr": "", "abstract": "For neural sequence model training, maximum likelihood (ML) has been commonly adopted to optimize model parameters with respect to the corresponding objective. However, in the case of sequence prediction tasks like neural machine translation (NMT), training with the ML-based cross entropy loss would often lead to models that overgeneralize and plunge into local optima. In this paper, we propose an extended loss function called dual skew divergence (DSD), which aims to give a better tradeoff between generalization ability and error avoidance during NMT training. Our empirical study indicates that switching to DSD loss after the convergence of ML training helps the model skip the local optimum and stimulates a stable performance improvement. 
The evaluations on WMT 2014 English-German and English-French translation tasks demonstrate that the proposed loss indeed helps bring about better translation performance than several baselines.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Yingting Wu;Hai Zhao;Rui Wang", "authorids": "wuyingting@sjtu.edu.cn;zhaohai@cs.sjtu.edu.cn;wangrui.nlp@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nwu2019dual,\ntitle={Dual Skew Divergence Loss for Neural Machine Translation},\nauthor={Yingting Wu and Hai Zhao and Rui Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxaSsActQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryxaSsActQ", "pdf_size": 0, "rating": "3;5;6", "confidence": "4;4;4", "wc_review": "285;420;324", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 1.247219128924647 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 343.0, 56.72741841473134 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:JyxU_b_sUb8J:scholar.google.com/&scioq=Dual+Skew+Divergence+Loss+for+Neural+Machine+Translation&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "ryxeB30cYX", "title": "Stochastic Quantized Activation: To prevent Overfitting in Fast Adversarial Training", "track": "main", "status": "Reject", "tldr": "This paper proposes Stochastic Quantized Activation, which solves overfitting problems in FGSM adversarial training and quickly achieves robustness comparable to multi-step training.", "abstract": "Existing neural networks are vulnerable to \"adversarial examples\"---created by adding maliciously designed small perturbations to inputs to induce a misclassification by the networks. The most investigated defense strategy is adversarial training, which augments training data with adversarial examples. However, applying single-step adversaries in adversarial training does not strengthen the robustness of the networks; instead, it even causes the networks to overfit. In contrast to single-step training, multi-step training achieves state-of-the-art performance on MNIST and CIFAR10, yet it requires a massive amount of time. Therefore, we propose a method, Stochastic Quantized Activation (SQA), that solves overfitting problems in single-step adversarial training and quickly achieves robustness comparable to the multi-step approach. SQA attenuates the adversarial effects by providing random selectivity to activation functions and allows the network to learn robustness with only single-step training. Throughout the experiments, our method demonstrates state-of-the-art robustness against one of the strongest white-box attacks, as PGD training does, but with much less computational cost.
Finally, we visualize the learning process of the network with SQA to handle strong adversaries, which is different from existing methods.", "keywords": "adversarial examples;deep learning", "primary_area": "", "supplementary_material": "", "author": "Wonjun Yoon;Jisuk Park;Daeshik Kim", "authorids": "wonjun.yoon@kaist.ac.kr;ssuk30@kaist.ac.kr;daeshik@kaist.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nyoon2019stochastic,\ntitle={Stochastic Quantized Activation: To prevent Overfitting in Fast Adversarial Training},\nauthor={Wonjun Yoon and Jisuk Park and Daeshik Kim},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxeB30cYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryxeB30cYX", "pdf_size": 0, "rating": "4;4;5", "confidence": "4;3;5", "wc_review": "246;1210;232", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 562.6666666666666, 457.7694713378 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11360133244947998216&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "title": "AntisymmetricRNN: A Dynamical System View on Recurrent Neural Networks", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/696", "id": "ryxepo0cFX", "author_site": "Bo Chang, Minmin Chen, Eldad Haber, Ed H. Chi", "tldr": "", "abstract": "Recurrent neural networks have gained widespread use in modeling sequential data. Learning long-term dependencies using these models remains difficult though, due to exploding or vanishing gradients. In this paper, we draw connections between recurrent networks and ordinary differential equations. A special form of recurrent networks called the AntisymmetricRNN is proposed under this theoretical framework, which is able to capture long-term dependencies thanks to the stability property of its underlying differential equation. Existing approaches to improving RNN trainability often incur significant computation overhead. In comparison, AntisymmetricRNN achieves the same goal by design. We showcase the advantage of this new architecture through extensive simulations and experiments. AntisymmetricRNN exhibits much more predictable dynamics. It outperforms regular LSTM models on tasks requiring long-term memory and matches the performance on tasks where short-term dependencies dominate despite being much simpler.", "keywords": "", "primary_area": "", "supplementary_material": "", "author": "Bo Chang;Minmin Chen;Eldad Haber;Ed H. 
Chi", "authorids": "bchang@stat.ubc.ca;minminc@google.com;haber@math.ubc.ca;edchi@google.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nchang2018antisymmetricrnn,\ntitle={Antisymmetric{RNN}: A Dynamical System View on Recurrent Neural Networks},\nauthor={Bo Chang and Minmin Chen and Eldad Haber and Ed H. Chi},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxepo0cFX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "pdf_size": 0, "rating": "6;7;7", "confidence": "5;5;5", "wc_review": "274;455;430", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "299;329;562", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 6.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 5.0, 0.0 ], "wc_review_avg": [ 386.3333333333333, 80.08467740807573 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 396.6666666666667, 117.54809890234532 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 7, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 284, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5419518435083318687&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=ryxepo0cFX", "pdf": "https://openreview.net/pdf?id=ryxepo0cFX", "email": ";;;", "author_num": 4 }, { "id": "ryxfHnCctX", "title": "A Main/Subsidiary Network Framework for Simplifying Binary Neural Networks", "track": "main", "status": "Withdraw", "tldr": "we define the filter-level pruning problem for binary neural networks for the first time and propose method to solve it.", "abstract": "To reduce memory footprint and run-time latency, techniques such as neural net-work pruning and binarization have been explored separately. However, it is un-clear how to combine the best of the two worlds to get extremely small and efficient models. In this paper, we, for the first time, define the filter-level pruning problem for binary neural networks, which cannot be solved by simply migrating existing structural pruning methods for full-precision models. A novel learning-based approach is proposed to prune filters in our main/subsidiary network frame-work, where the main network is responsible for learning representative features to optimize the prediction performance, and the subsidiary component works as a filter selector on the main network. To avoid gradient mismatch when training the subsidiary component, we propose a layer-wise and bottom-up scheme. We also provide the theoretical and experimental comparison between our learning-based and greedy rule-based methods. Finally, we empirically demonstrate the effectiveness of our approach applied on several binary models, including binarizedNIN, VGG-11, and ResNet-18, on various image classification datasets. 
For binary ResNet-18 on ImageNet, we use 78.6% of the filters but achieve a slightly better test error of 49.87% (50.02%-0.15%) than the original model.", "keywords": "efficient machine learning;binary neural network", "primary_area": "", "supplementary_material": "", "author": "Yinghao Xu;Xin Dong;Yudian Li;Hao Su", "authorids": "justimyhxu@zju.edu.cn;xindong@g.harvard.edu;daniellee2519@gmail.com;haosu@eng.ucsd.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2", "site": "https://openreview.net/forum?id=ryxfHnCctX", "pdf_size": 0, "rating": "5", "confidence": "4", "wc_review": "276", "wc_reply_reviewers": "0", "wc_reply_authors": "0", "reply_reviewers": "0", "reply_authors": "0", "rating_avg": [ 5.0, 0.0 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 276.0, 0.0 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 2, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 42, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2514471162991614370&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7 }, { "id": "ryxhB3CcK7", "title": "Probabilistic Neural-Symbolic Models for Interpretable Visual Question Answering", "track": "main", "status": "Reject", "tldr": "A probabilistic neural symbolic model with a latent program space, for more interpretable question answering", "abstract": "We propose a new class of probabilistic neural-symbolic models for visual question answering (VQA) that provide interpretable explanations of their decision making in the form of programs, given a small annotated set of human programs. The key idea of our approach is to learn a rich latent space which effectively propagates program annotations from known questions to novel questions. We do this by formalizing prior work on VQA, called module networks (Andreas, 2016) as discrete, structured, latent variable models on the joint distribution over questions and answers given images, and devise a procedure to train the model effectively. Our results on a dataset of compositional questions about SHAPES (Andreas, 2016) show that our model generates more interpretable programs and obtains better accuracy on VQA in the low-data regime than prior work.
", "keywords": "Neural-symbolic models;visual question answering;reasoning;interpretability;graphical models;variational inference", "primary_area": "", "supplementary_material": "", "author": "Ramakrishna Vedantam;Stefan Lee;Marcus Rohrbach;Dhruv Batra;Devi Parikh", "authorids": "vrama@gatech.edu;steflee@gatech.edu;maroffm@gmail.com;dbatra@gatech.edu;parikh@gatech.edu", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nvedantam2019probabilistic,\ntitle={Probabilistic Neural-Symbolic Models for Interpretable Visual Question Answering},\nauthor={Ramakrishna Vedantam and Stefan Lee and Marcus Rohrbach and Dhruv Batra and Devi Parikh},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxhB3CcK7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryxhB3CcK7", "pdf_size": 0, "rating": "6;7;8", "confidence": "3;3;5", "wc_review": "128;320;89", "wc_reply_reviewers": "1675;0;0", "wc_reply_authors": "1624;0;0", "reply_reviewers": "4;0;0", "reply_authors": "3;0;0", "rating_avg": [ 7.0, 0.816496580927726 ], "confidence_avg": [ 3.6666666666666665, 0.9428090415820634 ], "wc_review_avg": [ 179.0, 100.96534058774823 ], "wc_reply_reviewers_avg": [ 558.3333333333334, 789.602572324978 ], "wc_reply_authors_avg": [ 541.3333333333334, 765.5609417646353 ], "reply_reviewers_avg": [ 1.3333333333333333, 1.8856180831641267 ], "reply_authors_avg": [ 1.0, 1.4142135623730951 ], "replies_avg": [ 22, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.8660254037844387, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12432634378302033141&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 6 }, { "id": "ryxhynC9KX", "title": "CNNSAT: Fast, Accurate Boolean Satisfiability using Convolutional Neural Networks", "track": "main", "status": "Reject", "tldr": "We introduce CNNSAT, a fast and accurate statistical decision procedure for SAT based on convolutional neural networks.", "abstract": "Boolean satisfiability (SAT) is one of the most well-known NP-complete\nproblems and has been extensively studied. State-of-the-art solvers\nexist and have found a wide range of applications. However, they still\ndo not scale well to formulas with hundreds of variables. To tackle\nthis fundamental scalability challenge, we introduce CNNSAT, a fast\nand accurate statistical decision procedure for SAT based on\nconvolutional neural networks. CNNSAT's effectiveness is due to a\nprecise and compact representation of Boolean\nformulas. On both real and synthetic formulas, CNNSAT is highly\n accurate and orders of magnitude faster than the\nstate-of-the-art solver Z3. 
We also describe how to extend CNNSAT to\npredict satisfying assignments when it predicts a formula to be\nsatisfiable.", "keywords": "Convolutional Neural Networks;Boolean satisfiability problem;Satisfiability modulo theories", "primary_area": "", "supplementary_material": "", "author": "Yu Wang;Fengjuan Gao;Amin Alipour;Linzhang Wang;Xuandong Li;Zhendong Su", "authorids": "yuwang@seg.nju.edu.cn;fjgao@seg.nju.edu.cn;alipour@cs.uh.edu;lzwang@nju.edu.cn;lxd@nju.edu.cn;zhendong.su@inf.ethz.ch", "gender": ";;;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": ";;;;;", "aff": ";;;;;", "aff_domain": ";;;;;", "position": ";;;;;", "bibtex": "@misc{\nwang2019cnnsat,\ntitle={{CNNSAT}: Fast, Accurate Boolean Satisfiability using Convolutional Neural Networks},\nauthor={Yu Wang and Fengjuan Gao and Amin Alipour and Linzhang Wang and Xuandong Li and Zhendong Su},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxhynC9KX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryxhynC9KX", "pdf_size": 0, "rating": "5;5;6", "confidence": "4;4;2", "wc_review": "748;651;823", "wc_reply_reviewers": "926;1329;32", "wc_reply_authors": "2121;1499;482", "reply_reviewers": "2;3;1", "reply_authors": "6;3;3", "rating_avg": [ 5.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 740.6666666666666, 70.40991091853161 ], "wc_reply_reviewers_avg": [ 762.3333333333334, 541.9977449727587 ], "wc_reply_authors_avg": [ 1367.3333333333333, 675.5651132364831 ], "reply_reviewers_avg": [ 2.0, 0.816496580927726 ], "reply_authors_avg": [ 4.0, 1.4142135623730951 ], "replies_avg": [ 25, 0 ], "authors#_avg": [ 6, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1078866865895793840&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "id": "ryxjH3R5KQ", "title": "Single Shot Neural Architecture Search Via Direct Sparse Optimization", "track": "main", "status": "Reject", "tldr": "single shot neural architecture search via direct sparse optimization", "abstract": "Recently Neural Architecture Search (NAS) has aroused great interest in both academia and industry, however it remains challenging because of its huge and non-continuous search space. Instead of applying evolutionary algorithm or reinforcement learning as previous works, this paper proposes a Direct Sparse Optimization NAS (DSO-NAS) method. In DSO-NAS, we provide a novel model pruning view to NAS problem. In specific, we start from a completely connected block, and then introduce scaling factors to scale the information flow between operations. Next, we impose sparse regularizations to prune useless connections in the architecture. Lastly, we derive an efficient and theoretically sound optimization method to solve it. Our method enjoys both advantages of differentiability and efficiency, therefore can be directly applied to large datasets like ImageNet. 
In particular, on the CIFAR-10 dataset, DSO-NAS achieves an average test error of 2.84%, while on the ImageNet dataset DSO-NAS achieves a 25.4% test error under 600M FLOPs with 8 GPUs in 18 hours.", "keywords": "Neural Architecture Search;Sparse Optimization", "primary_area": "", "supplementary_material": "", "author": "Xinbang Zhang;Zehao Huang;Naiyan Wang", "authorids": "xinbang.zhang@nlpr.ia.ac.cn;zehaohuang18@gmail.com;winsty@gmail.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@misc{\nzhang2019single,\ntitle={Single Shot Neural Architecture Search Via Direct Sparse Optimization},\nauthor={Xinbang Zhang and Zehao Huang and Naiyan Wang},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxjH3R5KQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer2;AnonReviewer1", "site": "https://openreview.net/forum?id=ryxjH3R5KQ", "pdf_size": 0, "rating": "6;6;7", "confidence": "3;4;3", "wc_review": "201;547;254", "wc_reply_reviewers": "0;0;84", "wc_reply_authors": "349;483;385", "reply_reviewers": "0;0;1", "reply_authors": "1;1;1", "rating_avg": [ 6.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 334.0, 152.16000350508233 ], "wc_reply_reviewers_avg": [ 28.0, 39.59797974644666 ], "wc_reply_authors_avg": [ 405.6666666666667, 56.62351298022953 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 23, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1 }, { "id": "ryxnDoRcK7", "title": "Estimating Heterogeneous Treatment Effects Using Neural Networks With The Y-Learner", "track": "main", "status": "Withdraw", "tldr": "We develop a CATE estimation strategy that takes advantage of some of the intriguing properties of neural networks.", "abstract": "We develop the Y-learner for estimating heterogeneous treatment effects in experimental and observational studies. The Y-learner is designed to leverage the abilities of neural networks to optimize multiple objectives and continually update, which allows for better pooling of underlying feature information between treatment and control groups. We evaluate the Y-learner on three test problems: (1) a set of six simulated data benchmarks from the literature; (2) a real-world large-scale experiment on voter persuasion; and (3) a task from the literature that estimates artificially generated treatment effects on MNIST digits. The Y-learner achieves state-of-the-art results on two of the three tasks. On the MNIST task, it achieves the second-best results.", "keywords": "causal inference;CATE estimation;ITE;deep learning", "primary_area": "", "supplementary_material": "", "author": "Bradly C. Stadie;S\u00f6ren R. K\u00fcnzel;Nikita Vemuri;Jasjeet S.
Sekhon", "authorids": "bstadie@berkeley.edu;srk@berkeley.edu;nikitavemuri@berkeley.edu;sekhon@berkeley.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer1;AnonReviewer3", "site": "https://openreview.net/forum?id=ryxnDoRcK7", "pdf_size": 0, "rating": "4;5;5", "confidence": "4;4;3", "wc_review": "514;325;162", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 3.6666666666666665, 0.4714045207910317 ], "wc_review_avg": [ 333.6666666666667, 143.83400926840017 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.4999999999999999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14512782053631475129&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "Global-to-local Memory Pointer Networks for Task-Oriented Dialogue", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/690", "id": "ryxnHhRqFm", "author_site": "Chien-Sheng Wu, richard socher, Caiming Xiong", "tldr": "GLMP: Global memory encoder (context RNN, global pointer) and local memory decoder (sketch RNN, local pointer) that share external knowledge (MemNN) are proposed to strengthen response generation in task-oriented dialogue.", "abstract": "End-to-end task-oriented dialogue is challenging since knowledge bases are usually large, dynamic and hard to incorporate into a learning framework. We propose the global-to-local memory pointer (GLMP) networks to address this issue. In our model, a global memory encoder and a local memory decoder are proposed to share external knowledge. The encoder encodes dialogue history, modifies global contextual representation, and generates a global memory pointer. The decoder first generates a sketch response with unfilled slots. Next, it passes the global memory pointer to filter the external knowledge for relevant information, then instantiates the slots via the local memory pointers. We empirically show that our model can improve copy accuracy and mitigate the common out-of-vocabulary problem. 
As a result, GLMP is able to improve over the previous state-of-the-art models in both simulated bAbI Dialogue dataset and human-human Stanford Multi-domain Dialogue dataset on automatic and human evaluation.", "keywords": "pointer networks;memory networks;task-oriented dialogue systems;natural language processing", "primary_area": "", "supplementary_material": "", "author": "Chien-Sheng Wu;Richard Socher;Caiming Xiong", "authorids": "jason.wu@connect.ust.hk;rsocher@salesforce.com;cxiong@salesforce.com", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nwu2018globaltolocal,\ntitle={Global-to-local Memory Pointer Networks for Task-Oriented Dialogue},\nauthor={Chien-Sheng Wu and Richard Socher and Caiming Xiong},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxnHhRqFm},\n}", "github": "[![github](/images/github_icon.svg) jasonwu0731/GLMP](https://github.com/jasonwu0731/GLMP) + [![Papers with Code](/images/pwc_icon.svg) 3 community implementations](https://paperswithcode.com/paper/?openreview=ryxnHhRqFm)", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "pdf_size": 0, "rating": "5;8;8", "confidence": "3;2;2", "wc_review": "127;343;447", "wc_reply_reviewers": "0;77;0", "wc_reply_authors": "413;907;214", "reply_reviewers": "0;2;0", "reply_authors": "1;2;1", "rating_avg": [ 7.0, 1.4142135623730951 ], "confidence_avg": [ 2.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 305.6666666666667, 133.27998932906453 ], "wc_reply_reviewers_avg": [ 25.666666666666668, 36.29814810090944 ], "wc_reply_authors_avg": [ 511.3333333333333, 291.3352402683586 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.9428090415820634 ], "reply_authors_avg": [ 1.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": -1.0, "gs_citation": 196, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8042905846859720405&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "openreview": "https://openreview.net/forum?id=ryxnHhRqFm", "pdf": "https://openreview.net/pdf?id=ryxnHhRqFm", "email": ";;", "author_num": 3 }, { "id": "ryxsCiAqKm", "title": "Spectral Convolutional Networks on Hierarchical Multigraphs", "track": "main", "status": "Withdraw", "tldr": "A novel approach to graph classification based on spectral graph convolutional networks and its extension to multigraphs with learnable relations and hierarchical structure. We show state-of-the art results on chemical, social and image datasets.", "abstract": "Spectral Graph Convolutional Networks (GCNs) are a generalization of convolutional networks to learning on graph-structured data. Applications of spectral GCNs have been successful, but limited to a few problems where the graph is fixed, such as shape correspondence and node classification. In this work, we address this limitation by revisiting a particular family of spectral graph networks, Chebyshev GCNs, showing its efficacy in solving graph classification tasks with a variable graph structure and size. Current GCNs also restrict graphs to have at most one edge between any pair of nodes. To this end, we propose a novel multigraph network that learns from multi-relational graphs. 
We explicitly model different types of edges: annotated edges, learned edges with abstract meaning, and hierarchical edges. We also experiment with different ways to fuse the representations extracted from different edge types. This restriction is sometimes implied from a dataset, however, we relax this restriction for all kinds of datasets. We achieve state-of-the-art results on a variety of chemical, social, and vision graph classification benchmarks.", "keywords": "graph convolution;hierarchical models;neural networks;multigraph;deep learning", "primary_area": "", "supplementary_material": "", "author": "Boris Knyazev;Xiao Lin;Mohamed R. Amer;Graham W. Taylor", "authorids": "bknyazev@uoguelph.ca;xiao.lin@sri.com;mohamed.amer@sri.com;gwtaylor@uoguelph.ca", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryxsCiAqKm", "pdf_size": 0, "rating": "3;4;4", "confidence": "5;4;4", "wc_review": "1312;241;348", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 3.6666666666666665, 0.4714045207910317 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 633.6666666666666, 481.63909955714996 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:r4r1x6oAnssJ:scholar.google.com/&scioq=Spectral+Convolutional+Networks+on+Hierarchical+Multigraphs&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "ryxsS3A5Km", "title": "Continual Learning via Explicit Structure Learning", "track": "main", "status": "Reject", "tldr": "", "abstract": "Despite recent advances in deep learning, neural networks suffer catastrophic forgetting when tasks are learned sequentially. We propose a conceptually simple and general framework for continual learning, where structure optimization is considered explicitly during learning. We implement this idea by separating the structure and parameter learning. During structure learning, the model optimizes for the best structure for the current task. The model learns when to reuse or modify structure from previous tasks, or create new ones when necessary. The model parameters are then estimated with the optimal structure. Empirically, we found that our approach leads to sensible structures when learning multiple tasks continuously. Additionally, catastrophic forgetting is also largely alleviated from explicit learning of structures. 
Our method also outperforms all other baselines on the permuted MNIST and split CIFAR datasets in continual learning setting.", "keywords": "continuous learning;catastrophic forgetting;architecture learning", "primary_area": "", "supplementary_material": "", "author": "Xilai Li;Yingbo Zhou;Tianfu Wu;Richard Socher;Caiming Xiong", "authorids": "xli47@ncsu.edu;yingbo.zhou@salesforce.com;tianfu_wu@ncsu.edu;rsocher@salesforce.com;cxiong@salesforce.com", "gender": ";;;;", "homepage": ";;;;", "dblp": ";;;;", "google_scholar": ";;;;", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": ";;;;", "aff": ";;;;", "aff_domain": ";;;;", "position": ";;;;", "bibtex": "@misc{\nli2019continual,\ntitle={Continual Learning via Explicit Structure Learning},\nauthor={Xilai Li and Yingbo Zhou and Tianfu Wu and Richard Socher and Caiming Xiong},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxsS3A5Km},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryxsS3A5Km", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;5;4", "wc_review": "538;308;261", "wc_reply_reviewers": "182;0;0", "wc_reply_authors": "516;387;135", "reply_reviewers": "1;0;0", "reply_authors": "2;1;2", "rating_avg": [ 4.0, 0.0 ], "confidence_avg": [ 4.333333333333333, 0.4714045207910317 ], "wc_review_avg": [ 369.0, 121.03167629454144 ], "wc_reply_reviewers_avg": [ 60.666666666666664, 85.79562278396777 ], "wc_reply_authors_avg": [ 346.0, 158.22136391777187 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 1.6666666666666667, 0.4714045207910317 ], "replies_avg": [ 12, 0 ], "authors#_avg": [ 5, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:HvnHxmnZFV8J:scholar.google.com/&scioq=Continual+Learning+via+Explicit+Structure+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0 }, { "id": "ryxtE3C5Fm", "title": "From Adversarial Training to Generative Adversarial Networks", "track": "main", "status": "Withdraw", "tldr": "We found adversarial training not only speeds up the GAN training but also increases the image quality", "abstract": "In this paper, we are interested in two seemingly different concepts: \\textit{adversarial training} and \\textit{generative adversarial networks (GANs)}. Particularly, how these techniques work to improve each other. To this end, we analyze the limitation of adversarial training as a defense method, starting from questioning how well the robustness of a model can generalize. Then, we successfully improve the generalizability via data augmentation by the ``fake'' images sampled from generative adversarial network. After that, we are surprised to see that the resulting robust classifier leads to a better generator, for free. We intuitively explain this interesting phenomenon and leave the theoretical analysis for future work.\nMotivated by these observations, we propose a system that combines generator, discriminator, and adversarial attacker together in a single network. After end-to-end training and fine tuning, our method can simultaneously improve the robustness of classifiers, measured by accuracy under strong adversarial attacks, and the quality of generators, evaluated both aesthetically and quantitatively. 
In terms of the classifier, we achieve better robustness than the state-of-the-art adversarial training algorithm proposed in (Madry \\textit{et al.}, 2017), while our generator achieves competitive performance compared with SN-GAN (Miyato and Koyama, 2018).", "keywords": "adversarial training;conditional GAN", "primary_area": "", "supplementary_material": "", "author": "Xuanqing Liu;Cho-Jui Hsieh", "authorids": "xqliu@cs.ucla.edu;chohsieh@cs.ucla.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer3;AnonReviewer2", "site": "https://openreview.net/forum?id=ryxtE3C5Fm", "pdf_size": 0, "rating": "3;4;6", "confidence": "3;4;3", "wc_review": "298;121;285", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;484;0", "reply_reviewers": "0;0;0", "reply_authors": "0;1;0", "rating_avg": [ 4.333333333333333, 1.247219128924647 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 234.66666666666666, 80.54950168823034 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 161.33333333333334, 228.15978806285932 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0.3333333333333333, 0.4714045207910317 ], "replies_avg": [ 5, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.18898223650461363, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17996034955538414586&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 }, { "title": "InstaGAN: Instance-aware Image-to-Image Translation", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/742", "id": "ryxwJhC9YX", "author_site": "Sangwoo Mo, Minsu Cho, Jinwoo Shin", "tldr": "We propose a novel method to incorporate the set of instance attributes for image-to-image translation.", "abstract": "Unsupervised image-to-image translation has gained considerable attention due to the recent impressive progress based on generative adversarial networks (GANs). However, previous methods often fail in challenging cases, in particular, when an image has multiple target instances and a translation task involves significant changes in shape, e.g., translating pants to skirts in fashion images. To tackle the issues, we propose a novel method, coined instance-aware GAN (InstaGAN), that incorporates the instance information (e.g., object segmentation masks) and improves multi-instance transfiguration. The proposed method translates both an image and the corresponding set of instance attributes while maintaining the permutation invariance property of the instances. To this end, we introduce a context preserving loss that encourages the network to learn the identity function outside of target instances. We also propose a sequential mini-batch inference/training technique that handles multiple instances with a limited GPU memory and enhances the network to generalize better for multiple instances. Our comparative evaluation demonstrates the effectiveness of the proposed method on different image datasets, in particular, in the aforementioned challenging cases. 
Code and results are available in https://github.com/sangwoomo/instagan", "keywords": "Image-to-Image Translation;Generative Adversarial Networks", "primary_area": "", "supplementary_material": "", "author": "Sangwoo Mo;Minsu Cho;Jinwoo Shin", "authorids": "swmo@kaist.ac.kr;mscho@postech.ac.kr;jinwoos@kaist.ac.kr", "gender": ";;", "homepage": ";;", "dblp": ";;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": ";;", "aff": ";;", "aff_domain": ";;", "position": ";;", "bibtex": "@inproceedings{\nmo2018instanceaware,\ntitle={Instance-aware Image-to-Image Translation},\nauthor={Sangwoo Mo and Minsu Cho and Jinwoo Shin},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxwJhC9YX},\n}", "github": "[![github](/images/github_icon.svg) sangwoomo/instagan](https://github.com/sangwoomo/instagan)", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;4;5", "wc_review": "240;531;544", "wc_reply_reviewers": "38;9;0", "wc_reply_authors": "254;418;413", "reply_reviewers": "1;1;0", "reply_authors": "1;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 4.666666666666667, 0.4714045207910317 ], "wc_review_avg": [ 438.3333333333333, 140.34323005482744 ], "wc_reply_reviewers_avg": [ 15.666666666666666, 16.21384867602041 ], "wc_reply_authors_avg": [ 361.6666666666667, 76.15919000502974 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 3, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 215, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14041898124180765737&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "openreview": "https://openreview.net/forum?id=ryxwJhC9YX", "pdf": "https://openreview.net/pdf?id=ryxwJhC9YX", "email": ";;", "author_num": 3 }, { "title": "Deep Layers as Stochastic Solvers", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/978", "id": "ryxxCiRqYX", "author_site": "Adel Bibi, Bernard Ghanem, Vladlen Koltun, Rene Ranftl", "tldr": "A framework that links deep network layers to stochastic optimization algorithms; can be used to improve model accuracy and inform network design.", "abstract": "We provide a novel perspective on the forward pass through a block of layers in a deep network. In particular, we show that a forward pass through a standard dropout layer followed by a linear layer and a non-linear activation is equivalent to optimizing a convex objective with a single iteration of a $\\tau$-nice Proximal Stochastic Gradient method. We further show that replacing standard Bernoulli dropout with additive dropout is equivalent to optimizing the same convex objective with a variance-reduced proximal method. By expressing both fully-connected and convolutional layers as special cases of a high-order tensor product, we unify the underlying convex optimization problem in the tensor setting and derive a formula for the Lipschitz constant $L$ used to determine the optimal step size of the above proximal methods. 
We conduct experiments with standard convolutional networks applied to the CIFAR-10 and CIFAR-100 datasets and show that replacing a block of layers with multiple iterations of the corresponding solver, with step size set via $L$, consistently improves classification accuracy.", "keywords": "deep networks;optimization", "primary_area": "", "supplementary_material": "", "author": "Adel Bibi;Bernard Ghanem;Vladlen Koltun;Rene Ranftl", "authorids": "adel.bibi@kaust.edu.sa;bernard.ghanem@kaust.edu.sa;vkoltun@gmail.com;ranftlr@gmail.com", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nbibi2018deep,\ntitle={Deep Layers as Stochastic Solvers},\nauthor={Adel Bibi and Bernard Ghanem and Vladlen Koltun and Rene Ranftl},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxxCiRqYX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "pdf_size": 0, "rating": "7;7;8", "confidence": "5;4;1", "wc_review": "548;334;222", "wc_reply_reviewers": "18;0;0", "wc_reply_authors": "1618;501;255", "reply_reviewers": "1;0;0", "reply_authors": "4;1;1", "rating_avg": [ 7.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 1.699673171197595 ], "wc_review_avg": [ 368.0, 135.2429911923966 ], "wc_reply_reviewers_avg": [ 6.0, 8.48528137423857 ], "wc_reply_authors_avg": [ 791.3333333333334, 593.1061362765428 ], "reply_reviewers_avg": [ 0.3333333333333333, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 1.4142135623730951 ], "replies_avg": [ 14, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": -0.9707253433941506, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14411629229022892325&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "openreview": "https://openreview.net/forum?id=ryxxCiRqYX", "pdf": "https://openreview.net/pdf?id=ryxxCiRqYX", "email": ";;;", "author_num": 4 }, { "id": "ryxyHnR5tX", "title": "Accelerated Sparse Recovery Under Structured Measurements", "track": "main", "status": "Reject", "tldr": "", "abstract": "Extensive work on compressed sensing has yielded a rich collection of sparse recovery algorithms, each making different tradeoffs between recovery condition and computational efficiency. In this paper, we propose a unified framework for accelerating various existing sparse recovery algorithms without sacrificing recovery guarantees by exploiting structure in the measurement matrix. Unlike fast algorithms that are specific to particular choices of measurement matrices where the columns are Fourier or wavelet filters for example, the proposed approach works on a broad range of measurement matrices that satisfy a particular property. We precisely characterize this property, which quantifies how easy it is to accelerate sparse recovery for the measurement matrix in question. We also derive the time complexity of the accelerated algorithm, which is sublinear in the signal length in each iteration. Moreover, we present experimental results on real world data that demonstrate the effectiveness of the proposed approach in practice. 
", "keywords": "sparse recovery", "primary_area": "", "supplementary_material": "", "author": "Ke Li;Jitendra Malik", "authorids": "ke.li@eecs.berkeley.edu;malik@eecs.berkeley.edu", "gender": ";", "homepage": ";", "dblp": ";", "google_scholar": ";", "orcid": ";", "linkedin": ";", "or_profile": ";", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@misc{\nli2019accelerated,\ntitle={Accelerated Sparse Recovery Under Structured Measurements},\nauthor={Ke Li and Jitendra Malik},\nyear={2019},\nurl={https://openreview.net/forum?id=ryxyHnR5tX},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "site": "https://openreview.net/forum?id=ryxyHnR5tX", "pdf_size": 0, "rating": "4;5;5", "confidence": "5;4;3", "wc_review": "280;297;170", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 4.666666666666667, 0.4714045207910317 ], "confidence_avg": [ 4.0, 0.816496580927726 ], "wc_review_avg": [ 249.0, 56.290911048469155 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 4, 0 ], "authors#_avg": [ 2, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:zBH0QeicAZEJ:scholar.google.com/&scioq=Accelerated+Sparse+Recovery+Under+Structured+Measurements&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "title": "Learning Multi-Level Hierarchies with Hindsight", "status": "Poster", "track": "main", "site": "https://iclr.cc/virtual/2019/poster/913", "id": "ryzECoAcY7", "author_site": "Andrew Levy, George D Konidaris, Robert Platt, Kate Saenko", "tldr": "We introduce the first Hierarchical RL approach to successfully learn 3-level hierarchies in parallel in tasks with continuous state and action spaces.", "abstract": "Hierarchical agents have the potential to solve sequential decision making tasks with greater sample efficiency than their non-hierarchical counterparts because hierarchical agents can break down tasks into sets of subtasks that only require short sequences of decisions. In order to realize this potential of faster learning, hierarchical agents need to be able to learn their multiple levels of policies in parallel so these simpler subproblems can be solved simultaneously. Yet, learning multiple levels of policies in parallel is hard because it is inherently unstable: changes in a policy at one level of the hierarchy may cause changes in the transition and reward functions at higher levels in the hierarchy, making it difficult to jointly learn multiple levels of policies. In this paper, we introduce a new Hierarchical Reinforcement Learning (HRL) framework, Hierarchical Actor-Critic (HAC), that can overcome the instability issues that arise when agents try to jointly learn multiple levels of policies. The main idea behind HAC is to train each level of the hierarchy independently of the lower levels by training each level as if the lower level policies are already optimal. We demonstrate experimentally in both grid world and simulated robotics domains that our approach can significantly accelerate learning relative to other non-hierarchical and hierarchical methods. 
Indeed, our framework is the first to successfully learn 3-level hierarchies in parallel in tasks with continuous state and action spaces.", "keywords": "Hierarchical Reinforcement Learning;Reinforcement Learning;Deep Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Andrew Levy;George Konidaris;Robert Platt;Kate Saenko", "authorids": "andrew_levy2@brown.edu;gdk@cs.brown.edu;saenko@bu.edu;rplatt@ccs.neu.edu", "gender": ";;;", "homepage": ";;;", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": ";;;", "or_profile": ";;;", "aff": ";;;", "aff_domain": ";;;", "position": ";;;", "bibtex": "@inproceedings{\nlevy2018hierarchical,\ntitle={Hierarchical Reinforcement Learning with Hindsight},\nauthor={Andrew Levy and Robert Platt and Kate Saenko},\nbooktitle={International Conference on Learning Representations},\nyear={2019},\nurl={https://openreview.net/forum?id=ryzECoAcY7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer1;AnonReviewer2;AnonReviewer3", "pdf_size": 0, "rating": "5;6;7", "confidence": "4;4;4", "wc_review": "503;180;420", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "0;0;0", "reply_reviewers": "0;0;0", "reply_authors": "0;0;0", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.0, 0.0 ], "wc_review_avg": [ 367.6666666666667, 136.958225585598 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 0, 0 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 0, 0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 4, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 368, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11558193958091287134&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 11, "openreview": "https://openreview.net/forum?id=ryzECoAcY7", "pdf": "https://openreview.net/pdf?id=ryzECoAcY7", "email": ";;;", "author_num": 4 }, { "id": "ryzHXnR5Y7", "title": "Select Via Proxy: Efficient Data Selection For Training Deep Networks", "track": "main", "status": "Reject", "tldr": "we develop an efficient method for selecting training data to quickly and efficiently learn large machine learning models.", "abstract": "At internet scale, applications collect a tremendous amount of data by logging user events, analyzing text, and collecting images. This data powers a variety of machine learning models for tasks such as image classification, language modeling, content recommendation, and advertising. However, training large models over all available data can be computationally expensive, creating a bottleneck in the development of new machine learning models. In this work, we develop a novel approach to efficiently select a subset of training data to achieve faster training with no loss in model predictive performance. In our approach, we first train a small proxy model quickly, which we then use to estimate the utility of individual training data points, and then select the most informative ones for training the large target model. 
Extensive experiments show that our approach leads to a 1.6x and 1.8x speed-up on CIFAR10 and SVHN by selecting 60% and 50% subsets of the data, while maintaining the predictive performance of the model trained on the entire dataset.", "keywords": "data selection;deep learning;uncertainty sampling", "primary_area": "", "supplementary_material": "", "author": "Cody Coleman;Stephen Mussmann;Baharan Mirzasoleiman;Peter Bailis;Percy Liang;Jure Leskovec;Matei Zaharia", "authorids": "cody@cs.stanford.edu;mussmann@stanford.edu;baharanm@stanford.edu;pbailis@stanford.edu;pliang@cs.stanford.edu;jure@cs.stanford.edu;mzaharia@stanford.edu", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\ncoleman2019select,\ntitle={Select Via Proxy: Efficient Data Selection For Training Deep Networks},\nauthor={Cody Coleman and Stephen Mussmann and Baharan Mirzasoleiman and Peter Bailis and Percy Liang and Jure Leskovec and Matei Zaharia},\nyear={2019},\nurl={https://openreview.net/forum?id=ryzHXnR5Y7},\n}", "github": "", "project": "", "reviewers": "AnonReviewer2;AnonReviewer3;AnonReviewer1", "site": "https://openreview.net/forum?id=ryzHXnR5Y7", "pdf_size": 0, "rating": "4;4;5", "confidence": "2;4;4", "wc_review": "537;388;335", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "353;375;332", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.333333333333333, 0.4714045207910317 ], "confidence_avg": [ 3.3333333333333335, 0.9428090415820634 ], "wc_review_avg": [ 420.0, 85.514131385793 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 353.3333333333333, 17.55625877635159 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 11, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.4999999999999999, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16664380975093473312&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2 }, { "id": "ryza73R9tQ", "title": "Machine Translation With Weakly Paired Bilingual Documents", "track": "main", "status": "Reject", "tldr": "", "abstract": "Neural machine translation, which achieves near human-level performance in some languages, strongly relies on the availability of large amounts of parallel sentences, which hinders its applicability to low-resource language pairs. Recent works explore the possibility of unsupervised machine translation with monolingual data only, leading to much lower accuracy compared with the supervised one. Observing that weakly paired bilingual documents are much easier to collect than bilingual sentences, e.g., from Wikipedia, news websites or books, in this paper, we investigate the training of translation models with weakly paired bilingual documents. Our approach contains two components/steps. First, we provide a simple approach to mine implicitly bilingual sentence pairs from document pairs which can then be used as supervised signals for training. Second, we leverage the topic consistency of two weakly paired documents and learn the sentence-to-sentence translation by constraining the word distribution-level alignments. 
We evaluate our proposed method on weakly paired documents from Wikipedia on four tasks, the widely used WMT16 German$\\leftrightarrow$English and WMT13 Spanish$\\leftrightarrow$English tasks, and obtain $24.1$/$30.3$ and $28.0$/$27.6$ BLEU points, respectively, outperforming state-of-the-art unsupervised results by more than 5 BLEU points and reducing the gap between unsupervised and supervised translation by up to 50\\%.", "keywords": "Natural Language Processing;Machine Translation;Unsupervised Learning", "primary_area": "", "supplementary_material": "", "author": "Lijun Wu;Jinhua Zhu;Di He;Fei Gao;Xu Tan;Tao Qin;Tie-Yan Liu", "authorids": "wulijun3@mail2.sysu.edu.cn;teslazhu@mail.ustc.edu.cn;di_he@pku.edu.cn;feiga@microsoft.com;xuta@microsoft.com;taoqin@microsoft.com;tyliu@microsoft.com", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nwu2019machine,\ntitle={Machine Translation With Weakly Paired Bilingual Documents},\nauthor={Lijun Wu and Jinhua Zhu and Di He and Fei Gao and Xu Tan and Tao Qin and Tie-Yan Liu},\nyear={2019},\nurl={https://openreview.net/forum?id=ryza73R9tQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryza73R9tQ", "pdf_size": 0, "rating": "5;6;7", "confidence": "5;3;5", "wc_review": "181;363;274", "wc_reply_reviewers": "0;49;246", "wc_reply_authors": "1228;775;872", "reply_reviewers": "0;1;1", "reply_authors": "2;2;2", "rating_avg": [ 6.0, 0.816496580927726 ], "confidence_avg": [ 4.333333333333333, 0.9428090415820634 ], "wc_review_avg": [ 272.6666666666667, 74.3071702835975 ], "wc_reply_reviewers_avg": [ 98.33333333333333, 106.31504544931019 ], "wc_reply_authors_avg": [ 958.3333333333334, 194.75169375957228 ], "reply_reviewers_avg": [ 0.6666666666666666, 0.4714045207910317 ], "reply_authors_avg": [ 2.0, 0.0 ], "replies_avg": [ 19, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=221680362095162698&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0 }, { "id": "ryzfcoR5YQ", "title": "Layerwise Recurrent Autoencoder for General Real-world Traffic Flow Forecasting", "track": "main", "status": "Reject", "tldr": "We propose the Layerwise Recurrent Autoencoder with effective spatiotemporal dependency modeling for general traffic flow forecasting.", "abstract": "Accurate spatio-temporal traffic forecasting is a fundamental task with wide applications in city management, transportation, and finance. Many factors make this significant task challenging: (1) the maze-like road network makes the spatial dependencies complex; (2) the traffic-time relationships introduce non-linear temporal complications; (3) the difficulty of flow forecasting grows with the size of the road network. Prevalent and state-of-the-art methods have mainly been evaluated on datasets covering relatively small districts and short time spans, e.g., data collected within a single city over a few months.
To forecast traffic flow across a wide area and overcome the above challenges, we propose a forecasting model called the Layerwise Recurrent Autoencoder (LRA), in which a three-layer stacked autoencoder (SAE) architecture is used to capture temporal traffic correlations and a recurrent neural network (RNN) is used for prediction. A convolutional neural network (CNN) is also employed to extract spatial traffic information from the transport topology for more accurate prediction. To the best of our knowledge, there is no general and effective method for traffic flow prediction over a large area covering a group of cities. Experiments on such a large-scale real-world traffic dataset demonstrate the superiority of our model, and a smaller dataset is used to show its universality. Evaluations show that our model outperforms the state-of-the-art baselines by 6%-15%.", "keywords": "traffic flow forecasting;spatiotemporal dependencies;deep learning;intelligent transportation system", "primary_area": "", "supplementary_material": "", "author": "Peize Zhao;Danfeng Cai;Shaokun Zhang;Feng Chen;Zhemin Zhang;Cheng Wang;Jonathan Li", "authorids": "zhaopeize@sensetime.com;caidanfeng@sensetime.com;zhangshaokun@sensetime.com;chenfeng@xmu.edu.cn;zhangzhemin@xmu.edu.cn;cwang@xmu.edu.cn;junli@xmu.edu.cn", "gender": ";;;;;;", "homepage": ";;;;;;", "dblp": ";;;;;;", "google_scholar": ";;;;;;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": ";;;;;;", "aff": ";;;;;;", "aff_domain": ";;;;;;", "position": ";;;;;;", "bibtex": "@misc{\nzhao2019layerwise,\ntitle={Layerwise Recurrent Autoencoder for General Real-world Traffic Flow Forecasting},\nauthor={Peize Zhao and Danfeng Cai and Shaokun Zhang and Feng Chen and Zhemin Zhang and Cheng Wang and Jonathan Li},\nyear={2019},\nurl={https://openreview.net/forum?id=ryzfcoR5YQ},\n}", "github": "", "project": "", "reviewers": "AnonReviewer3;AnonReviewer1;AnonReviewer2", "site": "https://openreview.net/forum?id=ryzfcoR5YQ", "pdf_size": 0, "rating": "3;4;5", "confidence": "4;3;3", "wc_review": "377;140;423", "wc_reply_reviewers": "0;0;0", "wc_reply_authors": "659;304;512", "reply_reviewers": "0;0;0", "reply_authors": "1;1;1", "rating_avg": [ 4.0, 0.816496580927726 ], "confidence_avg": [ 3.3333333333333335, 0.4714045207910317 ], "wc_review_avg": [ 313.3333333333333, 123.99551963231933 ], "wc_reply_reviewers_avg": [ 0, 0 ], "wc_reply_authors_avg": [ 491.6666666666667, 145.63958558334645 ], "reply_reviewers_avg": [ 0, 0 ], "reply_authors_avg": [ 1.0, 0.0 ], "replies_avg": [ 8, 0 ], "authors#_avg": [ 7, 0 ], "corr_rating_confidence": -0.8660254037844385, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9200738196553408551&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0 } ]